In [4]:
%load_ext autoreload 
%autoreload 2
import os
from os.path import expanduser
# Resolve the current user's home directory; every project path below is built from it.
home = expanduser("~")
# Make the package directory the working directory.
# NOTE(review): presumably needed so that `Data_Utils` can be imported from the CWD — confirm.
os.chdir(os.path.join(home,'Documents','project','agu_data','repo','agu_data'))
# Directory that receives the CSV tables written by this notebook.
tablepath = os.path.join(home,'Documents','project','agu_data','repo','Notebook','Tables')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# AGU 2015: Exploratory analysis

Agu_Data
-------

I recently decided to scrape the [AGU](https://meetings.agu.org/) meeting website to look for
interesting patterns in the abstracts. Here is a first analysis.


## Data 

In [5]:
from Data_Utils import *
import pandas as pd

In [16]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

def presentation(time):
    ''' Label a time slot as 'Oral' or 'Poster'.

    time is a string of the form '<start>-<end>' where each side begins
    with an hour field followed by ':'. Only the two hour fields are
    compared: identical hours give 'Oral', anything else gives 'Poster'
    (so a slot that crosses an hour boundary is labelled 'Poster').
    '''
    slots = time.split('-')
    start_hour = slots[0].split(':')[0]
    end_hour = slots[1].split(':')[0]
    same_hour = int(end_hour) == int(start_hour)
    return 'Oral' if same_hour else 'Poster'
    
def clean_session(session):
    ''' Strip numbering and format markers from a session title.

    Removes the phrase '(Half Session)' and the stand-alone words
    'I', 'II', 'III', 'IV', 'V' and 'Posters', so that the different parts
    of one session share a single name. The phrase has to be removed before
    tokenising: once the title is split on whitespace it becomes the two
    tokens '(Half' and 'Session)' and can no longer be matched as a whole.

    Returns the remaining words joined by single spaces.
    '''
    dropped_words = ('I','II','III','IV','V','Posters')
    without_phrase = session.replace('(Half Session)', ' ')
    return ' '.join(w for w in without_phrase.split() if w not in dropped_words)
    
def extract_info(obj):
    ''' Flatten one Paper object into a row of the exploratory table.

    obj is a Paper object; the attributes read here are date, authors,
    place, reference, section, session, time and title.

    Returns the list
    [day, date, nb_authors, place, room, ref, section, session, pres,
     invited, lentitle, year].
    If any attribute is missing or cannot be parsed, returns [''] instead:
    a deliberately short row that the loading cell is expected to discard
    (it calls dropna() on the resulting frame).
    '''
    try:
        # obj.date is split on ',' : the first field is used as the day, the second as the date.
        date_fields = obj.date.split(',')
        day = date_fields[0].strip()
        date = date_fields[1].strip()
        # The year is the last space-separated token of the date field.
        year = str(date_fields[1].split(' ')[-1])
        nb_authors = len(obj.authors)
        # obj.place is split on '-' : first part is the place, last part the room.
        place = obj.place.split('-')[0].strip()
        room = obj.place.split('-')[-1].strip()
        ref = len(obj.reference)
        section = obj.section.strip().encode('utf-8')
        session = clean_session(obj.session.strip()).encode('utf-8')
        pres = presentation(obj.time)
        # The title is tokenised through its repr(); for a Python 2 unicode
        # string the repr starts with u'...', which yields one extra 'u' token.
        title_words = tokenizer.tokenize(repr(obj.title[0]).lower())
        invited = int('invited' in title_words)
        lentitle = len(title_words)-1 # to remove the 'u' from unicode
        return [day,date,nb_authors,place,room,ref,section,session,pres,invited,lentitle,year]
    except Exception:
        # Best effort: skip malformed records, but let KeyboardInterrupt /
        # SystemExit propagate (a bare `except:` would swallow them too).
        return ['']
    
columns = ['day','date','nb_authors','place','room','ref','section','session','pres','invited','lentitle','year']

# Build one frame per meeting year and concatenate them in a single step:
# growing a DataFrame with repeated .append() copies it on every iteration
# and DataFrame.append no longer exists in pandas >= 2.0.
yearly_frames = []
for year in [2014,2015]:
    data = get_all_data('agu'+str(year))
    rows = [extract_info(paper) for paper in data]
    # Records that extract_info could not parse come back as short rows;
    # dropna() removes them.
    yearly_frames.append(pd.DataFrame(rows,columns = columns).dropna())
df = pd.concat(yearly_frames)

# Write the combined table once, after both years have been loaded.
df.to_csv(os.path.join(tablepath,'agu_exploratory_tmp.csv'))
    



# Rapid inspection

In [14]:
# Number of rows for 2015 and for 2014; `year` is stored as a string, hence the quoted values.
len(df[df.year == '2015']),len(df[df.year == '2014'])

(22204, 23291)

#  Ok

In [8]:
js = "<script type='text/javascript' src='https://public.tableau.com/javascripts/api/viz_v1.js'></script><div class='tableauPlaceholder' style='width: 654px; height: 742px;'><noscript><a href='#'><img alt='ExploratoryAnalysisSection ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ex&#47;ExploratoryAnalysisAGUSection&#47;ExploratoryAnalysisSection&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' width='654' height='742' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='site_root' value='' /><param name='name' value='ExploratoryAnalysisAGUSection&#47;ExploratoryAnalysisSection' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ex&#47;ExploratoryAnalysisAGUSection&#47;ExploratoryAnalysisSection&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='showVizHome' value='no' /><param name='showTabs' value='y' /><param name='bootstrapWhenNotified' value='true' /></object></div>"
# Render the embed markup held in `js` as the cell's rich HTML output.
from IPython.display import HTML
HTML(js)

## Map with all contributors by country 

In [6]:
# Load the 'agu2015' contributors and papers.
# NOTE(review): get_all_contrib / get_all_data are presumably provided by
# `from Data_Utils import *` — confirm.
contribs = get_all_contrib('agu2015')
papers = get_all_data('agu2015')



In [7]:
def section_tag_id(papers):
    ''' Map every section name to the identifier prefix of its paper tags.

    papers is an iterable of objects exposing `.section` and `.tag`.
    For each distinct section, the first paper (in input order) whose tag is
    not empty supplies the tag. The identifier is the first two characters
    of that tag, or only the first character when the second one is a digit
    (for instance a tag u'H13A-01' would give u'H', a tag u'SA21A' would
    give u'SA'). A one-character tag is returned unchanged.

    Returns a dict {str(section).strip(): identifier}.
    Raises IndexError when a section has no paper with a non-empty tag.
    '''
    digits = '0123456789'

    # Single pass over the papers: remember the first non-empty tag of each
    # section instead of re-filtering the whole list for every section.
    first_tag = {}
    for p in papers:
        if p.tag != u'' and p.section not in first_tag:
            first_tag[p.section] = p.tag

    tag_ids = {}
    for s in set(p.section for p in papers):
        if s not in first_tag:
            raise IndexError('no paper with a non-empty tag in section %r' % (s,))
        tag = first_tag[s]
        # Length check first: a one-character tag has no second character to test.
        if len(tag) > 1 and tag[1] in digits:
            idx = tag[0]
        else:
            idx = tag[:2]
        tag_ids[str(s).strip()] = idx
    return tag_ids
            
# Section name -> tag identifier lookup built from the loaded papers.
sections_dict = section_tag_id(papers)

In [9]:
def collect_info_authors(papers,contribs):
    ''' Return a dataframe with one row of information per contributor.

    papers   : paper objects, passed to section_tag_id() to obtain the
               section name -> tag identifier mapping.
    contribs : contributor objects exposing `.country`, `.name` and
               `.tag_sections`.

    Columns of the result: 'country', 'name', 'Ncontribs'
    (len(tag_sections)), one column per section holding how many entries of
    tag_sections equal that section's identifier, 'mask_test' and
    'alpha3_code'. Rows whose name starts with the word 'Test' are removed.

    Raises KeyError when a contributor's country is not one of the cleaned,
    lower-cased pycountry names.
    '''
    # Cleaned, lower-cased country name -> alpha-3 code.
    alpha3_code = {clean(f.name).lower():f.alpha3 for f in pycountry.countries}

    # Basic information carried by each contributor.
    df1 = pd.DataFrame([[f.country,f.name,len(f.tag_sections)] for f in contribs],
                       columns = ['country','name','Ncontribs'])

    # Per-section counts. Names and identifiers are materialised as lists from
    # the same dict so that the columns line up with the counted values.
    sections_dict = section_tag_id(papers)
    section_names = list(sections_dict.keys())
    section_tags = list(sections_dict.values())
    df2 = pd.DataFrame([[sum(1 for f in contrib.tag_sections if f == tag)
                         for tag in section_tags] for contrib in contribs],
                       columns = section_names)

    # df1 and df2 share the same default index (one row per contributor).
    df_final = df1.join(df2)
    df_final['mask_test'] = [f.split(' ')[0] == 'Test' for f in df_final.name]
    # .copy(): the filtered frame gets a new column right below; assigning to
    # a slice of the original triggers pandas' SettingWithCopyWarning.
    df_final = df_final[~df_final.mask_test].copy()
    # A list (not a lazy map object) so the assignment also works on Python 3.
    df_final['alpha3_code'] = [alpha3_code[c] for c in df_final.country]
    return df_final

In [10]:
# NOTE: this rebinds `df`, which an earlier cell used for the per-abstract table.
df = collect_info_authors(papers,contribs)
# Preview the first two rows.
df.head(2)

Unnamed: 0,country,name,Ncontribs,Public Affairs,Union,Tectonophysics,SPA-Aeronomy,Geodesy,Earth and Planetary Surface Processes,Planetary Sciences,...,Atmospheric Sciences,"Volcanology, Geochemistry and Petrology",Earth and Space Science Informatics,SPA-Solar and Heliospheric Physics,Biogeosciences,SPA-Magnetospheric Physics,Nonlinear Geophysics,Education,mask_test,alpha3_code
0,united states,Christopher Sabine,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,False,USA
1,united states,Cinzia Zuffada,2,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,False,USA


In [23]:
# Sum of Ncontribs per alpha-3 country code, sorted in ascending order.
group = df.groupby('alpha3_code')
group.Ncontribs.sum().sort_values()

alpha3_code
BLZ        0
VGB        0
COK        0
ERI        0
FSM        0
CYM        0
BHS        0
ASM        0
MYT        0
SYC        0
LSO        0
BDI        0
NIC        0
LCA        1
KHM        1
CAF        1
IRQ        1
GUM        1
GUF        1
KIR        1
LVA        1
AFG        1
MOZ        1
PLW        1
PSE        1
ATA        1
TGO        1
AND        1
UZB        1
MKD        1
       ...  
ISL      176
TUR      192
SGP      233
FIN      313
CHL      378
RUS      401
DNK      404
BEL      419
AUT      436
GEO      448
NOR      587
ESP      591
NZL      636
IND      720
SWE      800
TWN      800
NLD      864
MEX      885
BRA      889
CHE     1120
AUS     1378
ITA     1386
PRK     1591
CAN     2834
CHN     3680
FRA     3876
GBR     3959
JPN     4038
DEU     4113
USA    56041
Name: Ncontribs, dtype: int64

In [36]:
# NOTE(review): the only visible `import seaborn as sns` sits in a later cell;
# on a fresh kernel this fails unless `sns` comes from the Data_Utils star import — confirm.
a = sns.color_palette('deep')

In [37]:
# Show the palette as hex colour strings.
a.as_hex()

[u'#4c72b0', u'#55a868', u'#c44e52', u'#8172b2', u'#ccb974', u'#64b5cd']

In [73]:
# One 'Blues' shade per country code: codes are ordered by their summed
# Ncontribs (ascending) and paired, in that order, with the palette entries.
group = df.groupby('alpha3_code')
list_codes = set(df.alpha3_code)

idxs = [str(f) for f in group.Ncontribs.sum().sort_values().index]
colors = [str(f).upper() for f in sns.color_palette('Blues',len(list_codes)).as_hex()]
colors = dict(zip(idxs,colors))
colors

{'AFG': '#DDEAF7',
 'AGO': '#C9DDF0',
 'ALB': '#C8DCF0',
 'AND': '#D6E6F4',
 'ARE': '#3989C1',
 'ARG': '#1562A9',
 'ARM': '#69ADD5',
 'ASM': '#EEF5FC',
 'ATA': '#D9E7F5',
 'AUS': '#084082',
 'AUT': '#09529D',
 'BDI': '#E9F2FA',
 'BEL': '#0A539E',
 'BEN': '#5BA3D0',
 'BFA': '#4A98C9',
 'BGD': '#2D7DBB',
 'BGR': '#AFD1E7',
 'BHS': '#EFF6FC',
 'BLZ': '#F6FAFF',
 'BMU': '#B0D2E7',
 'BOL': '#A9CFE5',
 'BRA': '#084387',
 'BRB': '#9DCAE1',
 'BRN': '#8ABFDD',
 'BTN': '#B7D4EA',
 'BWA': '#77B5D9',
 'CAF': '#E4EFF9',
 'CAN': '#083A7A',
 'CHE': '#084184',
 'CHL': '#0E58A2',
 'CHN': '#083979',
 'CIV': '#5FA6D1',
 'CMR': '#BAD6EB',
 'COG': '#72B2D8',
 'COK': '#F4F9FE',
 'COL': '#1865AC',
 'CPV': '#97C6DF',
 'CRI': '#3181BD',
 'CUB': '#CBDEF1',
 'CYM': '#F0F6FD',
 'CYP': '#71B1D7',
 'CZE': '#135FA7',
 'DEU': '#083370',
 'DJI': '#CDDFF1',
 'DMA': '#CDE0F1',
 'DNK': '#0B559F',
 'DZA': '#8DC1DD',
 'ECU': '#1E6DB2',
 'EGY': '#4493C7',
 'ERI': '#F2F8FD',
 'ESP': '#084D96',
 'EST': '#5DA5D1',
 'ETH': '#3D

In [55]:
# Summary statistics of the per-country Ncontribs totals.
group.Ncontribs.sum().sort_values().describe()

count      169.000000
mean       573.591716
std       4353.949162
min          0.000000
25%          3.000000
50%          9.000000
75%         67.000000
max      56041.000000
Name: Ncontribs, dtype: float64

In [70]:
def Attribute_key(N):
    ''' Bucket a count N into a textual level.

    Thresholds are strict lower bounds:
        N > 20000 -> 'High'
        N > 10000 -> 'Medium high'
        N > 1000  -> 'Medium'
        N > 500   -> 'Low medium'
        otherwise -> 'Low'

    The top label used to be spelled 'Hign'.
    NOTE(review): if any map configuration was keyed on the misspelled
    label, update it accordingly.
    '''
    if N>20000:
        return 'High'
    elif N>10000:
        return 'Medium high'
    elif  N> 1000:
        return 'Medium'
    elif N> 500:
        return 'Low medium'
    else:
        return 'Low'
    

In [74]:
import seaborn as sns

def datamap_json(df):
    ''' Build the per-country payload for the contributors map.

    df must hold the columns 'name', 'country', 'mask_test', 'alpha3_code'
    and 'Ncontribs'; every remaining column is treated as a section count.

    Returns a dict keyed by alpha-3 code. Each value contains:
        'fillKey'        : the alpha-3 code itself
        'Ncontributors'  : number of rows of df for that code
        'Ncontributions' : summed 'Ncontribs' (plain int, JSON-serialisable)
        'Maintopic', 'Secondtopic', 'Thirdtopic' :
            '<section> : <share> %' for the three sections with the largest
            summed counts, the share being relative to 'Ncontributions'.
    When the total is 0 the shares are reported as 0.0 instead of dividing
    by zero. If fewer than three section columns exist, only the available
    topic keys are emitted.
    '''
    meta_columns = ['name','country','mask_test','alpha3_code']
    topic_keys = ('Maintopic','Secondtopic','Thirdtopic')

    payload = {}
    for code, members in df.groupby('alpha3_code'):
        totals = members.drop(meta_columns,axis =1).sum()
        # Read the grand total by label: relying on the sort to place
        # 'Ncontribs' first breaks whenever one section ties with the total.
        n_contributions = int(totals['Ncontribs'])
        ranked = totals.drop('Ncontribs').sort_values(ascending = False)

        entry = {'fillKey': code,
                 'Ncontributors': len(members),
                 'Ncontributions' : n_contributions}
        top_sections = zip(ranked.index[:3], ranked.values[:3])
        for key, (section, count) in zip(topic_keys, top_sections):
            share = 100.0*count/n_contributions if n_contributions else 0.0
            entry[key] = section + ' : %2.1f %%'%share
        payload[code] = entry
    return payload
        
# Build the per-country payload from the authors frame.
dat = datamap_json(df)

import json
# Destination: <home>/Documents/project/agu_data/gist/WorlMapContribuCountry/data.json
json_path = os.path.join(home,'Documents','project','agu_data','gist','WorlMapContribuCountry')
name_json = os.path.join(json_path,'data')
# NOTE(review): `codecs` is not imported in any visible cell — presumably it comes
# from `from Data_Utils import *`; confirm, or import it explicitly.
with codecs.open(name_json + '.json', 'w+', 'utf8') as outfile:
    json.dump(dat,outfile, sort_keys=True,indent=4,ensure_ascii=False)


In [191]:
# Scratch cell: rebuild the set of country codes and the per-country grouping.
list_codes = set(df.alpha3_code)
group = df.groupby('alpha3_code')

In [210]:
# Scratch cell: rows of the 'AFG' group without the name/country/mask_test/alpha3_code columns.
series = group.get_group('AFG').drop(['name','country','mask_test','alpha3_code'],axis =1)
#series = series.sum().sort_values(ascending = False)

In [212]:
# Number of rows in the 'AFG' group.
len(series)

1

In [208]:
# Inspect the payload entry stored for the 'USA' code.
dat['USA']

{'Main topic': 'Atmospheric Sciences : 15.6 %',
 'N contributions': 56041,
 'Second topic': 'Hydrology : 11.1 %',
 'Third topic': 'Biogeosciences : 10.2 %',
 'fillKey': 'Potential'}

In [115]:
# Scratch cell: per-country grouping of the authors frame.
group = df.groupby('alpha3_code')

In [118]:
# Summed Ncontribs per alpha-3 code (group order, not sorted by value).
group.Ncontribs.sum()

alpha3_code
AFG        1
AGO        2
ALB        2
AND        1
ARE       30
ARG      129
ARM        9
ASM        0
ATA        1
AUS     1378
AUT      436
BDI        0
BEL      419
BEN       12
BFA       18
BGD       52
BGR        4
BHS        0
BLZ        0
BMU        4
BOL        4
BRA      889
BRB        5
BRN        6
BTN        3
BWA        7
CAF        1
CAN     2834
CHE     1120
CHL      378
       ...  
SLB        4
SLV        4
SRB        7
SVK        5
SVN        9
SWE      800
SWZ        2
SYC        0
SYR        2
TCD        3
TGO        1
THA       49
TJK        4
TTO       16
TUN       10
TUR      192
TWN      800
TZA       40
UGA        2
UKR       19
URY       10
USA    56041
UZB        1
VEN        6
VGB        0
VNM       27
VUT        5
ZAF       96
ZMB        3
ZWE        2
Name: Ncontribs, dtype: int64

In [None]:
import pycountry
def clean(text):
    ''' Reduce a unicode string to plain ASCII text on a single line.

    The text is NFKD-normalised, every character that cannot be encoded as
    ASCII is dropped (so accented letters keep their base letter), and
    newlines are replaced by spaces.

    Returns a text string. Decoding back from the ASCII bytes keeps the
    function usable on Python 3, where calling .replace() with str
    arguments on the encoded bytes raises TypeError.
    '''
    # Imported locally so the function does not depend on another cell
    # having brought `unicodedata` into scope.
    import unicodedata
    ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    return ascii_text.replace('\n', ' ')

# Lower-cased pycountry name -> alpha-3 code; 'taiwan' is added by hand.
countries = dict((country.name.lower(), country.alpha3) for country in pycountry.countries)
countries['taiwan'] = 'TWN'

# Sample frame: taken from the first 50 contributors, keeping those with a non-empty country.
example = [[f.name,f.address,f.country]
           for f in contribs[:50]
           if f.country != '']
df = pd.DataFrame(example,
                  index = range(len(example)),
                  columns = ['name','address','country'])
df['Name'] = df.name
df['codes'] = [countries[f] for f in df.country]

# Pair each value of `countries` with one hex colour of a 'deep' palette of the same length.
colors = dict(zip(countries.values(),
                  sns.color_palette('deep',len(countries.values())).as_hex()))

def datamap_json(df):
    ''' Build a per-country payload listing potential collaborators.

    df must hold the columns 'codes', 'Name' and 'address'. For every
    distinct code the result contains the fixed 'fillKey' 'Potential', the
    number of matching rows (as a string) and one
    '<br/> <name> from <first comma-separated part of the address>' snippet
    per row, concatenated into a single string.

    NOTE: this definition has the same name as the earlier datamap_json and
    replaces it once the cell has run.
    '''
    payload = {}
    for code in set(df.codes):
        colab = df[df.codes == code]
        names = []
        for _, row in colab.iterrows():
            who = clean(row.Name)
            where = clean(row.address.split(',')[0])
            names.append('<br/> ' + who + ' from ' + where)
        payload[code] = {'fillKey': 'Potential',
                         'Number of potential collaborators' : str(len(names)),
                         'Names': ''.join(names)}
    return payload
        
# Build the collaborators payload from the sample frame.
dat = datamap_json(df)

import json
# NOTE(review): absolute, user-specific path — it does not go through `home` like the other paths.
name_json = os.path.join('/Users/thorey/Documents/repos/agu_data/Notebook','test')
# NOTE(review): `codecs` is not imported in any visible cell — confirm where it comes from.
with codecs.open(name_json + '.json', 'w+', 'utf8') as outfile:
    json.dump(dat,outfile, sort_keys=True,indent=4,ensure_ascii=False)


In [86]:
# Cleaned, lower-cased pycountry name -> alpha-3 code (same lookup as the one built in collect_info_authors).
alpha3_code = {clean(f.name).lower():f.alpha3 for f in pycountry.countries}

In [371]:
import pycountry

In [368]:
# NOTE(review): COUNTRY is not defined in any visible cell — presumably supplied by
# `from Data_Utils import *` or a deleted cell; confirm before relying on it.
COUNTRY

Unnamed: 0,id,value
0,AF,afghanistan
1,AX,Åland islands
2,AL,albania
3,DZ,algeria
4,AS,american samoa
5,AD,andorra
6,AO,angola
7,AI,anguilla
8,AQ,antarctica
9,AG,antigua & barbuda
