In [15]:
%load_ext autoreload 
%autoreload 2
import os
from os.path import expanduser
# Home directory of the current user; every path below is built from it.
home = expanduser("~")
# Work from the local checkout of the project.
# NOTE(review): presumably needed so that `Data_Utils` (imported below) is importable — confirm.
# The directory layout is hard-coded to one machine.
os.chdir(os.path.join(home,'Documents','project','agu_data','repo','agu_data'))
# Folder that receives the CSV tables written by this notebook.
tablepath = os.path.join(home,'Documents','project','agu_data','repo','Notebook','Tables')

# AGU 2015: Exploratory analysis

Agu_Data
-------

I recently decided to scrape the AGU website [AGU](https://meetings.agu.org/) to look for
interesting patterns in the abstracts. Here is a first analysis.


## Data 

In [16]:
from Data_Utils import *
import pandas as pd

In [16]:
from nltk.tokenize import RegexpTokenizer
# Word tokenizer shared by extract_info: tokens are the runs of characters matching \w+.
tokenizer = RegexpTokenizer(r'\w+')

def presentation(time):
    """Label a time slot as 'Oral' or 'Poster'.

    `time` is a string of the form '<start>-<end>' where each side starts
    with the hour followed by ':'. Only the hours are compared: a slot that
    starts and ends within the same clock hour is 'Oral', anything else is
    'Poster'.

    NOTE(review): a short slot that crosses an hour boundary (for example
    '08:45 - 09:00') is therefore labelled 'Poster' — confirm this is intended.
    """
    bounds = time.split('-')
    start_side, end_side = bounds[0], bounds[1]
    start_hour = start_side.split(':')[0]
    end_hour = end_side.split(':')[0]
    same_hour = int(end_hour) == int(start_hour)
    return 'Oral' if same_hour else 'Poster'
    
def clean_session(session):
    """Strip numbering and format markers from a session name.

    The name is split on single spaces and the following tokens are dropped:
    the roman numerals 'I' to 'V', the word 'Posters', and the two-token
    phrase '(Half Session)'. The phrase has to be matched as a pair of
    consecutive tokens: once the name is split on spaces it can never appear
    as a single token. All other tokens (including empty ones produced by
    repeated spaces) are kept and re-joined with single spaces.
    """
    dropped = ('I', 'II', 'III', 'IV', 'V', 'Posters')
    words = session.split(' ')
    kept = []
    skip_next = False
    for pos, word in enumerate(words):
        if skip_next:
            # Second half of a '(Half Session)' pair already handled.
            skip_next = False
            continue
        if word == '(Half' and words[pos + 1:pos + 2] == ['Session)']:
            skip_next = True
            continue
        if word not in dropped:
            kept.append(word)
    return ' '.join(kept)
    
def extract_info(obj):
    ''' Flatten one Paper object into a row of summary fields.

    obj is a Paper object; the attributes read here are date, authors,
    place, reference, section, session, time and title.

    Returns the list
    [day, date, nb_authors, place, room, ref, section, session, pres,
     invited, lentitle, year]
    or [''] when any field cannot be extracted, so that the caller can
    discard the incomplete row.
    '''
    try:
        # obj.date is split on ',': the first piece is the day, the second the date,
        # and the year is the last space-separated token of the second piece.
        date_parts = obj.date.split(',')
        year = str(date_parts[1].split(' ')[-1])
        day = date_parts[0].strip()
        date = date_parts[1].strip()
        nb_authors = len(obj.authors)
        # obj.place is split on '-': first piece is the place, last piece the room.
        place_parts = obj.place.split('-')
        place = place_parts[0].strip()
        room = place_parts[-1].strip()
        ref = len(obj.reference)
        section = obj.section.strip().encode('utf-8')
        session = clean_session(obj.session.strip()).encode('utf-8')
        pres = presentation(obj.time)
        title_words = tokenizer.tokenize(repr(obj.title[0]).lower())
        invited = int('invited' in title_words)
        lentitle = len(title_words)-1 # to remove the 'u' from unicode
        return [day,date,nb_authors,place,room,ref,section,session,pres,invited,lentitle,year]
    except Exception:
        # Best effort: a malformed paper yields a placeholder row. Only ordinary
        # errors are absorbed, so Ctrl-C / kernel interrupt still stops the loop.
        return ['']
    
# Column names, in the same order as the list returned by extract_info.
columns = ['day','date','nb_authors','place','room','ref','section','session','pres','invited','lentitle','year']
df = pd.DataFrame(columns = columns)
for year in [2014,2015]:
    # get_all_data is not defined in this notebook (presumably from the Data_Utils star import — confirm).
    data = get_all_data('agu'+str(year))
    # Papers for which extract_info failed come back as [''];
    # presumably the missing fields become NaN and dropna() removes those rows — confirm.
    df = df.append(pd.DataFrame(map(extract_info,data),columns = columns).dropna())
    # Written on every pass, so the file is overwritten with the accumulated table each year.
    df.to_csv(os.path.join(tablepath,'agu_exploratory_tmp.csv'))
    



# Rapid inspection

In [14]:
# Number of rows kept per meeting, displayed as (2015, 2014); `year` is stored as a string.
len(df[df.year == '2015']),len(df[df.year == '2014'])

(22204, 23291)

#  Ok

In [8]:
# Markup that embeds a Tableau Public visualisation; it is kept as one opaque string and rendered as-is.
js = "<script type='text/javascript' src='https://public.tableau.com/javascripts/api/viz_v1.js'></script><div class='tableauPlaceholder' style='width: 654px; height: 742px;'><noscript><a href='#'><img alt='ExploratoryAnalysisSection ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ex&#47;ExploratoryAnalysisAGUSection&#47;ExploratoryAnalysisSection&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' width='654' height='742' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='site_root' value='' /><param name='name' value='ExploratoryAnalysisAGUSection&#47;ExploratoryAnalysisSection' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ex&#47;ExploratoryAnalysisAGUSection&#47;ExploratoryAnalysisSection&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='showVizHome' value='no' /><param name='showTabs' value='y' /><param name='bootstrapWhenNotified' value='true' /></object></div>"
from IPython.display import HTML
# Last expression of the cell: the HTML object is what the notebook displays.
HTML(js)

## Map with all contributors by country 

In [17]:
# Load the 2015 contributors and papers.
# Neither helper is defined in this notebook (presumably both come from the Data_Utils star import — confirm).
contribs = get_all_contrib('agu2015')
papers = get_all_data('agu2015')



In [18]:
def section_tag_id(papers):
    """Map each section name to the letter prefix of its paper tags.

    For every distinct `section` found in `papers`, the first paper (in input
    order) whose `tag` is not the empty string provides the prefix: the first
    two characters of the tag, or only the first character when the second
    one is a digit (for example, presumably 'V23A-01' -> 'V' and
    'GC21A-03' -> 'GC'; exact tag format to be confirmed).

    Parameters
    ----------
    papers : iterable of objects exposing `section` and `tag` attributes.

    Returns
    -------
    dict
        {str(section).strip(): prefix}. Sections for which every paper has an
        empty tag are left out instead of raising IndexError, and one-character
        tags are returned unchanged.
    """
    # Single pass over the papers: remember the first non-empty tag of each section.
    first_tag = {}
    for paper in papers:
        if paper.section not in first_tag and paper.tag != u'':
            first_tag[paper.section] = paper.tag

    prefixes = {}
    for section, tag in first_tag.items():
        second_is_digit = len(tag) > 1 and tag[1] in '0123456789'
        prefixes[str(section).strip()] = tag[0] if second_is_digit else tag[:2]
    return prefixes
            
# Section name -> tag prefix lookup for the 2015 papers.
sections_dict = section_tag_id(papers)

In [45]:
def collect_info_authors(papers,contribs):
    ''' Return a dataframe with one row per author.

    Columns: 'country', 'name', 'Ncontribs' (length of the author's
    tag_sections), one column per section holding how many of the author's
    tag_sections equal that section's tag prefix, 'mask_test' and
    'alpha3_code'. Authors whose name starts with the word 'Test' are removed.

    papers   : paper objects, passed to section_tag_id.
    contribs : contributor objects exposing country, name and tag_sections.

    Raises KeyError when an author's country is missing from the country-code
    lookup.
    '''

    # Country name -> three-letter code. `clean` and `pycountry` are not defined in
    # this notebook (presumably provided by the Data_Utils star import — confirm).
    alpha3_code = {clean(f.name).lower():f.alpha3 for f in pycountry.countries}

    # Basic info contained in contribs
    df1 = pd.DataFrame([[f.country,f.name,len(f.tag_sections)] for f in contribs],
                      columns = ['country','name','Ncontribs'])

    # Number of contributions of each author in each section.
    # Names and tags are paired explicitly so columns and counts cannot drift apart.
    sections_dict = section_tag_id(papers) # section name -> tag prefix
    section_names = list(sections_dict.keys())
    section_tags = [sections_dict[name] for name in section_names]
    # Same row order (hence same index) as df1, so a plain join lines them up.
    df2 = pd.DataFrame([[sum(tag == f for f in contrib.tag_sections)
                         for tag in section_tags] for contrib in contribs],
                       columns = section_names)
    df_final = df1.join(df2)
    df_final['mask_test'] = [f.split(' ')[0] == 'Test' for f in df_final.name]
    # .copy() makes the filtered frame independent, so the column assignment below
    # does not write into a slice of the unfiltered frame.
    df_final = df_final[~df_final.mask_test].copy()
    # A list (not a lazy map object) so the assignment also works on Python 3.
    df_final['alpha3_code'] = [alpha3_code[x] for x in df_final.country]
    return df_final

In [46]:
# NOTE(review): this rebinds `df`, which earlier held the per-paper table; from here on it is the per-author table.
df = collect_info_authors(papers,contribs)


In [22]:
# seaborn is needed in this cell, but the notebook's other `import seaborn` statements
# sit in later cells, so a fresh Restart & Run All would fail here without this import.
import seaborn as sns

list_codes = set(df.alpha3_code)
group = df.groupby('alpha3_code')

# Country codes ordered from the smallest to the largest total number of contributions.
idxs = [str(code) for code in group.Ncontribs.sum().sort_values().index]
# One upper-cased hex colour of the 'Reds' palette per country code.
colors = [str(shade).upper() for shade in sns.color_palette('Reds',len(list_codes)).as_hex()]
# Pair codes and colours position by position.
colors = dict(zip(idxs,colors))

In [23]:
# Summary statistics of the total number of contributions per country code.
group.Ncontribs.sum().sort_values().describe()

count      169.000000
mean       573.591716
std       4353.949162
min          0.000000
25%          3.000000
50%          9.000000
75%         67.000000
max      56041.000000
Name: Ncontribs, dtype: float64

In [25]:
import seaborn as sns

def datamap_json(df):
    """Build the per-country payload used by the world map.

    Parameters
    ----------
    df : DataFrame with the columns 'name', 'country', 'mask_test',
        'alpha3_code', 'Ncontribs' and one numeric column per section.

    Returns
    -------
    dict
        {alpha3_code: {'fillKey', 'Ncontributors', 'Ncontributions',
        'Maintopic', 'Secondtopic', 'Thirdtopic'}}. Each topic entry is
        '<section> : <share> %', the share being the section total divided by
        the country's 'Ncontribs' total. A country with no contribution gets
        shares of 0.0, and a topic slot with no section left is ''.
    """
    meta_columns = ['name','country','mask_test','alpha3_code']
    topic_keys = ('Maintopic', 'Secondtopic', 'Thirdtopic')
    payload = {}
    for code, members in df.groupby('alpha3_code'):
        totals = members.drop(meta_columns, axis=1).sum()
        # Read the grand total by label: relying on it sorting first breaks when a
        # section total ties with it (all contributions in a single section).
        n_contributions = totals['Ncontribs']
        ranked = totals.drop('Ncontribs').sort_values(ascending=False)
        entry = {'fillKey': code,
                 'Ncontributors': len(members),
                 # Plain int so the value is JSON-serialisable whatever the numpy dtype.
                 'Ncontributions': int(n_contributions)}
        for rank, key in enumerate(topic_keys):
            if rank >= len(ranked):
                entry[key] = ''
                continue
            if n_contributions:
                share = ranked.iloc[rank] / float(n_contributions) * 100
            else:
                share = 0.0
            entry[key] = ranked.index[rank] + ' : %2.1f %%' % share
        payload[code] = entry
    return payload
        
# Per-country payload for the map.
dat = datamap_json(df)

import json
# Output location of the map data file (layout hard-coded to one machine).
json_path = os.path.join(home,'Documents','project','agu_data','gist','WorlMapContribuCountry')
name_json = os.path.join(json_path,'data')
# NOTE(review): `codecs` is not imported anywhere in the visible notebook —
# presumably it arrives through the Data_Utils star import; confirm.
with codecs.open(name_json + '.json', 'w+', 'utf8') as outfile:
    # Keys are sorted so the written file has a stable ordering.
    json.dump(dat,outfile, sort_keys=True,indent=4,ensure_ascii=False)


In [49]:
import seaborn as sns

def datamap_py(df):
    """Summarise contributors and contributions per country for the plotly map.

    Parameters
    ----------
    df : DataFrame with at least the columns 'alpha3_code' and 'Ncontribs',
        one row per contributor.

    Returns
    -------
    DataFrame with one row per country code and the columns
    'code', 'Nb contributors' (number of rows for that code) and
    'text' (hover label giving both the number of contributors and the
    summed 'Ncontribs').
    """
    group = df.groupby('alpha3_code')

    rows = []
    for code in set(df.alpha3_code):
        members = group.get_group(code)
        n_contributors = len(members)
        # Total read by column label rather than by position in a sorted Series.
        n_contributions = members['Ncontribs'].sum()
        text = '<br/>Nb of contributors :%s <br/>Nb of contributions :%s' % (
            n_contributors, n_contributions)
        rows.append([code, n_contributors, text])
    return pd.DataFrame(rows, columns=['code',
                                       'Nb contributors',
                                       'text'])

# Per-country table consumed by the plotly figure in the next code cell.
df2 = datamap_py(df)
# Preview of the first rows.
df2.head()

Unnamed: 0,code,Nb contributors,text
0,DZA,4,<br/>Nb of contributors :4 <br/>Nb of contribu...
1,QAT,14,<br/>Nb of contributors :14 <br/>Nb of contrib...
2,EGY,20,<br/>Nb of contributors :20 <br/>Nb of contrib...
3,BGD,46,<br/>Nb of contributors :46 <br/>Nb of contrib...
4,NAM,10,<br/>Nb of contributors :10 <br/>Nb of contrib...


In [51]:
# Learn about API authentication here: https://plot.ly/python/getting-started
# Find your api_key here: https://plot.ly/settings/api

import plotly.plotly as py
import pandas as pd

# One choropleth trace: countries are located by their code, coloured by the
# number of contributors, and carry the prepared hover text.
data = [{
    'type': 'choropleth',
    'locations': df2['code'],
    'z': df2['Nb contributors'],
    'text': df2['text'],
    'colorscale': [
        [0, "rgb(5, 10, 172)"],
        [0.35, "rgb(40, 60, 190)"],
        [0.5, "rgb(70, 100, 245)"],
        [0.6, "rgb(90, 120, 245)"],
        [0.7, "rgb(106, 137, 247)"],
        [1, "rgb(220, 220, 220)"],
    ],
    'autocolorscale': False,
    'reversescale': True,
    'marker': {'line': {'color': 'rgb(180,180,180)', 'width': 0.5}},
    'colorbar': {'tickprefix': '', 'title': 'Nb contributors'},
}]

# Untitled world map, no frame, coastlines drawn.
layout = {
    'title': '',
    'geo': {
        'showframe': False,
        'showcoastlines': True,
        'projection': {'type': 'Mercator'},
    },
}

fig = {'data': data, 'layout': layout}
# NOTE(review): the figure is a map but is stored under the name 'basic-line' — confirm the intended filename.
py.iplot(fig, filename = 'basic-line')

In [7]:
# Send the same figure to the plotly account under the name 'd3-world-map' and keep the value returned by py.plot.
url = py.plot( fig, validate=False, filename='d3-world-map' )


High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~clement.thorey/0 or inside your plot.ly account where it is named 'd3-world-map'


In [5]:
# Credentials are read from the environment so that no secret is stored in the notebook.
# Set PLOTLY_API_KEY (and optionally PLOTLY_USERNAME) before starting the kernel.
# The key that used to be written here has been exposed and should be regenerated.
# NOTE(review): this cell must run before any py.iplot / py.plot call.
py.sign_in(os.environ.get('PLOTLY_USERNAME', 'clement.thorey'), os.environ['PLOTLY_API_KEY'])

In [8]:
import plotly.plotly as py
from plotly.graph_objs import *

# Two small demo series plotted against the same x positions.
trace0 = Scatter(x=[1, 2, 3, 4], y=[10, 15, 13, 17])
trace1 = Scatter(x=[1, 2, 3, 4], y=[16, 5, 11, 9])
# NOTE(review): this rebinds `data`, previously the choropleth trace list.
data = Data([trace0, trace1])

py.iplot(data, filename = 'basic-line')
