In [None]:
from IPython.display import HTML

# Render a small jQuery snippet as the cell output: `code_toggle` hides or shows
# every `div.input` element (the notebook input areas) and flips `code_show`.
# It runs once when the document is ready, so the inputs start hidden, and again
# each time the form button below is submitted.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [None]:
%load_ext autoreload 
%autoreload 2
import os
# Step up to the parent directory so that `geocolab` can be imported below
# (presumably the project root that holds the package -- confirm).
# The guard keeps the cell idempotent: an unconditional chdir climbs one more
# level every time the cell is re-run in the same kernel.
if not os.path.isdir('geocolab'):
    os.chdir('../')
from geocolab.Data_Utils import *
import pandas as pd
import seaborn as sns

import matplotlib.pylab as plt
%matplotlib inline

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF 
plotly.offline.init_notebook_mode()  

In [None]:
from nltk.tokenize import RegexpTokenizer
# Shared tokenizer built on the pattern `\w+` (runs of word characters);
# extract_info uses it to split titles into words.
tokenizer = RegexpTokenizer(r'\w+')
import unicodedata

def tag_to_section(obj):
    '''Return the upper-case section prefix of ``obj.tag``, or '' if there is none.

    The prefix is the first two characters of the tag; when the second one is
    a digit only the first character is kept
    (e.g. 'PP13A-01' -> 'PP', 'A11B-02' -> 'A').

    obj : any object exposing a ``tag`` attribute (a Paper object).

    Returns '' when the tag is missing, is not indexable, is shorter than two
    characters, or when the prefix is not upper case.
    '''
    digits = tuple('0123456789')
    try:
        tag_section = obj.tag[:2]
        if tag_section[1] in digits:
            tag_section = obj.tag[0]
    except Exception:
        # Best effort for papers without a usable tag. `Exception` rather than
        # a bare except so that KeyboardInterrupt / SystemExit still propagate.
        tag_section = ''
    return tag_section if tag_section.isupper() else ''
    
    
def presentation(time, max_oral_minutes=60):
    '''Classify a time slot as an 'Oral' or a 'Poster' presentation.

    time : string holding the start and the end of the slot separated by '-',
        e.g. '08:45 - 09:00'. The minute field is optional ('8-9' is accepted).
    max_oral_minutes : slots lasting strictly less than this many minutes are
        reported as 'Oral'.

    Returns 'Oral' when 0 <= duration < max_oral_minutes, 'Poster' otherwise.

    Raises IndexError when `time` has no '-' and ValueError when an hour field
    is not an integer.

    The duration is computed in minutes: comparing only the hour fields labels
    a short talk that crosses an hour boundary (08:45 - 09:00) as a poster.
    '''
    def to_minutes(stamp):
        # 'HH:MM' -> minutes since midnight; a missing minute field counts as 0.
        hours, _, rest = stamp.partition(':')
        minute_digits = ''
        # Only the leading digits are read so that a trailing suffix after the
        # minutes cannot break the parsing.
        for char in rest.strip():
            if char not in '0123456789':
                break
            minute_digits += char
        minutes = int(minute_digits) if minute_digits else 0
        return 60 * int(hours) + minutes

    bounds = time.split('-')
    duration = to_minutes(bounds[1]) - to_minutes(bounds[0])
    if 0 <= duration < max_oral_minutes:
        return 'Oral'
    return 'Poster'

def first_clean_title(text):
    '''Strip a trailing '(Invited)' word from a title.

    The text is cut at its last space; when what follows is exactly
    '(Invited)' the part before that space is returned, otherwise the title
    comes back unchanged.
    '''
    head, _, last_word = text.rpartition(' ')
    return head if last_word == '(Invited)' else text
def clean(text):
    '''Reduce `text` to plain ASCII and replace newlines by spaces.

    Accented characters are decomposed (NFKD) and whatever cannot be written
    in ASCII is dropped. The result is always a native `str`.

    text : text string to normalise. Anything `unicodedata.normalize` rejects
        with a TypeError (None, numbers, Python 2 byte strings) is returned
        unchanged, as a best effort.
    '''
    try:
        ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    except TypeError:
        # Not a text string: hand the value back untouched.
        return text
    if not isinstance(ascii_text, str):
        # Python 3: encode() returned bytes. Decode back to str, otherwise the
        # newline replacement below raises and bytes leak into the output.
        ascii_text = ascii_text.decode('ascii')
    return ascii_text.replace('\n', ' ')


def clean_session(session):
    '''Remove numbering and format markers from a session name.

    The name is split on single spaces. The words 'I', 'II', 'III', 'IV', 'V'
    and 'Posters' are dropped, as is the two-word marker '(Half Session)'.
    The marker has to be matched as a pair of consecutive words: once the name
    is split it can never appear as a single word.

    session : session name (string).

    Returns the remaining words joined by single spaces.
    '''
    noise = ('I', 'II', 'III', 'IV', 'V', 'Posters')
    words = session.split(' ')
    kept = []
    position = 0
    while position < len(words):
        if words[position:position + 2] == ['(Half', 'Session)']:
            # Skip both halves of the '(Half Session)' marker at once.
            position += 2
            continue
        if words[position] not in noise:
            kept.append(words[position])
        position += 1
    return ' '.join(kept)
    
def extract_info(obj):
    ''' obj is a Paper object

    Flatten the paper into one row:
    [day, date, nb_authors, place, room, ref, section, session, pres,
     invited, lentitle, year, title, tag]

    Returns [''] when any field is missing or cannot be parsed. The loading
    loop builds a DataFrame from these rows and calls dropna(), which
    presumably discards such short rows -- confirm.
    '''
    try:
        tag = tag_to_section(obj)
        # obj.date is read as '<day>,<date>'; the year is the last
        # space-separated word of the part after the comma.
        date_parts = obj.date.split(',')
        year = str(date_parts[1].split(' ')[-1])
        day = date_parts[0].strip()
        date = date_parts[1].strip()
        nb_authors = len(obj.authors)
        # obj.place is split on '-': first piece is the place, last the room.
        place_parts = obj.place.split('-')
        place = place_parts[0].strip()
        room = place_parts[-1].strip()
        ref = len(obj.reference)
        # NOTE(review): encode() gives byte strings; under Python 3 these are
        # `bytes` and will not compare equal to the str section names used
        # later in the notebook -- confirm the target interpreter.
        section = obj.section.strip().encode('utf-8')
        session = clean_session(obj.session.strip()).encode('utf-8')
        pres = presentation(obj.time)
        # Tokenize the title text itself rather than its repr(): the repr adds
        # a 'u' token only for Python 2 unicode strings and turns non-ASCII
        # characters into escape sequences, so the word count was unreliable.
        title_words = tokenizer.tokenize(obj.title[0].lower())
        invited = int('invited' in title_words)
        lentitle = len(title_words)
        title = clean(first_clean_title(' '.join(obj.title)))

        return [day,date,nb_authors,place,room,ref,section,session,pres,invited,lentitle,year,title,tag]
    except Exception:
        # Best effort: an unparsable paper yields a placeholder row.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        return ['']
    
columns = ['day','date','nb_authors','place','room','ref','section','session','pres','invited','lentitle','year','title','tag']
# Build one frame per year and concatenate them once at the end.
# DataFrame.append copied the accumulated frame on every iteration and was
# removed in pandas 2.0.
frames = []
for year in [2014,2015]:
    data = get_all_data(year)
    # A list (not `map`) so the constructor gets a concrete sequence on both
    # Python 2 and Python 3. dropna() removes rows with missing values.
    rows = [extract_info(paper) for paper in data]
    frames.append(pd.DataFrame(rows,columns = columns).dropna())
df = pd.concat(frames)
    

In [None]:
# One sub-frame per conference year; the 'year' column is looked up with string labels.
year2014, year2015 = (df.groupby('year').get_group(label) for label in ('2014', '2015'))

The American Geophysical Union (AGU) Fall Meeting is a geoscience conference
held each year around Christmas in San Francisco. It represents a
great opportunity for PhD students like me to show off their work and
enjoy what the West Coast has to offer.

![AGU logo](https://meetings.agu.org/meetings/files/2014/04/fm300x200.jpg)

However, with  nearly 24 000 attendees,  AGU Fall Meeting is  also the
largest Earth  and space  science meeting  in the  world. As  such, it
represents an interesting dataset to dive into the geoscience academic world.

For a new attendee like me two years ago, the amount of information available is clearly overwhelming. For instance, nearly 3500 posters and 1000 oral presentations are scheduled for the first day of the conference alone. As each oral presentation is given a 15-minute time slot, this represents 250 hours of talks and, assuming 15 slides per presentation, nearly 15000 slides of geoscience material, i.e. a lot to process for a human being ;) 

In [None]:
# Number of 2015 presentations for every (day, presentation type) pair.
pres_counts = year2015.groupby(['day','pres']).pres.count()
# The weekdays are listed explicitly so that the bars follow the calendar order.
days = ['Monday','Tuesday','Wednesday','Thursday','Friday']
oral_counts = [pres_counts.loc[day,'Oral'] for day in days]
poster_counts = [pres_counts.loc[day,'Poster'] for day in days]
trace0 = go.Bar(
    x=days,
    y=oral_counts,
    name='Oral',
    marker=dict(
        color='rgb(49,130,189)'
    )
)
trace1 = go.Bar(
    x=days,
    y=poster_counts,
    name='Poster',
    marker=dict(
        color='rgb(204,204,204)',
    )
)
data = [trace0, trace1]
layout = go.Layout(
    height = 500,
    titlefont = {'size':18},
    margin = {'b':125,'r':25},
    title='Nb of oral/poster presentation by day',
    yaxis = dict(title='Nb of presentations',
                titlefont = {'size':18},
                tickfont = {'size':18}),
    xaxis=dict(
        # set x-axis' labels direction at 45 degree angle
        tickangle=-45,
        tickfont = {'size':16}
    ),
    barmode='group',
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, show_link=False)
#py.iplot(fig,filename = 'AGUDayPres')


Nevertheless, the organization of the conference makes it easy to focus on your specific field of interest. Indeed, the conference is organized by sessions, themselves grouped into 27 different sections which cover almost every possible geoscience subject, from the fluid dynamics of the inner core in the Earth's deep interior section to the formation of terrestrial planets in the planetary science one. 

Sessions are proposed by conveners and are scheduled in the conference if they manage to gather a sufficient number of abstracts. 
If a session does so, it is granted a 2-hour time slot for an oral session and a 4-hour slot for poster presentations. The decision of whether each contribution goes in the oral or the poster slot is taken by the session conveners. In particular, a contribution awarded an oral presentation gets a 15-minute slot in the oral session to convince other fellows of its merit.
 
The number of sessions by section can first be used as a proxy for the popularity of each field. For instance, this year, atmospheric processes, hydrology and biogeosciences host the most sessions during the conference.

In [None]:
# Number of distinct sessions hosted by each section in 2015, computed once
# and reused for the labels, the values and the slice text.
sessions_per_section = year2015.groupby('section').apply(lambda x: len(x.session.unique()))
section_names = sessions_per_section.index.tolist()
trace0 = go.Pie(
    labels = section_names,
    values = sessions_per_section.tolist(),
    text = section_names,
    # Plotly expects colour strings: convert the seaborn RGB float tuples to
    # hex codes, as the other charts of this notebook do.
    marker = {'colors':sns.color_palette('deep',len(set(year2015.section))).as_hex()},
    name = '2015',
    textfont = {'size':12},
    hoverinfo = "label+value",
    showlegend = False,
    textposition = "inside",
    domain = {'x':[0,1.0]})
data = [trace0]

layout = go.Layout(
    height = 800,
    titlefont = {'size':18},
    margin = {'b':0,'r':2,'l':2},
    title='Nb of sessions by section in 2015',
    legend = {'yanchor':'auto',
              'x':.85,
             'font':{'size':15}})

fig = go.Figure(data=data,layout=layout)
#py.iplot(fig,filename = 'NbSessionSection')
plotly.offline.iplot(fig, show_link=False)

However, the number of contributions by section may give us a better measure of the popularity of each field. Note that these numbers, which sum to nearly 100000, are not representative of the size of the geoscience community, as one author can take part in many contributions. Indeed, the average number of authors per contribution is close to 4 which, assuming that everybody attends, gives about 25000 attendees. 

In [None]:
# Average number of authors per contribution, section by section (2015).
year2015.groupby('section')['nb_authors'].mean()

In [None]:
# Total number of authors per section, one series per year. The 2015 series is
# sorted in decreasing order; the 2014 one keeps the order of the groupby index.
authors_2015 = year2015.groupby('section').nb_authors.sum().sort_values(ascending=False)
authors_2014 = year2014.groupby('section').nb_authors.sum()

trace0 = go.Bar(x=authors_2015.index.tolist(),
                y=authors_2015.tolist(),
                name='2015',
                marker={'color': 'rgb(49,130,189)'})
trace1 = go.Bar(x=authors_2014.index.tolist(),
                y=authors_2014.tolist(),
                name='2014',
                marker={'color': 'rgb(204,204,204)'})
data = [trace0, trace1]

layout = go.Layout(
    title='Nb of contributors by section',
    titlefont={'size': 18},
    height=750,
    margin={'b': 250, 'r': 100},
    barmode='group',
    yaxis={'title': 'Nb authors',
           'titlefont': {'size': 18},
           'tickfont': {'size': 18}},
    # x-axis labels are tilted by 45 degrees
    xaxis={'tickangle': -45,
           'tickfont': {'size': 14}},
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, show_link=False)
#py.iplot(fig,filename = 'NbContribsSection')


Atmospheric science has been the trendiest topic in the geoscience academic world for the past two years. It is closely followed by Hydrology and Biogeosciences. Together, these three themes account for nearly 35% of all AGU contributions in 2015 and clearly dominate, by far, the rest of the conference. It would be interesting to see whether research funding in geoscience follows the same distribution across themes. I suppose that should be the case though. 

In [None]:
# Share of all 2015 authors held by the three largest sections.
section_totals = year2015.groupby('section').nb_authors.sum()
N = section_totals.sum()
top_three = section_totals.sort_values(ascending=False).iloc[:3]
top_three.sum()/float(N)

Also worth noticing, Global Environmental Change is part of the top 5. Given the increasing importance of this theme throughout our society, this is actually not surprising. Taking a closer look, we can see that an important part of the research in this field focuses on renewable energy (4.7% of all the contributions) and carbon sequestration-related problems (about 7%). Global warming and climate-related change are also popular in this section, as expected. 


In [None]:
GE2015 = year2015.groupby('section').get_group('Global Environmental Change')
# Number of authors per session, largest sessions first (computed once).
session_totals = GE2015.groupby('session').nb_authors.sum().sort_values(ascending=False)
index = session_totals.index.tolist()
values = session_totals.tolist()
# Keep the `topn` largest sessions and pool all the remaining ones in 'Others'
topn =40
index = index[:topn]+['Others']
values = values[:topn]+[sum(values[topn:])]

trace0 = go.Pie(
    labels = index,
    values = values,
    text = index,
    # Plotly expects colour strings: convert the seaborn RGB float tuples to
    # hex codes, as the Planetary Sciences chart does.
    marker = {'colors':sns.color_palette('deep',12).as_hex()},
    name = '2015',
    textfont = {'size':12},
    hoverinfo = "label+value",
    showlegend = False,
    textposition = "inside",
    domain = {'x':[0,1.0]})
data = [trace0]

layout = go.Layout(
    height = 750,
    titlefont = {'size':18},
    margin = {'b':50,'r':50,'l':50,'t':50},
    legend = {'yanchor':'auto',
             'font':{'size':9},
             'x':0.75},
    title='Nb of contributions by session in the Global Environmental Change section (2015)')


fig = go.Figure(data=data,layout=layout)
#py.iplot(fig,filename = 'NbContribsGEC')
plotly.offline.iplot(fig, show_link=False)

2015 has also been a year of many exciting results in planetary science (which is closely related to my PhD subject, by the way) with the release of very interesting data from many different spacecraft. This is again nicely reflected in the conference. The three top-ranked sessions indeed deal with important planetary science missions which delivered results this year:

- The MAVEN mission from NASA, which investigates the Martian atmosphere.
- The Rosetta mission from ESA which, after 10 years of inactivity, woke up and managed its rendezvous with the comet Chury (Churyumov-Gerasimenko).
- The Dawn mission from NASA, which investigates two of the three known protoplanets of the asteroid belt, Vesta and Ceres.


In [None]:
PS2015 = year2015.groupby('section').get_group('Planetary Sciences')
# Number of authors per session, largest sessions first.
session_totals = PS2015.groupby('session').nb_authors.sum().sort_values(ascending=False)
all_names = session_totals.index.tolist()
all_values = session_totals.tolist()
# The `topn` largest sessions keep their own slice; the rest is pooled in 'Others'.
topn = 31
index = all_names[:topn] + ['Others']
values = all_values[:topn] + [sum(all_values[topn:])]

# One 'hls' colour per slice, given to plotly as hex strings.
slice_colors = sns.color_palette('hls', len(index)).as_hex()
trace0 = go.Pie(labels=index,
                text=index,
                values=values,
                marker={'colors': slice_colors},
                name='2015',
                textfont={'size': 12},
                hoverinfo="label+value",
                showlegend=False,
                textposition="inside",
                domain={'x': [0, 1.0]})
data = [trace0]

layout = go.Layout(
    title='Nb of contributions by session in the Planetary Sciences section (2015)',
    titlefont={'size': 18},
    height=750,
    margin={'b': 50, 'r': 50, 'l': 50, 't': 50},
    legend={'yanchor': 'auto',
            'font': {'size': 9},
            'x': 0.85})

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, show_link=False)
#py.iplot(fig,filename = 'NbContribsPS')

# Distribution by country 

In [None]:
# Lookup table keyed by the 'tag' column with the 'section' column as value
# (so, despite the name, it goes from a tag to a section). When a tag occurs
# on several rows the last row wins.
section_to_tag_dict = dict(zip(year2015['tag'], year2015['section']))
def section_to_tag(key):
    '''Return the section stored for the tag `key`; raises KeyError if it is unknown.'''
    return section_to_tag_dict[key]

In [None]:
columns = ['name','country','nb_contrib','year']
# Build one frame per year and concatenate them once at the end.
# DataFrame.append copied the accumulated frame on every iteration and was
# removed in pandas 2.0.
frames = []
for year in [2014,2015]:
    contribs = get_all_contrib(str(year))
    # One row per contributor: name, country, number of papers, meeting year.
    rows = [[obj.name,obj.country,len(obj.papers),year] for obj in contribs]
    frames.append(pd.DataFrame(rows,columns = columns))
dfc = pd.concat(frames)

In [None]:
# One contributor sub-frame per year; here the 'year' labels are integers.
year2015c, year2014c = (dfc.groupby('year').get_group(label) for label in (2015, 2014))

In [None]:
# Hex codes of the seaborn 'deep' palette; entries 0 and 2 colour the two years.
deep_hex = sns.color_palette('deep').as_hex()

# Summed nb_contrib per country, largest first, for 2015 ...
country = year2015c.groupby('country').nb_contrib.sum().sort_values(ascending = False)
trace0 = go.Bar(x=country.index.tolist(),
                y=country.tolist(),
                name='2015',
                marker={'color': deep_hex[0]})
# ... and for 2014.
country = year2014c.groupby('country').nb_contrib.sum().sort_values(ascending = False)
trace1 = go.Bar(x=country.index.tolist(),
                y=country.tolist(),
                name='2014',
                marker={'color': deep_hex[2]})
data = [trace0,trace1]

layout = go.Layout(
    title='Nb of contributors by country',
    titlefont={'size': 18},
    height=750,
    margin={'b': 250, 'r': 100},
    legend={'xanchor': 'center', 'x': .5},
    barmode='group',
    yaxis={'title': 'Nb of contributors',
           'titlefont': {'size': 18},
           'tickfont': {'size': 18},
           'range': [0, 5000]},
    # x-axis labels are tilted by 45 degrees; the x range is limited to [-1, 20]
    xaxis={'tickangle': -45,
           'tickfont': {'size': 14},
           'range': [-1, 20]},
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, show_link=False)
#py.iplot(fig,filename = 'NbContribsCounry')
