## Imports and setup

In [1]:
import bz2
import json
import re
from datetime import date, datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from tqdm.notebook import tqdm

%matplotlib inline

In [2]:
from functions import *
import plotly.graph_objects as go
import plotly.express as px

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/cyrilvallez/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrilvallez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cyrilvallez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load the data

In [3]:
# Load the quotes. convert_dates=False keeps `date` as the raw string so the
# time part can be stripped below.
quotes = pd.read_json('Quotebank_climate_attributes/quotes-all-years.json.bz2', compression='bz2',
                      lines=True, convert_dates=False)

# Remove time from the date: keep only the first whitespace-separated token.
# Vectorised string operations replace the former row-by-row Python loop, which
# was slow and only worked because the index happened to be 0..n-1.
# NOTE(review): assumes every `date` value is a string such as
# '<date> <time>' -- confirm against the data file.
quotes['date'] = quotes['date'].str.split().str[0]

## Create the count dataset

In [4]:
# Number of quotes per date, stored as a two-column frame (date, count)
quotes_per_day = quotes['date'].value_counts()
count = pd.DataFrame({
    'date': quotes_per_day.index,
    'count': quotes_per_day.to_numpy(),
})

In [5]:
# Load the frequency of quotes for all days in quotebank.
# The context manager closes the file handle as soon as the JSON is parsed
# (it used to stay open for the lifetime of the kernel).
with open('Quote_frequency_day.json') as f:
    days_freq = json.load(f)

In [6]:
# Keep only the days whose overall quote total in `days_freq` exceeds `limit`
limit = 5000
dates = [day for day, n_quotes in days_freq.items() if n_quotes > limit]

In [7]:
# Restrict both frames to the retained days.
# .copy() makes each result an independent frame: new columns are assigned to
# `count` and `quotes` afterwards, and assigning into a boolean-mask slice is
# what triggers pandas' SettingWithCopyWarning.
count = count[count['date'].isin(dates)].copy()
quotes = quotes[quotes['date'].isin(dates)].copy()

In [8]:
# Attach the overall daily quote total (from `days_freq`) to every day of `count`
count['total_count'] = count['date'].map(days_freq)

# Per-quote weight = inverse of that day's overall total
# (used later on for the weighted histograms)
daily_totals = quotes['date'].map(days_freq)
quotes['weights'] = 1 / daily_totals

In [9]:
# Daily frequency: quotes in this dataset divided by all quotes of that day
count['frequency'] = count['count'].div(count['total_count'])

In [10]:
# Parse the date strings of both frames into pandas datetimes
for frame in (count, quotes):
    frame['date'] = pd.to_datetime(frame['date'])

In [11]:
# Number of days elapsed since the earliest date in `count` (as a float)
first_day = count['date'].min()
one_day = timedelta(days=1)
count['time_delta'] = (count['date'] - first_day) / one_day

In [12]:
# Split each date into calendar components:
# year / month / day for `count`, year / month only for `quotes`
for part in ('year', 'month', 'day'):
    count[part] = getattr(count['date'].dt, part)

for part in ('year', 'month'):
    quotes[part] = getattr(quotes['date'].dt, part)

In [13]:
# Put the rows in chronological order and renumber the index from 0
count = count.sort_values(by='date', ignore_index=True)

In [14]:
# Frequency expressed per thousand quotes
count['freq_per_thousand'] = count['frequency'] * 1000
# "<month name> <year>" label, used as the key for the monthly aggregation
month_year_labels = count['date'].dt.strftime('%B %Y')
count['month/year'] = month_year_labels

In [15]:
# Linear trend of the per-thousand frequency over time.
# The dates are converted once to floats and the SAME (n, 1) matrix is passed to
# both fit() and predict(). Previously fit() received the raw datetime64 values
# while predict() received floats, leaving the conversion of the training data
# to scikit-learn's input validation.
# NOTE(review): for datetime64[ns] the float value is nanoseconds since the
# epoch, so the fitted slope is "per nanosecond" -- confirm if the coefficient
# itself is ever reported.
date_as_float = count['date'].values.astype(float).reshape(-1, 1)
target = count['freq_per_thousand'].values.reshape(-1, 1)

regr = linear_model.LinearRegression()
regr.fit(date_as_float, target)

fit = regr.predict(date_as_float)
count['fit_freq'] = np.ravel(fit)

In [16]:
# Display the date column to check the covered period
count['date']

0      2015-01-01
1      2015-01-02
2      2015-01-03
3      2015-01-04
4      2015-01-05
          ...    
1693   2020-04-12
1694   2020-04-13
1695   2020-04-14
1696   2020-04-15
1697   2020-04-16
Name: date, Length: 1698, dtype: datetime64[ns]

In [137]:
# Dates used by the arrow annotations of the frequency figure:
# `ref` / `ref2` are passed as `ax`, `first` .. `fourth` as `x`.
ref, ref2 = datetime(2016, 7, 15), datetime(2016, 7, 22)
first = datetime(2016, 10, 20)
second = datetime(2016, 6, 15)
third = datetime(2016, 3, 20)
fourth = datetime(2016, 1, 10)

In [194]:
# Daily frequency curve together with its linear regression
fig = go.Figure()
fig.add_trace(go.Scatter(x=count['date'], y=count['freq_per_thousand'], mode='lines', name='Data'))
fig.add_trace(go.Scatter(x=count['date'], y=count['fit_freq'], mode='lines', name='Regression',
                         line_width=3))

fig.update_xaxes(title_text='Date', title_font_size=16, tickfont_size=14)
fig.update_yaxes(title_text='Frequency (‰)', title_font_size=16, tickfont_size=14)
fig.update_layout(
    title='Frequency of climate change related quotes',
    title_x=0.5,
    title_font_size=18,
    legend_font_size=14,
)

# Four arrows; only the first one carries the 'Missing data' label.
# Each tuple is (x, y, ax, ay, text) in data coordinates.
arrows = [
    (first, 2.5, ref, 10.5, 'Missing data'),
    (second, 3, ref2, 10, ''),
    (third, 1.5, ref2, 10, ''),
    (fourth, 2, ref2, 10, ''),
]
for head_x, head_y, tail_x, tail_y, label in arrows:
    fig.add_annotation(x=head_x, y=head_y, xref='x', yref='y',
                       ax=tail_x, ay=tail_y, axref='x', ayref='y',
                       text=label, showarrow=True, arrowhead=1, arrowwidth=1.5)

# Range-selector buttons shown above the plot
range_buttons = [
    dict(count=1, label="1 month", step="month", stepmode="backward"),
    dict(count=6, label="6 months", step="month", stepmode="backward"),
    dict(count=1, label="1 year", step="year", stepmode="todate"),
    dict(count=2, label="2 years", step="year", stepmode="backward"),
    dict(step="all"),
]

fig.update_layout(
    xaxis=dict(
        rangeselector=dict(buttons=range_buttons, bgcolor='#EEEFCA'),
        rangeslider=dict(
            visible=True,
            bgcolor='#EFEFEF',
            bordercolor='#C5C3C3',
            thickness=0.25,
            borderwidth=1,
        ),
        type="date",
    )
)

#fig.write_html('Frequency.html')
fig.show()

In [185]:
# Aggregate the daily rows of `count` into one row per 'month/year' label.
# Built-in named aggregations replace groupby().apply(lambda ...), which built
# a Series per group in Python and needed the grouping column to be visible
# inside apply() (deprecated behaviour in recent pandas).
group_by_month = count.groupby('month/year').agg(
    year=('year', 'first'),
    month=('month', 'first'),
    count=('count', 'sum'),
    total_count=('total_count', 'sum'),
)

# Monthly frequency is the ratio of the monthly sums,
# not the mean of the daily frequencies.
group_by_month['frequency'] = group_by_month['count'] / group_by_month['total_count']

# Keep the label as a regular first column in addition to the index
group_by_month.insert(0, 'month/year', group_by_month.index)

In [186]:
# Chronological order (year, then month number); the index is reset to 0..n-1
group_by_month = group_by_month.sort_values(['year', 'month'], ignore_index=True)

In [187]:
# Monthly frequency expressed per thousand quotes
group_by_month['frequency_per_thousand'] = group_by_month['frequency'].mul(1000)

In [188]:
def month(x):
    """Return the English name of month number ``x``.

    Args:
        x: integer month number, 1 (January) to 12 (December).

    Returns:
        The month name as a string.

    Raises:
        IndexError: if ``x`` is not in 1..12.
    """
    months = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September',
              'October', 'November', 'December')
    # Without this check 0 and negative numbers would wrap around through
    # negative indexing (0 -> 'December') instead of failing.
    if not 1 <= x <= 12:
        raise IndexError(f'month number must be in 1..12, got {x!r}')
    return months[x - 1]
# Replace month numbers by month names.
# NOTE(review): running this line a second time passes names instead of numbers to month().
group_by_month['month'] = group_by_month['month'].map(month)

In [190]:
# Preview the first two monthly rows
group_by_month.iloc[:2]

Unnamed: 0,month/year,year,month,count,total_count,frequency,frequency_per_thousand
0,January 2015,2015,January,2400,1661137,0.001445,1.444794
1,February 2015,2015,February,2282,1659518,0.001375,1.375098


In [195]:
# One line per year (2015-2019): monthly frequency against month name
years = list(range(2015, 2020))

fig = go.Figure()
for year in years:
    data = group_by_month.loc[group_by_month['year'] == year]
    yearly_trace = go.Scatter(
        x=data['month'],
        y=data['frequency_per_thousand'],
        mode='lines',
        name=str(year),
    )
    fig.add_trace(yearly_trace)

fig.update_xaxes(title_text='Month', title_font_size=16, tickfont_size=14)
fig.update_yaxes(title_text='Frequency (‰)', title_font_size=16, tickfont_size=14)
fig.update_layout(
    title='Frequency of climate change related quotes',
    title_x=0.5,
    title_font_size=18,
    legend_font_size=14,
)

#fig.write_html('Frequency_by_month.html')
fig.show()

In [196]:
# Day-of-year number and a "<day-of-year> <year>" key (e.g. '1 2015').
# The key is assembled from the numeric parts instead of strftime('%-j %Y'):
# the '-' flag (no zero padding) is a platform-specific strftime extension and
# is not available on every operating system.
day_number = count['date'].dt.dayofyear
count['dayofyear'] = day_number.astype(str) + ' ' + count['date'].dt.year.astype(str)
count['dayofyear_v2'] = day_number

In [197]:
# Aggregate `count` per "<day-of-year> <year>" key.
# Built-in named aggregations replace groupby().apply(lambda ...), which built
# a Series per group in Python and needed the grouping column to be visible
# inside apply() (deprecated behaviour in recent pandas).
group_by_day = count.groupby('dayofyear').agg(
    year=('year', 'first'),
    day=('dayofyear_v2', 'first'),
    count=('count', 'sum'),
    total_count=('total_count', 'sum'),
)

# Frequency is the ratio of the summed counts
group_by_day['frequency'] = group_by_day['count'] / group_by_day['total_count']

# Keep the key as a regular first column in addition to the index
group_by_day.insert(0, 'dayofyear', group_by_day.index)

In [198]:
# Daily frequency expressed per thousand quotes
group_by_day['frequency_per_thousand'] = group_by_day['frequency'].mul(1000)

In [199]:
# Chronological order (year, then day of year); the index is reset to 0..n-1
group_by_day = group_by_day.sort_values(['year', 'day'], ignore_index=True)

In [200]:
# Preview the first two daily rows
group_by_day.iloc[:2]

Unnamed: 0,dayofyear,year,day,count,total_count,frequency,frequency_per_thousand
0,1 2015,2015,1,14,27200,0.000515,0.514706
1,2 2015,2015,2,32,36669,0.000873,0.872672


In [202]:
# One line per year (2015-2019): daily frequency against day of the year
years = list(range(2015, 2020))

fig = go.Figure()
for year in years:
    data = group_by_day.loc[group_by_day['year'] == year]
    yearly_trace = go.Scatter(
        x=data['day'],
        y=data['frequency_per_thousand'],
        mode='lines',
        name=str(year),
    )
    fig.add_trace(yearly_trace)

fig.update_xaxes(title_text='Day of the year', title_font_size=16, tickfont_size=14)
fig.update_yaxes(title_text='Frequency (‰)', title_font_size=16, tickfont_size=14)
fig.update_layout(
    title='Frequency of climate change related quotes',
    title_x=0.5,
    title_font_size=18,
    legend_font_size=14,
)

#fig.write_html('Frequency_by_day.html')
fig.show()

## Map

In [56]:
# Reload the complete quote file and replace every missing value by the
# string 'None'
quotes = pd.read_json(
    'Quotebank_climate_attributes/quotes-all-years.json.bz2',
    compression='bz2',
    lines=True,
).fillna('None')

In [57]:
# Nationality attribute of every quote, as a NumPy array
nationalities = quotes['nationality'].to_numpy()

In [58]:
# One country per quote: the first entry when the nationality is a list,
# the value itself otherwise; the 'None' placeholder is skipped.
countries = []
for nationality in nationalities:
    if type(nationality) is list:
        countries.append(nationality[0])
    elif nationality != 'None':
        countries.append(nationality)

a = len(countries)
print(f'There are {a} quotes with identified country.')

There are 89260 quotes with identified country.


In [59]:
# Remove 'bad' countries: every label that appears as a key of
# map_bad_countries.json is replaced by the associated value.
# The context manager closes the file once the JSON is parsed.
with open('map_bad_countries.json') as f:
    mapping = json.load(f)

# A direct dictionary lookup replaces the former scan over all keys for every
# country; labels without an entry are kept unchanged.
countries = [mapping.get(country, country) for country in countries]

In [60]:
# Count the quotes per country and order the countries by decreasing count.
# Dedicated names are used for the intermediate arrays so that the global
# `count` (the daily-frequency DataFrame built earlier) is no longer
# overwritten by a NumPy array.
country_names, country_counts = np.unique(countries, return_counts=True)

order = np.argsort(-country_counts)  # Sort in decreasing order

countries = pd.DataFrame({
    'country': country_names[order],
    'count': country_counts[order],
})

In [61]:
# Country codes: space-separated file read as three columns, of which the
# middle (unnamed) one is discarded
codes = pd.read_csv('code_countries.txt', sep=' ', header=None, names=['code', '', 'country'])
codes = codes.drop(columns=[''])

# Attach the quote count to each country that has a code
merging = codes.merge(countries, on='country')

b = merging['count'].sum()
print(f'We managed to map {b} out of {a} quotes to the country of the speaker.')

We managed to map 89182 out of 89260 quotes to the country of the speaker.


In [62]:
# Check for missing country which have not been mapped to a country code.
# isin() replaces the explicit loop: it avoids a full array scan per country
# and no longer reuses `a` as loop variable, which overwrote the number of
# identified quotes stored in `a`.
is_mapped = countries['country'].isin(merging['country'])
missing = countries.loc[~is_mapped, 'country'].tolist()

print(missing)

['None']


In [63]:
# Base-10 logarithm of the counts, used as colour value on the map
merging['count_log'] = np.log10(merging['count'].to_numpy())

In [65]:
# World map coloured by log10(count); the raw count and the country name are
# carried along as custom data for the hover box
fig = px.choropleth(
    merging,
    locations='code',
    color='count_log',
    projection="natural earth",
    color_continuous_scale=px.colors.sequential.Sunset,
    custom_data=['count', 'country'],
)

hover_text = "<b>%{customdata[1]}</b> <br> Count: %{customdata[0]}"
fig.update_traces(hovertemplate=hover_text)

# The '1.e' tick prefix turns a colour-bar tick k into the label '1.ek'
colorbar_options = dict(title='Count', tickprefix='1.e', x=0.92)
fig.update_layout(
    title='Distribution of the climate related quotes by nationality of speakers',
    title_x=0.5,
    title_font_size=18,
    legend_font_size=14,
    coloraxis_colorbar=colorbar_options,
)

#fig.write_html('beautiful_map.html')
fig.show()

## Occupation plot

In [66]:
# Reload the complete quote file and replace every missing value by the
# string 'None'
quotes = pd.read_json(
    'Quotebank_climate_attributes/quotes-all-years.json.bz2',
    compression='bz2',
    lines=True,
).fillna('None')

In [67]:
# One row per speaker holding the occupation value of that speaker's first
# quote. The built-in first() replaces groupby().apply(lambda ...), which
# created a Series in Python for every single speaker.
# first() skips null entries; none are expected here because missing values
# were replaced by the string 'None' when the file was loaded.
speakers = quotes.groupby('speaker')[['occupation']].first()

jobs = speakers['occupation'].values

In [86]:
# Flatten the per-speaker occupations into one list:
# list values contribute each non-None element, scalar values are kept unless
# they are None or the 'None' placeholder.
occupations = []
for job in jobs:
    if type(job) is list:
        occupations.extend(item for item in job if item is not None)
    elif job is not None and job != 'None':
        occupations.append(job)

In [87]:
# Count the speakers per occupation and order by decreasing count.
# Dedicated names are used for the intermediate arrays instead of rebinding the
# global `count`.
occupation_names, occupation_counts = np.unique(occupations, return_counts=True)

order = np.argsort(-occupation_counts)  # Sort in decreasing order

occupations = pd.DataFrame({
    'occupation': occupation_names[order],
    'count': occupation_counts[order],
})

# Relabel the 12th most frequent occupation. A single positional setter
# replaces the chained assignment occupations['occupation'][11] = ..., which
# raised SettingWithCopyWarning and may write to a temporary copy.
# NOTE(review): position 11 is hard-coded and depends on the sort order above;
# confirm it still points at the intended label if the data changes.
occupations.iloc[11, occupations.columns.get_loc('occupation')] = 'football player'



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [89]:
# Horizontal bar chart of the 15 most frequent occupations
top_occupations = occupations.iloc[:15]
fig = px.bar(top_occupations, x='count', y='occupation', orientation='h')

# Reverse the y axis so that the first row is drawn at the top
fig.update_layout(yaxis_autorange="reversed")

fig.update_xaxes(title_text='Count', title_font_size=16, tickfont_size=14)
fig.update_yaxes(title_text='Occupation', title_font_size=16, tickfont_size=14)
fig.update_layout(
    title='Most represented occupations of speakers about climate change',
    title_x=0.5,
    title_font_size=18,
    legend_font_size=14,
)

#fig.write_html('occupation.html')
fig.show()