In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm

In this notebook we analyze the topic sentiments as they change over time 

In [2]:
# Add line bar to topic plot
def add_topic(start, stop, name, fig, color):
    fig.add_trace(go.Bar(
    y=[name],
    x=[stop - start +1],
    name=name,
    orientation='h',
    base=start,
    marker=dict(
    color=color)
    )
)

In [17]:
# Generate topic occurence plot
import plotly.graph_objects as go
import plotly.express as px
colors = px.colors.qualitative.G10 + px.colors.qualitative.Antique

fig = go.Figure(layout_xaxis_range=[2008,2021])

add_topic(2011, 2020, 'Trade Relations', fig, colors[0])

add_topic(2020, 2021, 'Covid', fig, colors[1])

add_topic(2015, 2016, 'Economy', fig, colors[2])

add_topic(2020, 2020, 'Huawei', fig, colors[3])
add_topic(2018, 2018, 'Huawei', fig, colors[3])

add_topic(2019, 2019, 'Hong Kong', fig, colors[4])
add_topic(2013, 2017, 'Hong Kong', fig, colors[4])

add_topic(2014, 2019, 'Climate', fig, colors[5])
add_topic(2009, 2009, 'Climate', fig, colors[5])

add_topic(2010, 2019, 'South China sea', fig, colors[6])

add_topic(2009, 2019, 'North Korea', fig, colors[7])

add_topic(2008, 2009, 'Olympics', fig, colors[8])
add_topic(2012, 2012, 'Olympics', fig, colors[8])
add_topic(2015, 2016, 'Olympics', fig, colors[8])

add_topic(2010, 2010, 'Iran', fig, colors[9])
add_topic(2012, 2012, 'Iran', fig, colors[9])
add_topic(2015, 2015, 'Iran', fig, colors[9])

add_topic(2010, 2012, 'Chinese currency', fig, colors[10])
add_topic(2020, 2020, 'Chinese currency', fig, colors[10])

add_topic(2008, 2011, 'Dalai Lama', fig, colors[11])

add_topic(2019, 2019, 'Human Rights', fig, colors[12])
add_topic(2014, 2017, 'Human Rights', fig, colors[12])
add_topic(2011, 2012, 'Human Rights', fig, colors[12])
add_topic(2009, 2009, 'Human Rights', fig, colors[12])
fig.update_layout(showlegend=False)
fig.update_layout(barmode='stack')
fig.update_xaxes(dtick=1)
fig.update_layout(title='Occurence of keywords',
                    xaxis_title="Year")
fig.show()
fig.write_html(f"C:/Users/jozef/Desktop/quotebank/topic_overview.html")

In [44]:
# Code to aggergate quotations and topics per year
years = list(range(2008, 2021))
# Mapping from label number to string
label_dict = {2008:{1: 'olympics', 2: 'dalai_lama'},
             2009: {1:'climate', 2:'olympics', 3:'north_korea',
                   4:'dalai_lama', 5:'human_rights'},
             2010: {1:'chinese_currency', 2: 'north_korea',
                   3:'dalai_lama', 4:'iran', 5:'south_china_sea'},
             2011: {1: 'south_china_sea', 2: 'chinese_currency',
                   3:'trade_relations', 4:'north_korea', 5:'dalai_lama',
                   6:'human_rights'},
             2012: {1:'olympics', 2:'south_china_sea', 3:'north_korea',
                   4:'trade_relations', 5:'iran', 6:'chinese_currency',
                   7:'human_rights'},
             2013: {1:'north_korea', 2:'south_china_sea', 3:'hong_kong',
                   4:'trade_relations'},
             2014: {1:'hong_kong', 2:'climate', 3:'trade_relations',
                   4:'south_china_sea', 5:'north_korea', 6:'human_rights'},
             2015: {1:'south_china_sea', 2: 'climate', 3:'economy',
                   4:'hong_kong', 5:'trade_relations', 6:'iran', 7:'north_korea', 
                    8:'olympics', 9:'human_rights'},
             2016: {1:'south_china_sea', 2:'olympics', 3:'north_korea', 
                   4:'climate', 5:'trade_relations', 6:'hong_kong', 
                   7:'economy', 8:'human_rights'},
             2017: {1:'north_korea', 2:'south_china_sea', 3:'trade_relations',
                   4:'hong_kong', 5:'climate', 6:'human_rights'},
             2018: {1:'trade_relations', 2:'huawei', 3:'north_korea', 
                   4:'south_china_sea', 5:'climate'},
             2019: {1:'hong_kong', 2:'trade_relations', 3:'climate',
                   4:'south_china_sea', 5:'north_korea', 6:'human_rights'},
             2020: {1: 'trade_relations', 2: 'covid-19', 3:'chinese_currency',
                   4:'huawei'}}

# Label add others label to mapping
for key in label_dict.keys():
    label_dict[key][-1] = 'others'

df = pd.DataFrame(columns=['year', 'human_rights', 
                           'dalai_lama', 'chinese_currency',
                          'iran', 'olympics', 'north_korea',
                          'south_china_sea', 'climate', 
                          'hong_kong', 'huawei', 'economy',
                          'covid-19', 'trade_relations', 'others', 
                          'average', 'quotation', 'speaker'])

# For every year agregate sentiment scores per topic. 
for year in years:
    print(year)
    df_path = f'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_{year}.json.bz2'
    label_df_path = f'C:/Users/jozef/Desktop/quotebank/keyword_labels/keyword_label_{year}.csv'
    quote_df = pd.read_json(df_path)
    keyword_df = pd.read_csv(label_df_path)
    
    label_mapping = label_dict[year]
    
    temp_df = pd.DataFrame(columns = df.columns)
    
    sentiment_index = quote_df.positive_sentiment - quote_df.negative_sentiment
    
    # Save useful data for analysis
    temp_df['kw_list'] = quote_df.keywords.apply(lambda x: [y[0] for y in x])
    temp_df['average'] = sentiment_index
    temp_df['others'] = sentiment_index
    temp_df['quotation'] = quote_df.quotation
    temp_df['speaker'] = quote_df.localTopSpeaker
    temp_df['date'] = quote_df.date
    
    # Compute keyword intersections. One quote may regard multiple topics
    for i, group in keyword_df.groupby('keyword_label'):
        if i != -1:
            kws = set(group.keyword)
            mask = temp_df.kw_list.apply(lambda x: len(kws.intersection(x)) == 0)
            label = label_mapping[i]
            temp_df[label] = sentiment_index
            temp_df[label][mask] = pd.NA
            temp_df['others'][~mask] = pd.NA
    temp_df['year'] = year
    df = pd.concat([df, temp_df])
    

2008




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


In [46]:
df.to_csv('C:/Users/jozef/Desktop/quotebank/sentiment_keyword_full.csv')

In [47]:
# Generate mean sentiment per year per topic
sentiment_df = pd.DataFrame()
for i, group in df.groupby('year'):
    sentiment_df = pd.concat([sentiment_df, group.drop(['kw_list', 'quotation', 'date', 'speaker'], axis =1).mean()], axis =1)
sentiment_df = sentiment_df.transpose()

In [48]:
sentiment_df

Unnamed: 0,year,human_rights,dalai_lama,chinese_currency,iran,olympics,north_korea,south_china_sea,climate,hong_kong,huawei,economy,covid-19,trade_relations,others,average
0,2008.0,,-0.095697,,,0.261426,,,,,,,,,0.070852,0.079425
0,2009.0,-0.023863,0.01273,,,0.186418,-0.087875,,0.014064,,,,,,0.055208,0.049245
0,2010.0,,0.002808,-0.08668,-0.050924,,-0.080562,-0.022839,,,,,,,0.05184,0.031933
0,2011.0,-0.039178,-0.012004,-0.127156,,,-0.142477,-0.058132,,,,,,-0.026845,0.069392,0.057639
0,2012.0,-0.056744,,-0.293169,-0.138991,0.333639,-0.096467,-0.000769,,,,,,0.010187,0.055413,0.065367
0,2013.0,,,,,,-0.060415,-0.118995,,-0.181043,,,,0.128876,0.083257,0.072805
0,2014.0,0.045663,,,,,-0.149262,-0.023752,0.079065,-0.072447,,,,0.183652,0.093318,0.082921
0,2015.0,-0.17334,,,-0.142716,0.307472,-0.14302,-0.052633,0.143752,0.051315,,0.0056,,0.016094,0.085913,0.068643
0,2016.0,-0.077325,,,,0.30364,-0.1107,-0.067467,0.083891,-0.034376,,0.016429,,0.021927,0.066959,0.062093
0,2017.0,-0.264047,,,,,-0.151258,-0.062266,-0.081464,-0.033187,,,,-0.021482,0.08722,-0.009465


In [49]:
sentiment_df.to_csv('C:/Users/jozef/Desktop/quotebank/sentiment_keyword_mean.csv')

In [4]:
sentiment_df = pd.read_csv('C:/Users/jozef/Desktop/quotebank/sentiment_keyword_mean.csv')

In [None]:
sentiment_df

In [15]:
# Code to generate topic sentiment trends
fig = go.Figure(layout_xaxis_range=[2007,2021])
colors = px.colors.qualitative.G10 + px.colors.qualitative.Antique

for i, name in enumerate(sentiment_df.drop(['year', sentiment_df.columns[0]], axis=1).columns):  
    if name == 'average':
        fig.add_traces(go.Scatter(x=sentiment_df['year'], y = sentiment_df[name], name=name, marker=dict(
        color=colors[i])
        ))
    else:
        fig.add_traces(go.Scatter(x=sentiment_df['year'], y = sentiment_df[name], name=name, marker=dict(
        color=colors[i]), visible='legendonly'
        ))
fig.update_layout(height=int(600))
fig.update_layout(title='Topic sentiments over time',
                    xaxis_title="Year",
                    yaxis_title="Sentiment index",)
fig.show()

In [16]:
fig.write_html(f"C:/Users/jozef/Desktop/quotebank/topic_sentiment.html")

In [54]:
# Most quoted quotations related to trade in 2014
df[(df.year == 2014) & ~(df.trade_relations.isna())].quotation.value_counts()[:5]

This has the potential for being an historic agreement,                                                                                                                                                                          22
no bilateral relationship is more important than the U.S. and China's.                                                                                                                                                           21
China's decision to promote its own industry and discriminate against U.S. companies has caused U.S. manufacturers to pay as much as three times more than what their Chinese competitors pay for the exact same rare earths,    16
China's decision to promote its own industry and discriminate against US companies has caused US manufacturers to pay as much as three times more than what their Chinese competitors pay for the exact same rare earths,        10
My view is we've got a lot of room to move yet before we would get anywhere near feeling

In [62]:
# Gather data to show significance of Phase one trade agreement
df_2020 = df[df.year == 2020]
before_15 = df_2020[df_2020.date.apply(lambda x: x.timetuple().tm_yday < 15)]
after_15 = df_2020[df_2020.date.apply(lambda x: x.timetuple().tm_yday >= 15)]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [60]:
before_15.trade_relations.mean()

-0.08052104151437996

In [61]:
after_15.trade_relations.mean()

0.09554796904099915

In [74]:
# Test whether there is a significant difference in sentiment before and after jan 15th 
from scipy.stats import ttest_ind
ttest_ind(before_15[before_15.trade_relations.notnull()].trade_relations, 
          after_15[after_15.trade_relations.notnull()].trade_relations)

Ttest_indResult(statistic=-11.696654879296402, pvalue=2.171262033116424e-31)