## Import Packages

In [1]:
%run import_packages.py

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Section 1 : Loading Various Data and Merging all the transcripts
In our source code we create a dataframe with all the relevant information of the speeches in our dataset.

In [3]:
# Pre-built alternative to running the consolidation script locally:
# df = pd.read_csv('https://s3grouparmenia.s3.eu-central-1.amazonaws.com/data/consolidated_transcripts.csv')
# NOTE(review): the script below is expected to leave the consolidated frame in `df` — confirm.
%run consolidating_transcripts.py
# Random rows, so the displayed sample changes on every run.
print(df.sample(5))

codes/app/requirements.txt
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8093 entries, 0 to 8093
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        8093 non-null   object
 1   Session     8093 non-null   object
 2   Country     8093 non-null   object
 3   Transcript  8093 non-null   object
dtypes: object(4)
memory usage: 316.1+ KB
      Year Session Country                                         Transcript
780   1976      31     NZL  b'Mr. President, it is my great pleasure to co...
790   1976      31     QAT  b"On behalf of the Minister for Foreign Affair...
6075  2008      63     MEX  b'Allow me first of all to congratulate you, \...
5978  2008      63     BEN  b'I warmly \r\ncongratulate the President on h...
5389  2004      59     VUT  b'I bring to this\r\ngathering a warm greeting...


In [4]:
# Sustainable Development Index workbook; it is melted to long format in Section 5.
sdi = pd.read_excel('https://s3grouparmenia.s3.eu-central-1.amazonaws.com/data/SDI_data/SDI.xlsx')

  """Entry point for launching an IPython kernel.


## Section 2 : Data Pre-processing

###  DataCleaners

We created a class *DataCleaners* to clean and lemmatize the text, as well as remove stopwords with a custom dictionary.

In [5]:
import cleaners
# custom methods
# Shows the last three names reported by dir() for the class; the recorded output lists
# clean_text, lemmatizer and remove_stopwords.
print(dir(cleaners.DataCleaners)[-3:])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
['clean_text', 'lemmatizer', 'remove_stopwords']


Based on the topic modelling analysis shown below in Section 3, we found that many frequently repeated words (for example 'united', 'nations' and 'assembly') carry little meaning in the context of our analysis. We therefore remove them using a custom stopword list.

In [6]:
# Corpus-specific stopwords handed to DataCleaners.remove_stopwords in the cells below.
# NOTE(review): '/n', '/t' and '/n/n' use forward slashes; if the intent was to drop
# newline/tab escape residue ('\n', '\t') these entries would never match — confirm.
custom_stopwords = ['united','nations','nation', 'international','society','organization','organizations','member','state',
                                'relations','relation','global','charter','general','assembly','year','ago','/n','/t','/n/n']

We now can pre-process the entire dataset using the DataCleaners class. 

In [7]:
# This pass over every transcript is slow; to skip it, load the pre-built file instead:
# df = pd.read_csv('https://s3grouparmenia.s3.eu-central-1.amazonaws.com/data/consolidated_transcripts.csv')

# Each value is stringified first and then cleaned, one transcript at a time.
df['Transcript'] = [
    cleaners.DataCleaners.clean_text(str(raw_text))
    for raw_text in df['Transcript']
]

In [8]:
# Strip stopwords, including the custom list defined above, from every transcript.
df['Transcript'] = [
    cleaners.DataCleaners.remove_stopwords(str(cleaned_text), custom_stopwords)
    for cleaned_text in df['Transcript']
]

In [None]:
# Lemmatize every transcript after cleaning and stopword removal.
df['Transcript'] = [
    cleaners.DataCleaners.lemmatizer(str(filtered_text))
    for filtered_text in df['Transcript']
]

In [None]:
# Save the cleaned transcripts
# Written to the current working directory; the frame's index is written as well
# because no index= argument is given.
df.to_csv('cleaned_transcripts.csv')

## Section 3 : Extract Topics

### N-grams
We define a function that extracts the top-k n-grams from a corpus. We then plot the 10 most frequent bigrams in the cleaned transcripts.


In [None]:
def top_k_ngram(corpus, n = 3, k = 10):
    """Return the ``k`` most frequent ``n``-grams found in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    n : int, default 3
        Length of the n-grams; used as both bounds of ``ngram_range``.
    k : int, default 10
        Number of entries to return.

    Returns
    -------
    list of (str, count)
        N-gram / total-count pairs sorted by descending count, at most ``k`` long.
    """
    vec = CountVectorizer(ngram_range=(n, n), stop_words='english')
    # fit_transform tokenizes the corpus once; fit() followed by transform() did it twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the corpus-wide count of every n-gram (a 1 x vocabulary matrix).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:k]

In [None]:
# Ten most frequent bigrams across all cleaned transcripts.
transcripts_unicode = df['Transcript'].values.astype('U')
common_words = top_k_ngram(transcripts_unicode, n=2, k=10)
df2 = pd.DataFrame(common_words, columns = ['bigram', 'count'])

# One bar per bigram, in descending order of frequency.
bigram_bars = go.Bar(x=df2['bigram'], y=df2['count'])
fig = go.Figure([bigram_bars])
chart_title = go.layout.Title(text="Top 10 bigrams in the text after removing stop words and lemmatization")
fig.update_layout(title=chart_title)
fig.show()

### Latent Dirichlet Allocation (LDA)

In [None]:
# Bag-of-words representation: alphanumeric tokens of 3+ characters, English stopwords
# removed, terms seen in fewer than 3 documents dropped, vocabulary capped at 5000 terms.
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=3,                       
                             stop_words='english',             
                             lowercase=True,                   
                             token_pattern='[a-zA-Z0-9]{3,}',  
                             max_features=5000,          
                            )

data_vectorized = vectorizer.fit_transform(df['Transcript'].values.astype('U'))

# The 8 components correspond to the 8 topic labels assigned by hand further down.
lda_model = LatentDirichletAllocation(n_components=8, # Number of topics
                                      learning_method='online',
                                      random_state=0,       
                                      n_jobs = -1  # Use all available CPUs
                                     )
# One row per transcript, one column per topic.
lda_output = lda_model.fit_transform(data_vectorized)

In [None]:
# Interactive topic browser for the fitted model.
# NOTE(review): newer pyLDAvis releases expose this module under a different name
# (pyLDAvis.lda_model) — confirm against the installed version.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

In [None]:
# Top 20 most frequent words from each topic found by LDA
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the highest-weighted vocabulary terms of every LDA topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary, in the same column order the model was trained on.
    lda_model : fitted LDA model
        Its ``components_`` rows are read as per-topic term weights.
    n_words : int, default 20
        Number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered by descending weight.
    """
    # Newer scikit-learn only provides get_feature_names_out(); older releases only
    # provide get_feature_names(). Use whichever the installed version has.
    feature_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    keywords = np.array(feature_getter())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes argsort return the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20)

# One row per topic, one column per keyword rank; relabel the default 0..n-1 axes.
df_topic_keywords = pd.DataFrame(topic_keywords).rename(
    columns=lambda rank: 'Word '+str(rank),
    index=lambda topic: 'Topic '+str(topic),
)

In [None]:
# Topic labels ('Topic 0', 'Topic 1', ...), reused as column names for the document-topic table
topicnames = df_topic_keywords.index
print(topicnames)

In [None]:
# Per-document topic weights, rounded to two decimals, one column per topic.
rounded_topic_weights = np.round(lda_output, 2)
df_document_topic = pd.DataFrame(data=rounded_topic_weights, columns=topicnames)

In [None]:
# Get dominant topic for each document
# Restrict the argmax to the topic-weight columns so re-running this cell (after
# 'dominant_topic' has been added) still gives the same answer.
dominant_topic = np.argmax(df_document_topic[topicnames].values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# The topic table is in the same row order as df but carries a fresh 0..n-1 index,
# whereas df's own index has a gap (8093 rows labelled 0..8093). Joining the two on
# their indexes would pair later speeches with the wrong topic row and drop one
# speech, so the topic table is relabelled with df's index before merging.
if len(df_document_topic) != len(df):
    raise ValueError("df_document_topic and df must have the same number of rows")
doc_topics_aligned = df_document_topic.drop(columns='index', errors='ignore').set_index(df.index)
df_sent_topic = pd.merge(df, doc_topics_aligned, left_index=True, right_index=True)

df_topic_theme = df_sent_topic[['Transcript', 'dominant_topic']]

print(df_topic_theme.tail())

In [None]:
# We determined the topic names by looking at the top 30 most salient words provided by the LDA analysis.
TOPIC_NAMES = {
    0: 'development of africa',
    1: 'human rights',
    2: 'international security',
    3: 'nuclear politics',
    4: 'economic development',
    5: 'israel-palestine conflict',
    6: 'world peace',
    7: 'sustainable development',
}

# Rebuild the frame from df_sent_topic instead of mutating a slice in place: this
# avoids chained-assignment writes (slow, and silently ineffective on copies) and
# keeps the cell re-runnable. reset_index() keeps the original row labels in an
# 'index' column, as before.
df_topic_theme = df_sent_topic[['Transcript', 'dominant_topic']].reset_index()
# Topic ids without an entry in TOPIC_NAMES are left as NaN.
df_topic_theme['topic_name'] = df_topic_theme['dominant_topic'].map(TOPIC_NAMES)

In [None]:
# Merge the df_topic_theme with the main dataframe
# The join key is the transcript text itself, so a text that occurs more than once on
# the right-hand side would multiply rows in df. Keep one lookup row per distinct
# transcript and let pandas verify the many-to-one relationship.
topic_lookup = df_topic_theme.drop_duplicates(subset='Transcript')
df1 = df.merge(topic_lookup, on = 'Transcript', how = 'left', validate = 'many_to_one')
# NOTE: rebinding df makes this cell non-idempotent; re-run from Section 1 after changes.
df = df1
print(df.sample(3))

## Section 4 : Extract Sentiment
In this section we show how we extract the sentiment of each speech using TextBlob and the VADER lexicon.

In [None]:
#We will extract the polarity level for each speech using Textblob
# Keep the row labels as an explicit column; they are the merge key for the score tables.
df['index'] = df.index
# Use the actual row labels (not a positional counter) so the key always matches df['index'].
index = df.index.tolist()
polarity = []
subjectivity = []

# calculating the polarity and subjectivity level of each transcript
for transcript_text in df.Transcript.values:
    # Build the TextBlob once per transcript and read both scores from it;
    # element 0 is the polarity, element 1 the subjectivity.
    sentiment = TextBlob(str(transcript_text)).sentiment
    polarity.append(sentiment[0])
    subjectivity.append(sentiment[1])

In [None]:
# Assemble the TextBlob scores into a table keyed by 'index' and attach them to df.
mydict = dict(polarity=polarity, subjectivity=subjectivity, index=index)
scores_textblob = pd.DataFrame(mydict)
df_sentiment = df.merge(scores_textblob, on='index')
print(df_sentiment.sample(3))

In [None]:
#We will extract the polarity level for each speech using Vader lexicon

# Instantiate the sentiment intensity analyzer with the existing lexicon
vader = SentimentIntensityAnalyzer()

# Iterate over the values positionally and record the real row labels as the key.
# Looking rows up with df.Transcript[i] is label-based and breaks (or picks the wrong
# row) whenever the index is not exactly 0..n-1. The loop also no longer rebinds
# `index`, which the TextBlob cell uses as a list.
list_index = df.index.tolist()
list_score = [vader.polarity_scores(str(transcript_text)) for transcript_text in df.Transcript.values]

In [None]:
# Turn the per-speech VADER score dicts into a table and add the merge key column
scores_vader = pd.DataFrame(list_score).assign(index=list_index)
scores_vader.sample(3)

In [None]:
# Join the DataFrames
# Attaches the VADER columns to the TextBlob-enriched frame via the shared 'index' key.
sentiment_df = df_sentiment.merge(scores_vader,on='index')
print(sentiment_df.columns)

In [None]:
# From here on `df` refers to the frame that also carries both sets of sentiment scores.
df = sentiment_df
print(df.sample(5))

## Section 5 : Sustainable development

### Data consolidation
We have [data](https://www.sustainabledevelopmentindex.org/methods) on the sustainable development index from 1990-2019. We now filter the UN speeches to only the ones that refer to sustainable development, and to match the same period.

In [None]:
#filter years from 1990 onwards to match the time interval that measures SDI
# Year is loaded with object dtype, so convert it to numbers before comparing it with
# an integer; comparing text values to an int raises a TypeError.
year_numeric = pd.to_numeric(df['Year'])
df2 = df[year_numeric >= 1990]
print(df2['Year'].min())
# Keep only the observations related to sustainable development
sus_df = df2[df2['topic_name'] == 'sustainable development']
print(len(sus_df))

In [None]:
# calculating the number of speeches per country that address sustainable development
# size() counts rows per country directly, instead of counting the non-null values of
# whichever column happens to sit at position 6.
group_df = sus_df.groupby("Country").size().reset_index(name='count')
print(group_df.sample(4))

In [None]:
# Reshape the SDI sheet from one column per year to one row per (country, year).
sdi2 = pd.melt(frame=sdi ,id_vars=["iso", "country"],var_name="Year",value_name="SDI")
# Compare the year as text so 2019 is excluded whether the sheet's year headers were
# read as strings or as integers.
sdi_l = sdi2[sdi2['Year'].astype(str) != '2019']
print(sdi_l['Year'].max())

In [None]:
# Average SDI per country (ISO code) over the years kept in sdi_l
sdi_value_column = sdi_l.columns[3]
group_sdi = (
    sdi_l.groupby(["iso"])[sdi_value_column]
    .mean()
    .reset_index()
    .rename(columns={'SDI': 'SDI_mean'})
)
print(group_sdi.sample(4))

In [None]:
# In order to visualize the SDI and the honesty ratio we define below, we download the geographical data using geopandas.
# NOTE(review): geopandas.datasets may not be available in newer geopandas releases, and
# some countries in this dataset may lack a usable iso_a3 code — confirm before relying on the map.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# The former `str(world['name']).lower()` line was removed: it built a lowercase string
# of the column's repr and discarded it, changing nothing.
print(world.sample(4))

In [None]:
# merge geographical data with SDI data
# Left join keeps every country shape; countries without an SDI row get NaN in SDI_mean.
# The duplicate key column 'iso' is dropped after the join.
merge_df1 = pd.merge(world, group_sdi,how = 'left', left_on=["iso_a3"], right_on=["iso"]).drop(columns='iso')
print(merge_df1.sample(4))

In [None]:
#merge with topic data
# Left join again: countries with no sustainable-development speech get NaN in 'count'.
# NOTE(review): assumes the speech 'Country' codes are ISO alpha-3 like iso_a3 — confirm.
map_df = pd.merge(merge_df1, group_df,how = 'left', left_on=["iso_a3"], right_on=["Country"]).drop(columns="Country")
print(map_df.sample(4))

We now normalize the counts of the number of times each country talked about sustainable development.
<br></br>
$$
		speech\: count_{scaled} = \frac{speech\: count - speech\: count_{min}}{speech\:count_{max} - speech\:count_{min}}
		$$
<br></br>
In addition, we defined an *Honesty Ratio* that measures the degree to which the countries actually implement what they talk about during the General Assembly.
<br></br>
		$$
		Honesty\: Ratio = \frac{speech\: count_{scaled}}{mean\: SDI}
		$$

Smaller values of the honesty ratio indicate better coordination from speech to implementation of sustainable development programs.

In [None]:
# Min-max scale the speech counts to the 0-1 range
count_min = map_df["count"].min()
count_span = map_df["count"].max() - count_min
map_df["normalized"] = (map_df["count"] - count_min) / count_span
# Honesty ratio: scaled speech count divided by the country's mean SDI
map_df['index'] = map_df['normalized'] / map_df["SDI_mean"]

print(map_df.sample(5))

In [None]:
# Now that we have the data we need, we can make an interactive plot to compare the countries in terms of SDI and Honesty Ratio.
# The 'index' column holds the honesty ratio computed in the previous cell.
# NOTE(review): both a geojson and locationmode='ISO-3' are supplied; check which of the
# two plotly actually uses to draw the country shapes.
fig = px.choropleth(map_df,
                    geojson = map_df.geometry,
                    locationmode = 'ISO-3',
                    locations = map_df.iso_a3,
                    color = 'index', ## to plot other variables, insert the name here
                    color_continuous_scale = 'blues',
                    projection = 'orthographic',
                    hover_name = 'name',
                    hover_data = ['SDI_mean', 'index'])
fig.update_geos(fitbounds = 'locations', visible = True)
fig.show()

In [None]:
# We can also view the top/worst performers in Honesty Ratio terms by sorting the dataframe.
#Top 10 worst coordination from speech to implement in improving sustainable environment
# Highest honesty ratio first ('index' is the honesty ratio column).
map_df[['name','count','SDI_mean','index']].sort_values(by=['index'],ascending=False).head(10)

In [None]:
#Top 10 best coordination
# Only countries with more than one sustainable-development speech are considered.
# dropna is now actually called (it was referenced without parentheses, so the cell
# displayed a bound method instead of a table), and the result is cut to ten rows
# with the same columns as the "worst" table.
(
    map_df.loc[map_df['count'] > 1, ['name', 'count', 'SDI_mean', 'index']]
    .dropna(subset=['index'])
    .sort_values(by=['index'])
    .head(10)
)

## Section 6 : Israel-Palestine

In [None]:
# Random peek at the sentiment-enriched frame used for the clustering below.
print(df.sample(4))

In [None]:
# Keep only speeches with the dominant topic of israel-palestine.
# .copy() gives an independent frame, so the column assignments below do not write
# into a slice of df (which triggers SettingWithCopyWarning and may not stick).
df_palestina = df[df['dominant_topic'] == 5].copy()

# Normalize the data
# Each sentiment score is passed through whiten and stored under a 'scaled_' prefix.
for score_column in ['polarity', 'subjectivity', 'pos', 'neg', 'neu']:
    df_palestina['scaled_' + score_column] = whiten(df_palestina[score_column])

# Our variables of interest for the clustering
variables = ['scaled_polarity','scaled_subjectivity','scaled_pos','scaled_neg','scaled_neu']

In [None]:
# Create dataframes for interval of 10 years
# Year is loaded with object dtype; convert once so the integer comparisons below
# work even when the values are stored as text.
palestina_year = pd.to_numeric(df_palestina['Year'])
df_1970_1980 = df_palestina[palestina_year < 1980]
df_1980_1990 = df_palestina[(palestina_year >= 1980) & (palestina_year < 1990)]
df_1990_2000 = df_palestina[(palestina_year >= 1990) & (palestina_year < 2000)]
df_2000_2010 = df_palestina[(palestina_year >= 2000) & (palestina_year < 2010)]
df_2010_current = df_palestina[palestina_year >= 2010]

In [None]:
# Average the clustering variables per country within each decade, so a country that
# spoke several times in a decade contributes a single row.
(
    grouped_df_1970_1980,
    grouped_df_1980_1990,
    grouped_df_1990_2000,
    grouped_df_2000_2010,
    grouped_df_2010_current,
) = (
    decade_frame.groupby(['Country'])[variables].mean()
    for decade_frame in (
        df_1970_1980,
        df_1980_1990,
        df_1990_2000,
        df_2000_2010,
        df_2010_current,
    )
)

In [None]:
# Elbow method: run k-means for 1 to 6 clusters on the first decade and compare distortions
num_clusters = range(1, 7)
# kmeans returns (centroids, distortion); only the distortion is needed for the plot
distortions = [
    kmeans(grouped_df_1970_1980[variables], cluster_count)[1]
    for cluster_count in num_clusters
]
# Distortion as a function of the number of clusters
elbow_plot_data = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})
sns.lineplot(x='num_clusters', y='distortions', data=elbow_plot_data)
plt.show()

In [None]:
# Create the cluster labels: fit 3 centroids per decade, then assign every country to
# its nearest centroid. Decades are processed in chronological order.
decade_label_columns = [
    (grouped_df_1970_1980, 'cluster_labels_1'),
    (grouped_df_1980_1990, 'cluster_labels_2'),
    (grouped_df_1990_2000, 'cluster_labels_3'),
    (grouped_df_2000_2010, 'cluster_labels_4'),
    (grouped_df_2010_current, 'cluster_labels_5'),
]
for decade_frame, label_column in decade_label_columns:
    cluster_centers, _ = kmeans(decade_frame[variables], 3)
    decade_frame[label_column], _ = vq(decade_frame[variables], cluster_centers)

In [None]:
# How to interpret the cluster labels
# Bar chart of the mean of each scaled sentiment variable per cluster for the first decade.
grouped_df_1970_1980.groupby('cluster_labels_1')[variables].mean().plot(kind='bar')