In [1]:
import numpy as np
import pandas as pd
import json
import csv
import regex as re
import seaborn as sns
import os
import re
import string

In [2]:
# Setting directory structure
# The project root defaults to the original author's machine; set the
# QUOTE_CLASSIFICATION_ROOT environment variable to run the notebook from a
# different checkout without editing this cell.
_root_override = os.environ.get('QUOTE_CLASSIFICATION_ROOT')
if _root_override:
    # Paths are built from root_dir by plain string concatenation, so it must end
    # with a separator; os.path.join(path, '') appends one only when it is missing.
    root_dir = os.path.join(_root_override, '')
else:
    root_dir = 'C:\\Users\\yashd\\Desktop\\rethink-media\\quote-classification\\'
# Folder holding the cleaned CSVs, relative to root_dir (trailing separator required)
data_dir = 'Data\\cleaned_data\\'

In [3]:
import requests

# JSON spelling table published by the American-British-English-Translator project
# (presumably British spelling -> American spelling, going by the file and variable names)
url ="https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/british_spellings.json"
# A timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() reports an HTTP error (e.g. 404) here rather than as a
# confusing JSON decoding failure on the error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
british_to_american_dict = response.json()
# Number of entries in the downloaded table
len(british_to_american_dict)

1730

In [4]:
%%time
# Importing cleaned sample data for EDA
os.chdir(root_dir)
GNI88_df = pd.read_csv(root_dir + data_dir + 'GNI88_cleaned_data.csv', low_memory = False)
# Dropping the row with 1 NaN value in QText
#GNI88_df[~GNI88_df['QText'].isna()]

Wall time: 28.2 s


In [5]:
# Lookup table: raw speaker ("Source Type") label -> broader classification group.
# The entries are listed as consecutive runs that share a group, in the order they
# are inserted into the dictionary; dictionary lookups are constant time.
_ordered_group_runs = [
    ("Foreign Government", [
        'Foreign Gov/Mil Official',
        'Military',                 # added in manually
        'EU Official',              # added in manually
        'Former Russian Official',  # added in manually
        'South Korean Official',    # added in manually
    ]),
    ("External Commentator", [
        'Media/Journalist',
        'Analyst/Commentator',
        'Citizen',
        'Blogger',
        'Public Polling',
        'Partisans/Fmr. Politicians',
    ]),
    ("Organization", ['Nuke Organization']),
    ("International", ['International Orgs']),
    ("Organization", [
        'Non-Profit/NGO',
        'Think Tanks',
        'Nuke Organization - Other',
    ]),
    ("US Federal Officials", [
        'US Rep. & Staff',
        'US Senate & Staff',
        'Federal Official',
        'State/Local Official',
        'Judicial Official',
        'Former Admin. Officials',
    ]),
    ("International", ['Regulator']),
    ("US Defense", [
        'US Military',
        'Defense Forces',
        'Defense',
        'US Police',
        'Deputy',
    ]),
    ("Academic", ['Academic']),
    ("Organization", ['Nuke Organization - Academic']),
    ("Academic", ['Nuclear Scientist']),
    ("Other", [
        'Other',
        'Chairman',
        'Terrorist/Extremist',
        'Corporate Official',
        'Information minister',
        'Religious/Clerical',
        'Attorney',
        'Ambassador',
        'Former Ambassador',
        'Nuclear Official',
        'Activist',                 # added in manually
    ]),
]
source_to_group = {
    source_label: group_name
    for group_name, source_labels in _ordered_group_runs
    for source_label in source_labels
}

In [6]:
# credit: Daniel
def assign_source_to_group(source_type):
    """Return the broader speaker group for a quote's source type.

    Inputs:
      - source_type: source type value from the quote dataframe. Normally a str,
        but any object is accepted (e.g. NaN/None for a missing value).
    Outputs:
      - str: the group stored for source_type in source_to_group, or "Other"
        when source_type is not a string or is not a key of source_to_group.
    """
    # isinstance (rather than an exact type() comparison) lets str subclasses,
    # e.g. numpy.str_, be looked up instead of always falling through to "Other".
    if not isinstance(source_type, str):
        return "Other"
    # Everything not covered in the keys falls under 'Other'
    return source_to_group.get(source_type, "Other")

In [7]:
# credit: Tiffany
# Derive each quote's speaker group from its "Source Type" label.
# Series.map calls the function once per value of that single column, which avoids
# building a full row object per record as DataFrame.apply(axis=1) does, and it
# still yields a plain Series when the frame is empty. Missing values are passed
# through to assign_source_to_group, which maps them to "Other".
GNI88_df["Speaker Group"] = GNI88_df["Source Type"].map(assign_source_to_group)

In [8]:
# Number of quotes per speaker group (value_counts orders the most frequent group first)
GNI88_df['Speaker Group'].value_counts()

US Federal Officials    135522
Foreign Government       98928
External Commentator     86408
Organization             37981
Other                    16586
International            11814
Academic                 10623
US Defense               10342
Name: Speaker Group, dtype: int64

In [9]:
%%time
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvec = TfidfVectorizer(min_df = 500)
sparse_tfidf = tfidfvec.fit_transform(GNI88_df['QText'].values.astype('str'))

Wall time: 1min 8s


In [10]:
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
# Dense TF-IDF table: one row per quote, one column per vocabulary term.
# NOTE: toarray() materialises the whole rows x terms matrix in memory.
tfidf = pd.DataFrame(
    sparse_tfidf.toarray(),  # the former [:shape[0]] slice selected every row and only copied the matrix
    columns=tfidfvec.get_feature_names_out(),
    # Reuse the quote frame's index: the label assignment below aligns on index,
    # so a differing index would silently leave NaN labels.
    index=GNI88_df.index,
)
tfidf['Speaker Group'] = GNI88_df["Speaker Group"]
tfidf.head()

Unnamed: 0,000,10,100,11,12,14,15,16,17,18,...,yongbyon,york,you,young,your,zaporizhzhia,zarif,zero,zone,Speaker Group
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,External Commentator
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,US Defense
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,US Defense
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,US Federal Officials
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.19658,0.0,0.0,0.0,0.0,0.0,0.0,US Federal Officials


In [11]:
# Candidate distinguishing terms for foreign government sources:
# per-term maximum TF-IDF score over that group's quotes, highest first.
# NOTE(review): every one of the top 30 terms ties at exactly 1.0 in the saved output,
# stop words included - presumably from quotes made of a single vocabulary term - so
# this ranking may not reflect distinctiveness; TODO confirm / consider another statistic.
foreign_gov_mask = tfidf['Speaker Group'] == "Foreign Government"
foreign_gov = tfidf.loc[foreign_gov_mask]
foreign_gov_peaks = foreign_gov.max(numeric_only=True)
tfi_ranked = foreign_gov_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

perfect     1.0
he          1.0
strong      1.0
regime      1.0
red         1.0
groups      1.0
great       1.0
and         1.0
behavior    1.0
reckless    1.0
good        1.0
take        1.0
before      1.0
tehran      1.0
another     1.0
that        1.0
the         1.0
israel      1.0
actions     1.0
measures    1.0
military    1.0
act         1.0
be          1.0
strikes     1.0
standing    1.0
to          1.0
historic    1.0
is          1.0
reset       1.0
iran        1.0
dtype: float64

In [12]:
# Candidate distinguishing terms for external commentator sources:
# per-term maximum TF-IDF score over that group's quotes, highest first.
# NOTE(review): every one of the top 30 terms ties at exactly 1.0 in the saved output,
# stop words included - presumably from quotes made of a single vocabulary term - so
# this ranking may not reflect distinctiveness; TODO confirm / consider another statistic.
external_commentator_mask = tfidf['Speaker Group'] == "External Commentator"
external_commentator = tfidf.loc[external_commentator_mask]
external_commentator_peaks = external_commentator.max(numeric_only=True)
tfi_ranked = external_commentator_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

he             1.0
end            1.0
new            1.0
while          1.0
export         1.0
feel           1.0
for            1.0
absolutely     1.0
war            1.0
geneva         1.0
great          1.0
guy            1.0
more           1.0
wolf           1.0
sanctions      1.0
announced      1.0
and            1.0
an             1.0
said           1.0
in             1.0
risk           1.0
we             1.0
right          1.0
maybe          1.0
respond        1.0
republicans    1.0
well           1.0
it             1.0
what           1.0
readiness      1.0
dtype: float64

In [13]:
# Candidate distinguishing terms for organization sources:
# per-term maximum TF-IDF score over that group's quotes, highest first.
# NOTE(review): every one of the top 30 terms ties at exactly 1.0 in the saved output,
# stop words included - presumably from quotes made of a single vocabulary term - so
# this ranking may not reflect distinctiveness; TODO confirm / consider another statistic.
organization_mask = tfidf['Speaker Group'] == "Organization"
organization = tfidf.loc[organization_mask]
organization_peaks = organization.max(numeric_only=True)
tfi_ranked = organization_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

reality        1.0
tuesday        1.0
quite          1.0
senators       1.0
an             1.0
of             1.0
and            1.0
don            1.0
no             1.0
the            1.0
missile        1.0
not            1.0
end            1.0
iranians       1.0
is             1.0
by             1.0
high           1.0
significant    1.0
dangerous      1.0
modern         1.0
condition      1.0
in             1.0
total          1.0
totally        1.0
more           1.0
it             1.0
weak           1.0
national       1.0
him            1.0
limits         1.0
dtype: float64

In [15]:
# Candidate distinguishing terms for US federal official sources:
# per-term maximum TF-IDF score over that group's quotes, highest first.
# NOTE(review): every one of the top 30 terms ties at exactly 1.0 in the saved output,
# stop words included - presumably from quotes made of a single vocabulary term - so
# this ranking may not reflect distinctiveness; TODO confirm / consider another statistic.
us_fed_officials_mask = tfidf['Speaker Group'] == "US Federal Officials"
us_fed_officials = tfidf.loc[us_fed_officials_mask]
us_fed_officials_peaks = us_fed_officials.max(numeric_only=True)
tfi_ranked = us_fed_officials_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

bombs          1.0
big            1.0
citizens       1.0
everything     1.0
for            1.0
provocation    1.0
war            1.0
had            1.0
diplomat       1.0
being          1.0
diplomatic     1.0
language       1.0
direct         1.0
that           1.0
large          1.0
of             1.0
confidence     1.0
team           1.0
quite          1.0
or             1.0
end            1.0
course         1.0
think          1.0
they           1.0
him            1.0
we             1.0
public         1.0
china          1.0
obligations    1.0
in             1.0
dtype: float64

In [16]:
# Candidate distinguishing terms for US defense sources:
# per-term maximum TF-IDF score over that group's quotes, highest first.
# NOTE(review): the leading terms tie at exactly 1.0 in the saved output, stop words
# included - presumably from quotes made of a single vocabulary term - so this
# ranking may not reflect distinctiveness; TODO confirm / consider another statistic.
us_defense_mask = tfidf['Speaker Group'] == "US Defense"
us_defense = tfidf.loc[us_defense_mask]
us_defense_peaks = us_defense.max(numeric_only=True)
tfi_ranked = us_defense_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

weapon         1.000000
significant    1.000000
at             1.000000
and            1.000000
technical      1.000000
of             1.000000
as             1.000000
right          1.000000
failed         1.000000
no             1.000000
absolutely     1.000000
figure         1.000000
do             1.000000
was            1.000000
continues      0.978860
sense          0.961994
includes       0.946695
missile        0.942023
quite          0.939917
senator        0.919043
grade          0.904765
force          0.903630
planned        0.900066
testing        0.889684
koreans        0.881353
have           0.875684
nobody         0.874204
exercise       0.870016
degree         0.868204
closed         0.866976
dtype: float64

In [17]:
# Candidate distinguishing terms for academic sources:
# per-term maximum TF-IDF score over that group's quotes, highest first.
# NOTE(review): the leading terms tie at exactly 1.0 in the saved output, stop words
# included - presumably from quotes made of a single vocabulary term - so this
# ranking may not reflect distinctiveness; TODO confirm / consider another statistic.
academic_mask = tfidf['Speaker Group'] == "Academic"
academic = tfidf.loc[academic_mask]
academic_peaks = academic.max(numeric_only=True)
tfi_ranked = academic_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

on            1.000000
limited       1.000000
it            1.000000
are           1.000000
very          1.000000
fail          1.000000
western       0.955138
complex       0.939251
don           0.936197
was           0.929971
second        0.923042
problem       0.922286
definitely    0.916769
challenge     0.914372
political     0.911434
break         0.909974
claim         0.908770
pressure      0.889169
boost         0.885022
seems         0.877534
am            0.872446
think         0.871049
geneva        0.870673
create        0.868965
future        0.867319
budget        0.865673
talk          0.859919
worse         0.857654
iranians      0.844731
feel          0.835533
dtype: float64

In [18]:
# Candidate distinguishing terms for sources in the "Other" group:
# per-term maximum TF-IDF score over that group's quotes, highest first.
# NOTE(review): the leading terms tie at exactly 1.0 in the saved output, stop words
# included - presumably from quotes made of a single vocabulary term - so this
# ranking may not reflect distinctiveness; TODO confirm / consider another statistic.
other_mask = tfidf['Speaker Group'] == "Other"
other = tfidf.loc[other_mask]
other_peaks = other.max(numeric_only=True)
tfi_ranked = other_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

korea            1.000000
imminent         1.000000
is               1.000000
destabilizing    1.000000
and              1.000000
none             1.000000
false            1.000000
significant      1.000000
another          1.000000
group            0.979319
attacks          0.978140
information      0.977462
urged            0.960909
consequences     0.960620
bill             0.942661
missile          0.942023
firing           0.937102
re               0.935168
currently        0.932141
strongly         0.931726
old              0.928433
certainly        0.927862
almost           0.925197
someone          0.911746
fight            0.909235
meeting          0.898669
china            0.896448
history          0.887377
percent          0.886603
failed           0.883169
dtype: float64

In [19]:
# Candidate distinguishing terms for international sources:
# per-term maximum TF-IDF score over that group's quotes, highest first.
# NOTE(review): the leading terms tie at exactly 1.0 in the saved output, stop words
# included - presumably from quotes made of a single vocabulary term - so this
# ranking may not reflect distinctiveness; TODO confirm / consider another statistic.
international_mask = tfidf['Speaker Group'] == "International"
international = tfidf.loc[international_mask]
international_peaks = international.max(numeric_only=True)
tfi_ranked = international_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

is             1.000000
political      1.000000
game           1.000000
double         1.000000
vienna         1.000000
and            1.000000
very           1.000000
strong         1.000000
useful         1.000000
many           1.000000
credible       1.000000
quite          1.000000
important      1.000000
development    1.000000
enemy          1.000000
it             1.000000
progress       1.000000
concern        1.000000
thursday       0.977591
sound          0.974631
multiple       0.968623
provocative    0.968086
highly         0.964659
resolutions    0.963956
information    0.957389
halt           0.931577
obviously      0.920194
outcome        0.895733
tehran         0.890706
choice         0.878926
dtype: float64