In [1]:
import csv
import json
import os

import numpy as np
import pandas as pd
import regex as re
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Directory structure for this notebook.
# root_dir: project root. Defaults to the original author's local checkout, but can be
#   overridden with the QUOTE_CLASSIFICATION_ROOT environment variable so the notebook
#   can run on another machine without editing this cell.
# data_dir: location of the cleaned data relative to root_dir (Windows-style separators).
root_dir = (
    os.environ.get('QUOTE_CLASSIFICATION_ROOT')
    or 'C:\\Users\\yashd\\Desktop\\rethink-media\\quote-classification\\'
)
# Paths are built by plain string concatenation (root_dir + data_dir + file name),
# so root_dir has to end with a path separator.
if not root_dir.endswith(('\\', '/')):
    root_dir += os.sep
data_dir = 'Data\\cleaned_data\\'

In [3]:
# Load the cleaned GNI88 sample used for the exploratory analysis.
# The working directory is switched to the project root first.
os.chdir(root_dir)
GNI88_df = pd.read_csv(
    ''.join([root_dir, data_dir, 'GNI88_cleaned_sample.csv'])
)

In [4]:
# Constant-time lookup table from a quote's source type to its speaker group.
# Written as group -> member source types and then inverted, so the membership
# of each group can be read at a glance.
_speaker_group_members = {
    "Foreign Government": [
        'Foreign Gov/Mil Official',
        'Military',                 # added in manually
        'EU Official',              # added in manually
        'Former Russian Official',  # added in manually
        'South Korean Official',    # added in manually
    ],
    "External Commentator": [
        'Media/Journalist',
        'Analyst/Commentator',
        'Citizen',
        'Blogger',
        'Public Polling',
        'Partisans/Fmr. Politicians',
    ],
    "Organization": [
        'Nuke Organization',
        'Non-Profit/NGO',
        'Think Tanks',
        'Nuke Organization - Other',
        'Nuke Organization - Academic',
    ],
    "International": [
        'International Orgs',
        'Regulator',
    ],
    "US Congress": [
        'US Rep. & Staff',
        'US Senate & Staff',
    ],
    "US Federal Officials": [
        'Federal Official',
        'State/Local Official',
        'Judicial Official',
        'Former Admin. Officials',
    ],
    "US Defense": [
        'US Military',
        'Defense Forces',
        'Defense',
        'US Police',
        'Deputy',
    ],
    "Academic": [
        'Academic',
        'Nuclear Scientist',
    ],
    "Other": [
        'Other',
        'Chairman',
        'Terrorist/Extremist',
        'Corporate Official',
        'Information minister',
        'Religious/Clerical',
        'Attorney',
        'Ambassador',
        'Former Ambassador',
        'Nuclear Official',
        'Activist',                 # added in manually
    ],
}

source_to_group = {
    source_type: group
    for group, members in _speaker_group_members.items()
    for source_type in members
}

In [5]:
# credit: Daniel
def assign_source_to_group(source_type):
  """Map a quote's source type to its broader speaker group.

     Inputs:
     - source_type: source type from quote dataframe; expected to be a str,
       any other value (e.g. NaN/None for a missing entry) is treated as unknown
     Outputs:
     - str of the bigger speaker category to which source_type belongs;
       "Other" when source_type is not a string
     Raises:
     - KeyError if source_type is a string with no entry in source_to_group"""
  # isinstance (rather than an exact type() comparison) also accepts str
  # subclasses such as numpy.str_, which would otherwise be bucketed as "Other".
  if not isinstance(source_type, str):
    return "Other"
  try:
    return source_to_group[source_type]
  except KeyError:
    # Same exception type as the bare lookup, but the message says how to fix it.
    raise KeyError(
      f"Unmapped source type {source_type!r}; add it to source_to_group"
    ) from None

In [6]:
# credit: Tiffany
# Label every quote with its speaker group. Only the "Source Type" column is needed,
# so map the function over that column element-wise (missing values are passed
# through to assign_source_to_group as well) instead of materialising every row
# with DataFrame.apply(axis=1).
GNI88_df["Speaker Group"] = GNI88_df["Source Type"].map(assign_source_to_group)

In [7]:
# Number of quotes assigned to each speaker group
speaker_group_counts = GNI88_df['Speaker Group'].value_counts()
speaker_group_counts

US Federal Officials    488
Foreign Government      487
External Commentator    431
Organization            188
US Congress             175
Other                    81
International            61
Academic                 47
US Defense               42
Name: Speaker Group, dtype: int64

In [8]:
%%time
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvec = TfidfVectorizer()
sparse_tfidf = tfidfvec.fit_transform(GNI88_df['QText'])

Wall time: 360 ms


In [9]:
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
# Dense document-term table: one row per quote, one column per vocabulary term.
# The rows of sparse_tfidf are in the same order as GNI88_df's rows, so reuse
# GNI88_df's index: the "Speaker Group" assignment below aligns on index labels and
# would otherwise pair quotes with the wrong group (or NaN) if GNI88_df were ever
# filtered or re-indexed.
tfidf = pd.DataFrame(
    sparse_tfidf.toarray(),
    index=GNI88_df.index,
    columns=tfidfvec.get_feature_names_out(),
)
tfidf['Speaker Group'] = GNI88_df["Speaker Group"]
tfidf.head()

Unnamed: 0,000,01,06,08,10,100,11,117,12,1251,...,zarif,zebari,zero,zeroed,zhao,zimbabwe,zionist,zone,½very,Speaker Group
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,US Federal Officials
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Academic
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,External Commentator
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,External Commentator
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,International


In [10]:
# Vocabulary terms ranked by the highest TF-IDF weight they reach in any single
# Foreign Government quote (candidate terms for distinguishing that group)
foreign_gov = tfidf.loc[tfidf['Speaker Group'].eq("Foreign Government")]
tfi_ranked = (
    foreign_gov
    .max(numeric_only=True)
    .sort_values(ascending=False)
)
tfi_ranked.iloc[:30]

transparency     1.000000
regrettable      0.913663
alliance         0.768822
anyway           0.704642
reporting        0.703620
motive           0.678958
complies         0.664410
paves            0.664309
hypersonic       0.663833
programme        0.658202
suspension       0.653073
provoked         0.640745
peaceful         0.626072
respond          0.613546
saw              0.611792
immediately      0.610720
works            0.596979
ere              0.592748
icbms            0.585818
decision         0.585396
dragged          0.585348
price            0.583478
stupidest        0.583370
purposes         0.581497
information      0.578012
threats          0.576867
inviolability    0.576527
risks            0.575073
expense          0.567936
needs            0.566924
dtype: float64

In [11]:
# Vocabulary terms ranked by the highest TF-IDF weight they reach in any single
# External Commentator quote (candidate terms for distinguishing that group)
external_commentator = tfidf.loc[tfidf['Speaker Group'].eq("External Commentator")]
tfi_ranked = (
    external_commentator
    .max(numeric_only=True)
    .sort_values(ascending=False)
)
tfi_ranked.iloc[:30]

balanced      1.000000
deterrence    0.768585
fine          0.729195
inquire       0.711468
true          0.684469
guy           0.610364
privately     0.602120
your          0.594310
too           0.593885
me            0.570042
men           0.568367
rule          0.568328
operate       0.566780
mistake       0.566407
thing         0.556501
crap          0.549951
seoul         0.549910
leaning       0.549647
shed          0.545548
considered    0.543536
nukes         0.540245
unclear       0.538810
khamenei      0.536783
cash          0.535875
ruling        0.534533
got           0.533740
you           0.528882
term          0.528480
vote          0.526156
ought         0.526070
dtype: float64

In [12]:
# Vocabulary terms ranked by the highest TF-IDF weight they reach in any single
# Organization quote (candidate terms for distinguishing that group)
organization = tfidf.loc[tfidf['Speaker Group'].eq("Organization")]
tfi_ranked = (
    organization
    .max(numeric_only=True)
    .sort_values(ascending=False)
)
tfi_ranked.iloc[:30]

senators         0.752786
ir               0.702909
feet             0.672560
capability       0.672142
ambitious        0.647604
breaker          0.642886
findings         0.613641
contradictory    0.585584
unknowns         0.571284
78               0.562934
dismantlement    0.553648
prominent        0.549619
they             0.545887
agenda           0.534081
billion          0.526825
reac             0.520053
heard            0.517735
look             0.513092
76               0.506082
unstable         0.501340
purpose          0.496696
iaea             0.492063
think            0.485044
strength         0.484053
statements       0.482933
what             0.479758
unannounced      0.479680
launchers        0.477703
kill             0.474025
become           0.473842
dtype: float64

In [14]:
# Vocabulary terms ranked by the highest TF-IDF weight they reach in any single
# US Congress quote (candidate terms for distinguishing that group)
us_congress = tfidf.loc[tfidf['Speaker Group'].eq("US Congress")]
tfi_ranked = (
    us_congress
    .max(numeric_only=True)
    .sort_values(ascending=False)
)
tfi_ranked.iloc[:30]

supports        0.777522
defend          0.751664
accident        0.736171
true            0.722789
trust           0.652738
capitulating    0.623996
right           0.611072
caution         0.605200
running         0.558650
ratcheted       0.558126
debris          0.557774
highest         0.552155
honor           0.547316
cia             0.544836
text            0.543521
accompanying    0.543521
majority        0.521808
waiting         0.517330
replied         0.516326
serious         0.511555
degree          0.509329
close           0.492846
eliminates      0.485683
w93             0.483123
eased           0.480592
boxing          0.480321
respond         0.477825
friendly        0.474107
creates         0.473665
succeed         0.471564
dtype: float64

In [15]:
# Vocabulary terms ranked by the highest TF-IDF weight they reach in any single
# US Federal Officials quote (candidate terms for distinguishing that group)
us_fed_officials = tfidf.loc[tfidf['Speaker Group'].eq("US Federal Officials")]
tfi_ranked = (
    us_fed_officials
    .max(numeric_only=True)
    .sort_values(ascending=False)
)
tfi_ranked.iloc[:30]

implicit       0.815180
defend         0.807438
paying         0.795691
prefers        0.767420
informed       0.738646
alternative    0.714254
bluster        0.714110
gotten         0.699090
tomorrow       0.683082
bite           0.656715
she            0.647778
interview      0.646565
resumption     0.645815
deceptions     0.645572
lick           0.638129
hoped          0.632656
minds          0.631501
formalize      0.621593
conducted      0.621492
now            0.613614
happens        0.605635
dubbing        0.604691
reining        0.601478
intrusive      0.600815
increased      0.582552
resort         0.580608
options        0.577350
preferred      0.576895
seven          0.574873
bank           0.572595
dtype: float64

In [16]:
# Vocabulary terms ranked by the highest TF-IDF weight they reach in any single
# US Defense quote (candidate terms for distinguishing that group)
us_defense = tfidf.loc[tfidf['Speaker Group'].eq("US Defense")]
tfi_ranked = (
    us_defense
    .max(numeric_only=True)
    .sort_values(ascending=False)
)
tfi_ranked.iloc[:30]

equivalent       0.714039
investigation    0.703242
received         0.644200
grade            0.633402
various          0.632661
assessed         0.586807
wind             0.583162
risky            0.519410
enormously       0.519410
type             0.518817
path             0.515547
here             0.507694
savings          0.501398
identified       0.475690
actually         0.462342
air              0.460489
atlantic         0.444749
well             0.432053
category         0.428060
unsuccessful     0.426214
signer           0.413652
absent           0.413652
warships         0.403661
signs            0.403661
answered         0.402916
we               0.401739
shelter          0.401335
ged              0.401335
there            0.400897
tourism          0.397578
dtype: float64

In [17]:
# Vocabulary terms ranked by the highest TF-IDF weight they reach in any single
# Academic quote (candidate terms for distinguishing that group)
academic = tfidf.loc[tfidf['Speaker Group'].eq("Academic")]
tfi_ranked = (
    academic
    .max(numeric_only=True)
    .sort_values(ascending=False)
)
tfi_ranked.iloc[:30]

option             0.704566
pushing            0.663678
peaceable          0.607813
ending             0.537388
negotiable         0.520896
mouth              0.513780
disagreement       0.500923
board              0.485431
historic           0.476346
hear               0.468748
matter             0.461420
one                0.460902
advocates          0.431302
accomplishments    0.418524
possible           0.416828
fails              0.414764
our                0.407916
containment        0.401937
seeing             0.399789
feels              0.397066
climate            0.396795
proud              0.381841
still              0.372008
you                0.367939
mideast            0.361544
pendulum           0.359578
swing              0.359578
distracted         0.355274
chaotic            0.355274
theatrics          0.355274
dtype: float64

In [18]:
# Vocabulary terms ranked by the highest TF-IDF weight they reach in any single
# quote from the "Other" speaker group (candidate terms for distinguishing that group)
other = tfidf.loc[tfidf['Speaker Group'].eq("Other")]
tfi_ranked = (
    other
    .max(numeric_only=True)
    .sort_values(ascending=False)
)
tfi_ranked.iloc[:30]

paint           0.696367
applicable      0.662309
very            0.644025
dime            0.593447
toll            0.568697
food            0.566047
polish          0.563696
duty            0.563426
attackers       0.560006
definitely      0.546891
demolition      0.543420
pivot           0.541432
plant           0.535110
case            0.531544
react           0.528431
combat          0.525065
stopped         0.522144
breach          0.518461
religious       0.511932
gunpoint        0.511802
technicians     0.511802
special         0.511374
multilateral    0.486205
bash            0.484576
hypersonic      0.478633
didn            0.474784
successful      0.470142
madman          0.470042
northrop        0.463932
directed        0.463258
dtype: float64

In [19]:
# Vocabulary terms ranked by the highest TF-IDF weight they reach in any single
# International quote (candidate terms for distinguishing that group)
international = tfidf.loc[tfidf['Speaker Group'].eq("International")]
tfi_ranked = (
    international
    .max(numeric_only=True)
    .sort_values(ascending=False)
)
tfi_ranked.iloc[:30]

hopeful        0.806031
limited        0.758911
practices      0.729898
progress       0.594632
related        0.560234
organized      0.559499
ongoing        0.544969
regrettably    0.534008
discussions    0.530758
financial      0.507258
assessment     0.506564
repository     0.495759
knowledge      0.495759
enrichment     0.489681
solidly        0.488459
enter          0.486831
2003           0.481773
current        0.476930
phase          0.468164
arrived        0.465573
provide        0.459447
agency         0.455951
blasts         0.455593
chamber        0.455593
fakhrizadeh    0.452307
ballistic      0.449954
drive          0.432996
offers         0.432445
position       0.426659
can            0.424393
dtype: float64