In [1]:
import numpy as np
import pandas as pd
import json
import csv
import regex as re
import seaborn as sns
import os
import re
import string
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

In [2]:
# Setting directory structure
# The project root defaults to the original author's checkout; set the
# QUOTE_CLASSIFICATION_ROOT environment variable to run the notebook from a
# different location without editing this cell.
_default_root_dir = 'C:\\Users\\yashd\\Desktop\\rethink-media\\quote-classification\\'
_root_override = os.environ.get('QUOTE_CLASSIFICATION_ROOT')
# os.path.join(path, '') guarantees a trailing separator, which matters because
# paths are assembled by plain concatenation (root_dir + data_dir + file name).
root_dir = os.path.join(_root_override, '') if _root_override else _default_root_dir
# Built with os.path.join so the separator matches the host OS; on Windows this
# evaluates to the same 'Data\\cleaned_data\\' string as the previous literal.
data_dir = os.path.join('Data', 'cleaned_data', '')

In [3]:
# Load the cleaned sample that the rest of the notebook explores.
os.chdir(root_dir)  # make the project root the working directory
sample_csv = ''.join([root_dir, data_dir, 'GNI88_cleaned_sample.csv'])
GNI88_df = pd.read_csv(sample_csv)

In [4]:
def ultimate_tokenize(sentence):
    """Lower-case ``sentence`` and split it into word tokens.

    ASCII punctuation and digits are deleted (not replaced by spaces) before
    the text is lower-cased and handed to NLTK's ``word_tokenize``.
    """
    unwanted_chars = string.punctuation + string.digits
    deletion_table = str.maketrans('', '', unwanted_chars)
    stripped = sentence.translate(deletion_table)
    return word_tokenize(stripped.lower())
_cleaning_stop_set = None  # built lazily on the first cleaning() call


def _get_cleaning_stop_set():
    """Return the set of tokens that cleaning() discards, building it once."""
    global _cleaning_stop_set
    if _cleaning_stop_set is None:
        stops = set(stopwords.words('english'))
        # Conversational filler that carries no topical signal.
        stops.update(['yeah','hello','ye','yes','okay','ok'])
        # Punctuation marks and contraction fragments a tokenizer can emit.
        stops.update('.,[,],(,),;,/,-,\',?,",:,<,>,n\'t,|,#,\'s,\",\'re,\'ve,\'ll,\'d,\'re,’'.split(','))
        # The comma cannot appear in the comma-separated literal above.
        stops.add(',')
        _cleaning_stop_set = frozenset(stops)
    return _cleaning_stop_set


def cleaning(interview):
    """Tokenize a quote, drop stop words, and return the rest as one string.

    Inputs:
    - interview: str, raw quote text. Any non-string value (e.g. NaN or None
      from a missing cell) is treated as empty text.
    Outputs:
    - str of the surviving lower-cased tokens joined by single spaces; '' when
      nothing survives or the input is not a string.

    The stop list is NLTK's English stop words plus a few filler words and
    punctuation tokens. It is built once and kept as a frozenset so that each
    call does constant-time membership tests instead of rebuilding and
    scanning a list.

    NOTE(review): ultimate_tokenize deletes apostrophes before this filter
    runs, so contractions arrive as e.g. "didnt"/"weve" and no longer match
    NLTK entries such as "didn't" - confirm whether they should be filtered.
    """
    if not isinstance(interview, str):
        return ''

    stop_set = _get_cleaning_stop_set()
    kept = [token for token in ultimate_tokenize(interview) if token not in stop_set]
    return ' '.join(kept)

In [8]:
# Run every quote through cleaning() and store the result next to the raw text.
cleaned_quote_texts = [cleaning(quote) for quote in GNI88_df['QText']]
GNI88_df['Cleaned QText'] = cleaned_quote_texts

In [9]:
# Each speaker classification group, listed with the raw source types that
# roll up into it.
_group_members = {
    "Foreign Government": ['Foreign Gov/Mil Official', 'Military'],
    "External Commentator": ['Media/Journalist', 'Analyst/Commentator', 'Citizen',
                             'Blogger', 'Public Polling', 'Partisans/Fmr. Politicians'],
    "Organization": ['Nuke Organization', 'Non-Profit/NGO', 'Think Tanks',
                     'Nuke Organization - Other', 'Nuke Organization - Academic'],
    "International": ['International Orgs', 'Regulator'],
    "US Congress": ['US Rep. & Staff', 'US Senate & Staff'],
    "US Federal Officials": ['Federal Official', 'State/Local Official',
                             'Judicial Official', 'Former Admin. Officials'],
    "US Defense": ['US Military', 'Defense Forces', 'Defense', 'US Police', 'Deputy'],
    "Academic": ['Academic', 'Nuclear Scientist'],
    "Other": ['Other', 'Chairman', 'Terrorist/Extremist', 'Corporate Official',
              'Information minister', 'Religious/Clerical', 'Attorney',
              'Ambassador', 'Nuclear Official'],
}

# Inverted view used for lookups: source type -> classification group, so a
# single dict access converts a speaker type to its group in constant time.
source_to_group = {
    source: group
    for group, members in _group_members.items()
    for source in members
}

In [10]:
# credit: Daniel
def assign_source_to_group(source_type):
    """Map a raw source type to its broader speaker group.

    Inputs:
    - source_type: source type value from the quote dataframe; normally a str,
      but non-string values (e.g. NaN or None for a missing cell) are accepted.
    Outputs:
    - str naming the speaker group to which source_type belongs; "Other" when
      source_type is not a string.
    Raises:
    - KeyError if source_type is a string that is not a key of source_to_group.
    """
    # isinstance (rather than an exact type() comparison) so that str
    # subclasses such as numpy.str_ are looked up instead of being lumped
    # into "Other".
    if not isinstance(source_type, str):
        return "Other"
    return source_to_group[source_type]

In [11]:
# credit: Tiffany
# Series.map visits only the "Source Type" column, avoiding the per-row Series
# objects that DataFrame.apply(axis=1) constructs. Missing values are still
# handed to assign_source_to_group because na_action keeps its default (None).
GNI88_df["Speaker Group"] = GNI88_df["Source Type"].map(assign_source_to_group)

In [12]:
# Display the derived "Speaker Group" column.
GNI88_df['Speaker Group']

0       US Federal Officials
1                   Academic
2       External Commentator
3       External Commentator
4              International
                ...         
1995    US Federal Officials
1996      Foreign Government
1997            Organization
1998            Organization
1999    US Federal Officials
Name: Speaker Group, Length: 2000, dtype: object

In [13]:
%%time
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a default-configured vectorizer on the cleaned quotes and encode each
# quote as one row of a sparse TF-IDF matrix.
quote_corpus = GNI88_df['Cleaned QText']
tfidfvec = TfidfVectorizer()
sparse_tfidf = tfidfvec.fit_transform(quote_corpus)

Wall time: 151 ms


In [14]:
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tfidfvec, 'get_feature_names_out'):
    vocabulary_terms = tfidfvec.get_feature_names_out()
else:
    vocabulary_terms = tfidfvec.get_feature_names()
# Reuse GNI88_df's index so the label assignment below aligns row-for-row even
# if the quote frame does not carry a default RangeIndex.
tfidf = pd.DataFrame(sparse_tfidf.toarray(), columns=vocabulary_terms, index=GNI88_df.index)
tfidf['Speaker Group'] = GNI88_df["Speaker Group"]
tfidf.head()



Unnamed: 0,abandon,abandoned,abandoning,abandons,abbas,abe,abedini,abes,abide,abiding,...,zero,zeroed,zeroemphasis,zerosum,zhao,zimbabwe,zionist,zone,½very,Speaker Group
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,US Federal Officials
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Academic
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,External Commentator
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,External Commentator
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,International


In [15]:
# Candidate distinguishing terms for foreign government sources: the highest
# TF-IDF weight each term reaches in any single quote from that group,
# ranked from largest to smallest.
is_foreign_gov = tfidf['Speaker Group'].eq("Foreign Government")
foreign_gov = tfidf.loc[is_foreign_gov]
foreign_gov_peaks = foreign_gov.max(numeric_only=True)
tfi_ranked = foreign_gov_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

transparency    1.000000
regrettable     1.000000
information     1.000000
motive          0.856162
alliance        0.796751
reporting       0.796481
anyway          0.786807
programme       0.724570
time            0.721270
ere             0.716422
complies        0.706621
icbms           0.700490
uk              0.699263
expense         0.698716
peaceful        0.689201
purposes        0.682101
price           0.681845
ambitions       0.681570
infringement    0.674203
hypersonic      0.667195
provoked        0.665250
paves           0.665029
facing          0.661381
needs           0.658573
respond         0.650791
hurry           0.641937
dragged         0.640359
differences     0.638612
saw             0.633452
immediately     0.629881
dtype: float64

In [16]:
# Candidate distinguishing terms for external commentator sources: the highest
# TF-IDF weight each term reaches in any single quote from that group,
# ranked from largest to smallest.
is_external_commentator = tfidf['Speaker Group'].eq("External Commentator")
external_commentator = tfidf.loc[is_external_commentator]
external_commentator_peaks = external_commentator.max(numeric_only=True)
tfi_ranked = external_commentator_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

balanced       1.000000
fine           1.000000
true           0.905471
deterrence     0.833825
inquire        0.751915
thing          0.733268
nukes          0.714092
libertarian    0.679581
rule           0.666728
privately      0.655690
leaning        0.655232
ought          0.643832
shed           0.638912
men            0.637163
mistake        0.629686
khameneis      0.624889
ruling         0.620932
guy            0.618799
prove          0.616971
revelation     0.608595
unclear        0.604300
cash           0.603835
crap           0.600876
weve           0.592167
hes            0.583395
impact         0.574793
deeply         0.574754
plane          0.572393
god            0.569153
thought        0.567019
dtype: float64

In [17]:
# Candidate distinguishing terms for organization sources: the highest TF-IDF
# weight each term reaches in any single quote from that group, ranked from
# largest to smallest.
is_organization = tfidf['Speaker Group'].eq("Organization")
organization = tfidf.loc[is_organization]
organization_peaks = organization.max(numeric_only=True)
tfi_ranked = organization_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

capability       0.805402
senators         0.784598
ambitious        0.771486
ir               0.761315
feet             0.741038
dealbreaker      0.740316
findings         0.710514
think            0.671833
unknowns         0.660318
presented        0.641899
contradictory    0.637340
agenda           0.636247
plugs            0.629004
purpose          0.617776
look             0.617735
iaeas            0.616366
prominent        0.593058
reac             0.592405
billion          0.582465
evidence         0.569754
heard            0.563494
dismantlement    0.561356
plenty           0.560421
turning          0.548174
launchers        0.546902
fissile          0.546723
become           0.545653
stir             0.545040
research         0.542343
none             0.534420
dtype: float64

In [27]:
# Candidate distinguishing terms for US congress sources: the highest TF-IDF
# weight each term reaches in any single quote from that group, ranked from
# largest to smallest and shown as a one-column frame.
is_us_congress = tfidf['Speaker Group'].eq("US Congress")
us_congress = tfidf.loc[is_us_congress]
us_congress_peaks = us_congress.max(numeric_only=True)
tfi_ranked = us_congress_peaks.sort_values(ascending=False).to_frame()
tfi_ranked.head(30)

Unnamed: 0,0
supports,0.863876
accident,0.844476
defend,0.794806
true,0.761637
trust,0.709332
capitulating,0.694468
running,0.683837
right,0.657924
ratcheted,0.642685
caution,0.627375


In [19]:
# Candidate distinguishing terms for US federal official sources: the highest
# TF-IDF weight each term reaches in any single quote from that group,
# ranked from largest to smallest.
is_us_fed_official = tfidf['Speaker Group'].eq("US Federal Officials")
us_fed_officials = tfidf.loc[is_us_fed_official]
us_fed_official_peaks = us_fed_officials.max(numeric_only=True)
tfi_ranked = us_fed_official_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

paying          1.000000
defend          0.887244
bluster         0.883575
implicit        0.863814
alternative     0.839260
prefers         0.819592
happens         0.812644
wellinformed    0.807922
great           0.781943
minds           0.768854
gotten          0.754528
know            0.743922
hoped           0.743668
deceptions      0.716107
options         0.715653
lick            0.711316
table           0.698456
tomorrow        0.685133
interview       0.675654
agreement       0.668266
bite            0.664918
isolated        0.661963
resumption      0.657528
dubbing         0.653769
intrusive       0.651412
reneges         0.647139
day             0.640593
conducted       0.639344
contingency     0.634972
talk            0.633281
dtype: float64

In [20]:
# Candidate distinguishing terms for US defense sources: the highest TF-IDF
# weight each term reaches in any single quote from that group, ranked from
# largest to smallest.
is_us_defense = tfidf['Speaker Group'].eq("US Defense")
us_defense = tfidf.loc[is_us_defense]
us_defense_peaks = us_defense.max(numeric_only=True)
tfi_ranked = us_defense_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

investigation    1.000000
received         0.837700
equivalent       0.720791
grade            0.693153
various          0.687094
assessed         0.649041
wind             0.639409
path             0.615542
enormously       0.614601
risky            0.614601
type             0.573840
well             0.546131
actually         0.544800
savings          0.539097
cosigner         0.514724
absent           0.514724
identified       0.511457
air              0.502331
atlantic         0.488800
backyard         0.487568
think            0.466125
unsuccessful     0.454873
answered         0.447963
warships         0.443642
signs            0.443642
category         0.443130
cost             0.436566
thing            0.435653
authority        0.434109
shelter          0.433159
dtype: float64

In [21]:
# Candidate distinguishing terms for academic sources: the highest TF-IDF
# weight each term reaches in any single quote from that group, ranked from
# largest to smallest.
is_academic = tfidf['Speaker Group'].eq("Academic")
academic = tfidf.loc[is_academic]
academic_peaks = academic.max(numeric_only=True)
tfi_ranked = academic_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

option             0.719510
pushing            0.710065
peaceable          0.617026
historic           0.610801
matter             0.591662
negotiable         0.582871
disagreement       0.560522
ending             0.545535
board              0.543186
jonguns            0.533339
mouth              0.533339
hear               0.486593
one                0.478327
accomplishments    0.468567
fourday            0.468567
seeing             0.461615
advocates          0.451971
feels              0.444543
fails              0.434641
storages           0.431631
comfort            0.431631
proud              0.427498
possible           0.423146
containment        0.421199
climate            0.417876
nose               0.417709
punched            0.417709
think              0.412474
cripple            0.402139
mideast            0.394963
dtype: float64

In [22]:
# Candidate distinguishing terms for "Other" sources: the highest TF-IDF
# weight each term reaches in any single quote from that group, ranked from
# largest to smallest.
is_other = tfidf['Speaker Group'].eq("Other")
other = tfidf.loc[is_other]
other_peaks = other.max(numeric_only=True)
tfi_ranked = other_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

paint           0.794893
react           0.706452
applicable      0.681660
dime            0.669559
attackers       0.631899
food            0.619189
toll            0.615266
pivot           0.610873
case            0.606750
plant           0.598790
definitely      0.591675
short           0.591047
polish          0.588361
breach          0.584448
demolition      0.576056
stopped         0.571164
gunpoint        0.569454
technicians     0.569454
duty            0.566823
bash            0.546497
didnt           0.540838
multilateral    0.538377
successful      0.530499
combat          0.528231
hypersonic      0.524851
religious       0.521386
anything        0.520133
stands          0.515616
special         0.514457
madman          0.502950
dtype: float64

In [23]:
# Candidate distinguishing terms for international sources: the highest TF-IDF
# weight each term reaches in any single quote from that group, ranked from
# largest to smallest.
is_international = tfidf['Speaker Group'].eq("International")
international = tfidf.loc[is_international]
international_peaks = international.max(numeric_only=True)
tfi_ranked = international_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

limited        0.787152
organized      0.775450
hopeful        0.752655
im             0.658415
assessment     0.637129
progress       0.616759
regrettably    0.605527
ongoing        0.605464
practices      0.598653
discussions    0.589676
rd             0.567960
provide        0.564787
enter          0.558413
repository     0.537267
knowledge      0.537267
position       0.524482
financial      0.520706
agency         0.516637
fakhrizadeh    0.509720
solidly        0.509554
phase          0.509469
offers         0.490361
arrived        0.483835
casualties     0.466594
ballistic      0.457907
chamber        0.456653
blasts         0.456653
procurement    0.449869
counterpart    0.448703
industrial     0.448703
dtype: float64

In [24]:
# Baseline across every speaker group: the highest TF-IDF weight each term
# reaches in any single quote, ranked from largest to smallest.
all_speaker_types = tfidf
overall_peaks = all_speaker_types.max(numeric_only=True)
tfi_ranked = overall_peaks.sort_values(ascending=False)
tfi_ranked.head(30)

regrettable      1.000000
fine             1.000000
transparency     1.000000
information      1.000000
paying           1.000000
balanced         1.000000
investigation    1.000000
true             0.905471
defend           0.887244
bluster          0.883575
supports         0.863876
implicit         0.863814
motive           0.856162
accident         0.844476
alternative      0.839260
received         0.837700
deterrence       0.833825
prefers          0.819592
happens          0.812644
wellinformed     0.807922
capability       0.805402
alliance         0.796751
reporting        0.796481
paint            0.794893
limited          0.787152
anyway           0.786807
senators         0.784598
great            0.781943
organized        0.775450
ambitious        0.771486
dtype: float64