In [2]:
import numpy as np
import pandas as pd
import json
import csv
import re
import seaborn as sns
import matplotlib.pyplot as plt

## Background
With our previous attempt with TD-IDF, it seemed that perhaps the context was pulling in too much noise. So this is an attempt to use accurate speaker type info by selecting the "comma addendum" info in sentences that have a <"name">, <"speaker type info">, <"rest of sentence">. This does leave out a lot of sentences that don't follow that particular format, and not all sentences with this format actually have the speaker info contained in the addendum, but it seemed to be the pattern with the most universally accurate speaker type info. At the end are some ideas on why the TD-IDF results for some groups look very unrelated to the subject, and how we could address them. 

## Create Dataset

In [3]:
#load in datasets
qtes = pd.read_csv("GNI88.csv")
arts = pd.read_json("gni88.json", lines=True)

In [27]:
#merge into dataset with just information we want
#columns: Article ID, Source Name, Source Type, Headline, Content
source = qtes[['Article ID', 'Source Name', 'Source Type']]
df = source.merge(arts, on='Article ID', how='inner')
df.drop(['Media Name','Author','Published Date'], axis=1, inplace=True)
df.drop_duplicates(subset='Source Name', inplace=True)

In [33]:
#filter out non-names, clean source name
nonname_flags = ['Unnamed', 'Unknown', 'Official', 'Spokesperson', 'Statement', 'Foreign', 'Lawmaker',
                 'Military','Navy','Advocacy','Journal', 'Analyst', 'Report', 'News', 'Critics', 'Assessment',
                 'Government','Agency', 'Agencies', 'Ministry','Department']

df = df[~df['Source Name'].str.contains('|'.join(nonname_flags), na=False)]
df['Source Name'] = df['Source Name'].str.replace(r'\(.*\)', '', regex=True) #remove (Constituent), (Biden Administration), etc.
df['Source Name'] = df['Source Name'].str.replace('OLD', '') #remove "OLD"
df['Source Name'] = df['Source Name'].str.replace('\n', '')

In [35]:
# lookup dictionary which can convert a speaker type to its classification group in constant time
source_to_group = {'Foreign Gov/Mil Official': "Foreign Government",
                   'Media/Journalist': "External Commentator",
                   'Analyst/Commentator': "External Commentator",
                   'Citizen': "External Commentator",
                   'Blogger': "External Commentator",
                   'Public Polling': "External Commentator",
                   'Partisans/Fmr. Politicians': "External Commentator",
                   'Nuke Organization': "Organization",
                   'International Orgs': "Organization",
                   'Non-Profit/NGO': "Organization",
                   'Think Tanks': "Organization",
                   'Nuke Organization - Other': "Organization",
                   'US Rep. & Staff': "US Congress",
                   'US Senate & Staff': "US Congress",
                   'Federal Official': "US Federal Officials",
                   'State/Local Official': "US Federal Officials",
                   'Judicial Official': "US Federal Officials", 
                   'Former Admin. Officials': "US Federal Officials", 
                   'Regulator': "US Federal Officials",
                   'US Military': "US Defense",
                   'Defense Forces': "US Defense",
                   'Defense': "US Defense",
                   'US Police': "US Defense",
                   'Deputy': "US Defense",
                   'Academic': "Academic",
                   'Nuke Organization - Academic': "Academic",
                   'Nuclear Scientist': "Academic",
                   'Other': "Other",
                   'Chairman': "Other",
                   'Terrorist/Extremist': "Other",
                   'Corporate Official': "Other",
                   'Information minister': "Other",
                   'Religious/Clerical': "Other",
                   'Attorney': "Other", 
                   'Ambassador': "Other", 
                   'Nuclear Official': "Other"
                  }

def assign_source_to_group(source_type):
  """Inputs:
     - source_type: str, source type from quote datafame
     Outputs:
     - str of the bigger speaker category to which source_type belongs"""
  if type(source_type) != str:
    return "Other"
  else:
    return source_to_group[source_type]

In [36]:
# creating a column for label information
df["speaker_group"] = df.apply(lambda row: assign_source_to_group(row["Source Type"]),axis=1)

In [138]:
df.iloc[4000:4005, :]

Unnamed: 0,Article ID,Source Name,Source Type,Headline,Content,speaker_group
48812,5175368,Drew Ivers,Partisans/Fmr. Politicians,Paul is adamant skeptic of US foreign power ; ...,National\r\nPaul is adamant skeptic of US fore...,External Commentator
48821,5175384,Kim Jin Moo,Analyst/Commentator,NKorea refuses to let SKoreans enter joint fac...,All Rights Reserved\r\n \r\n\r\n \r\n\r\n9 of ...,External Commentator
48825,5175384,Hwang Jihwan,Analyst/Commentator,NKorea refuses to let SKoreans enter joint fac...,All Rights Reserved\r\n \r\n\r\n \r\n\r\n9 of ...,External Commentator
48891,5175665,Nickolas Roth,Nuke Organization,Threats and opportunities in nuclear security ...,Threats and opportunities in nuclear security ...,Organization
48924,5175751,Connie Pillich,US Rep. & Staff,The modern costs of the yesteryear bomb,The modern costs of the yesteryear bomb\r\nMed...,US Congress


## Extract Context

In [145]:
def find_sentence(article_id, name, sentence_num=0):
    try:
        text = df[df['Article ID'] == article_id]['Content'].iloc[0]
    except:
        #print("Article ID not found in dataset.")
        return ""
    text = text.lower()
    name = name.lower()
    # convert " to ' and ´ to ' to account for fancy names
    text = text.replace("\"","'")
    text = text.replace("-"," ")
    name = name.replace("´","'")
    name = name.replace("-", " ")
    try:
        sentence = [s+ '.' for s in text.split('.') if name in s][sentence_num]
        return sentence
    except:
        #print("Sentence not found for " + name)
        return ""

def extract_comma_addendum_context(article_id, name, sentence_num=0):
    name = name.lower()
    sentence = find_sentence(article_id, name, sentence_num=0)
    #only get info from <name>, <speaker info>, ...
    try:
        search_obj = re.search(name+',((?<=,)[^,]+(?=,))', sentence) #re.search(r',(?<=,)[^,]+(?=,)')
        return search_obj.group(1)#.group() #turns object into string
    except:
        return ""

In [137]:
#example
print(extract_comma_addendum_context(5175368, "Drew Ivers"))
print(extract_comma_addendum_context(5187052, "Sam Kermanian"))
print(extract_comma_addendum_context(5186981, "Tariq Rauf"))
print(extract_comma_addendum_context(10749378, "Larry Pfeiffer"))

 paul's iowa state chairman
 a senior adviser to the board of the iranian american jewish federation
 director of the disarmament and non proliferation programme at the stockholm international peace research institute
 a top continuity official in the obama administration


In [166]:
df_sample = df #.sample(n=10000)
df_sample = df_sample[~df_sample['Source Name'].str.isnumeric().fillna(True)] #take out non-strings
df_sample.head()

Unnamed: 0,Article ID,Source Name,Source Type,Headline,Content,speaker_group
0,3759306,Moon Jae-in,Foreign Gov/Mil Official,North Korea makes deals and threats,North Korea makes deals and threats\r\n\r\nMed...,Foreign Government
2,3759306,Ri Son Gwon,Foreign Gov/Mil Official,North Korea makes deals and threats,North Korea makes deals and threats\r\n\r\nMed...,Foreign Government
4,3759306,Cho Myoung- Gyon,Foreign Gov/Mil Official,North Korea makes deals and threats,North Korea makes deals and threats\r\n\r\nMed...,Foreign Government
6,3762524,Rob Soofer,Federal Official,"Yes, a cyberattack could spur the president to...","Yes, a cyberattack could spur the president to...",US Federal Officials
7,3762524,Paul Selva,US Military,"Yes, a cyberattack could spur the president to...","Yes, a cyberattack could spur the president to...",US Defense


In [167]:
df_sample['Context'] = df_sample.apply(lambda x: extract_comma_addendum_context(x['Article ID'], x['Source Name']), axis=1)
#extract_comma_addendum_context(df['Article ID'], df['Source Name'])
df_sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Article ID,Source Name,Source Type,Headline,Content,speaker_group,Context
0,3759306,Moon Jae-in,Foreign Gov/Mil Official,North Korea makes deals and threats,North Korea makes deals and threats\r\n\r\nMed...,Foreign Government,
2,3759306,Ri Son Gwon,Foreign Gov/Mil Official,North Korea makes deals and threats,North Korea makes deals and threats\r\n\r\nMed...,Foreign Government,
4,3759306,Cho Myoung- Gyon,Foreign Gov/Mil Official,North Korea makes deals and threats,North Korea makes deals and threats\r\n\r\nMed...,Foreign Government,
6,3762524,Rob Soofer,Federal Official,"Yes, a cyberattack could spur the president to...","Yes, a cyberattack could spur the president to...",US Federal Officials,
7,3762524,Paul Selva,US Military,"Yes, a cyberattack could spur the president to...","Yes, a cyberattack could spur the president to...",US Defense,the vice chairman of the joint chiefs
...,...,...,...,...,...,...,...
377528,50062894,Michael Morell,Former Admin. Officials,Analyzing the Climate Security Threat: Key Act...,Media: warontherocks\nAuthor: Erin Sikorsky\nD...,US Federal Officials,
377541,52757294,Toby Warden,Media/Journalist,The U.S.-Australian Alliance Needs a Strategy ...,Australia’s recent decision to acquire nuclear...,External Commentator,
377553,56290470,Sanne Verschuren,Media/Journalist,China’s Hypersonic Weapons Tests Don’t Have to...,"I don’t know if it’s quite a Sputnik moment, b...",External Commentator,
377585,5281465,Jonnathan Hunt,Media/Journalist,Why the arms race is still white hot decades a...,Media: The Washigton PostAuthor: Jonathan Hunt...,External Commentator,


In [168]:
df_context = df_sample[~(df_sample['Context'] == "")]
df_context

Unnamed: 0,Article ID,Source Name,Source Type,Headline,Content,speaker_group,Context
7,3762524,Paul Selva,US Military,"Yes, a cyberattack could spur the president to...","Yes, a cyberattack could spur the president to...",US Defense,the vice chairman of the joint chiefs
8,3762558,Lisa Foxen,Citizen,Missile-alert error reveals uncertainty about ...,All Rights Reserved\r\n\r\n ...,External Commentator,a social worker and mother of two young child...
14,3763339,Mazie Hirono,US Senate & Staff,Hawaii Lacked Safeguards in Mistaken Missile A...,Hawaii Lacked Safeguards in Mistaken Missile A...,US Congress,a democrat
59,3777512,Mark Wright,Federal Official,U.S. Test of a Missile Interceptor Fails Off t...,U.S. Test of a Missile Interceptor Fails Off t...,US Federal Officials,a spokesman for the missile defense agency
83,3784488,Hans Kristensen,Nuke Organization,Will the US sub its new cruise missile for Rus...,Will the US sub its new cruise missile for Rus...,Organization,director of the federation of american scient...
...,...,...,...,...,...,...,...
376725,5197280,Jaber Mohammad,Analyst/Commentator,Saudis bristle at Obama's outreach to Iran ; M...,COMMENTARY\r\nSaudis bristle at Obama's outrea...,External Commentator,a bahraini analyst
376770,5197899,John Dawber,US Military,An unblinking eye toward the sky ; The first s...,COMMENTARY\r\nAn unblinking eye toward the sky...,US Defense,commander of the patriot missile battalion or...
377066,5207733,Author Vincent Intondi,Academic,Common cause:African-American calls for nucle...,Ideas\r\nCommon cause:African-American calls f...,Academic,a professor of african american history at mo...
377080,5208991,Lord Hastings Ismay,International Orgs,Will NATO end with a whimper?; Pact members ar...,Commentary\r\nWill NATO end with a whimper?; P...,Organization,nato was formed 'to keep the russians out


## TD-IDF 
Code Credit: Daniel Chung

In [169]:
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvec = TfidfVectorizer()
sparse_tfidf = tfidfvec.fit_transform(df_context['Context']) #(pd.Series(["context"]))

In [170]:
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
tfidf = pd.DataFrame(sparse_tfidf.toarray(), columns=tfidfvec.get_feature_names())
tfidf['speaker_group'] = df_context["speaker_group"]
tfidf.head()

Unnamed: 0,00,01,02,03,05,06,07,08,09,10,...,yusof,zaghari,zapesochny,zarif,zecurion,zero,zhang,zone,zour,speaker_group
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [171]:
foreign_gov = tfidf[tfidf['speaker_group']=="Foreign Government"]
tfi_ranked = foreign_gov.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

shadow         0.752172
prefectural    0.494762
fukui          0.494762
former         0.493829
paris          0.491698
secretary      0.481590
wealth         0.469396
bb             0.469396
birmingham     0.469396
captain        0.431362
people         0.420317
foreign        0.416398
write          0.414981
exiled         0.414981
union          0.413269
strategy       0.410851
nation         0.399679
army           0.378380
politics       0.370989
european       0.370287
relations      0.361666
activist       0.360134
korean         0.353184
adviser        0.352862
helped         0.352438
letter         0.352438
staff          0.351315
japan          0.349388
management     0.345317
negotiating    0.343977
dtype: float64

In [172]:
external_commentator = tfidf[tfidf['speaker_group']=="External Commentator"]
tfi_ranked = external_commentator.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

senator           0.732699
teacher           0.622629
young             0.589495
spokeswoman       0.583505
resalat           0.581334
party             0.558458
british           0.551342
beckel            0.538962
bob               0.538962
english           0.514615
drs               0.514559
representative    0.500637
special           0.476622
state             0.469676
executive         0.460164
land              0.457837
sean              0.457733
hannity           0.457733
ambassador        0.451680
newspaper         0.445242
analysis          0.421034
naval             0.416719
geneva            0.404734
endowment         0.404734
korea             0.388842
editor            0.385275
north             0.382870
carnegie          0.381040
systems           0.378542
development       0.375195
dtype: float64

In [173]:
organization = tfidf[tfidf['speaker_group']=="Organization"]
tfi_ranked = organization.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

27                1.000000
cq                0.813338
suspect           0.773627
virtually         0.643274
joined            0.643274
another           0.633641
scientist         0.612775
accountability    0.590164
staff             0.581791
negotiations      0.567119
israeli           0.563280
business          0.537310
cimb              0.521672
lumpur            0.521672
kuala             0.521672
forum             0.511615
canyon            0.508479
trust             0.508479
committeeman      0.508008
alexandria        0.497654
aca               0.491951
war               0.484899
grand             0.483676
javad             0.483480
zarif             0.483480
story             0.483227
legal             0.469128
egypt             0.465503
pacific           0.463666
friends           0.463585
dtype: float64

In [174]:
us_congress = tfidf[tfidf['speaker_group']=="US Congress"]
tfi_ranked = us_congress.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

ambassador       0.825409
source           0.819957
former           0.564535
said             0.543837
all              0.408236
is               0.405245
that             0.260250
reassuring       0.229406
recap            0.229406
priorities       0.218216
ready            0.210276
part             0.210276
our              0.199086
post             0.191146
people           0.179956
the              0.178640
about            0.175702
told             0.173798
cnn              0.172016
command          0.164511
times            0.163228
us               0.154668
military         0.142277
american         0.141696
of               0.106975
and              0.077523
at               0.071533
establishment    0.000000
envoy            0.000000
equity           0.000000
dtype: float64

In [175]:
us_fed_officials = tfidf[tfidf['speaker_group']=="US Federal Officials"]
tfi_ranked = us_fed_officials.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

journalist      1.000000
63              1.000000
colorado        0.804992
tehran          0.657325
critical        0.618556
republican      0.593286
trump           0.537034
adviser         0.517157
norman          0.510445
analyst         0.506304
mustaqbal       0.504000
64              0.485545
lebanon         0.479416
hayat           0.477830
hanoi           0.462588
co              0.460577
founder         0.457728
ali             0.455855
retiree         0.454176
an              0.434257
ayatollah       0.433618
khamenei        0.433618
pacific         0.430235
acclaimed       0.430068
lights          0.430068
proprietor      0.430068
al              0.419944
arabia          0.414676
intelligence    0.413504
poet            0.409090
dtype: float64

In [161]:
us_defense = tfidf[tfidf['speaker_group']=="US Defense"]
tfi_ranked = us_defense.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

admiral        0.849665
vice           0.527323
college        0.412804
naval          0.385048
strategy       0.385048
war            0.374789
satellites     0.350007
associate      0.342482
named          0.332212
includes       0.332212
months         0.319586
ago            0.309792
four           0.309792
boeing         0.295025
policy         0.268827
professor      0.266354
division       0.258744
missile        0.226972
an             0.225842
that           0.208736
defense        0.184934
head           0.177878
and            0.172922
to             0.163548
at             0.160937
of             0.118443
the            0.106107
european       0.000000
evangelical    0.000000
estate         0.000000
dtype: float64

In [162]:
academic = tfidf[tfidf['speaker_group']=="Academic"]
tfi_ranked = academic.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

quinnipiac     0.900289
steinhauser    0.757345
paul           0.653015
eu             0.628158
israel         0.576239
ministry       0.537524
ambassador     0.474847
hayat          0.469602
tsinghua       0.457143
foreign        0.437809
university     0.435292
beijing        0.415282
arabia         0.404911
al             0.395834
saudi          0.381035
strategy       0.378676
newspaper      0.374831
columnist      0.350955
that           0.347145
director       0.340224
to             0.331621
at             0.316546
assign         0.291045
guilt          0.291045
initially      0.291045
might          0.291045
worried        0.291045
fellow         0.279320
while          0.276248
research       0.272324
dtype: float64

In [178]:
other = tfidf[tfidf['speaker_group']=="Other"]
tfi_ranked = other.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

judicial         0.606665
assessments      0.545539
budgetary        0.529562
fox              0.515233
christi          0.490586
pax              0.490586
catholic         0.466655
aviv             0.434776
tel              0.434776
news             0.428411
activist         0.425746
lives            0.419298
stockholm        0.405329
pro              0.401580
programme        0.369796
strategic        0.369123
proliferation    0.349010
non              0.343609
disarmament      0.343609
trump            0.333919
american         0.316201
analyst          0.309688
center           0.306947
peace            0.300911
senior           0.295042
president        0.278343
with             0.247963
international    0.241119
research         0.230971
institute        0.213317
dtype: float64

## Thoughts
- political names seem to simply be prefaced by their speaker type, especially if well-known
    - e.g. "chinese president xijingping", "british foreign secretary boris johnson"
    - idea: have list of most prominent politicians
    - idea: have key words like "Sen.", "Democrat", "Rep.", etc. 
- organization names may been seen as self-explanatory--need to look more into this
- if the first instance of name is prefaced by "said", the info after the comma is almost certainly the speaker info
- more possible formats:
    - <"name"> is... 
    - <"name">, who is...