# Topic Analysis - Re-format Schema 

Grace Chen

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [57]:
# Load the hand-labelled quote data; each row is one quote with its article metadata.
label_data = pd.read_csv(filepath_or_buffer="preliminary_data/GNI88.csv")
label_data.head(n=5)

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,Legacy Quote Tag,...,Source Religion,Legacy Source Tag,Constituent Group,Media Name,Media Medium,Journalist Name,Constituent Author,Article Issues,Custom Group,Media Group
0,2018-01-10,3759306,North Korea makes deals and threats,7023849,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Neutral,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,
1,2018-01-10,3759306,North Korea makes deals and threats,7023842,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Negative,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,
2,2018-01-10,3759306,North Korea makes deals and threats,7023839,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Negative,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,
3,2018-01-10,3759306,North Korea makes deals and threats,7023833,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Negative,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,
4,2018-01-10,3759306,North Korea makes deals and threats,7023846,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Positive,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,


In [58]:
# Distinct top-level labels, in order of first appearance (NaN = unlabelled quote).
pd.unique(label_data["Messages"])

array(['OLD Proliferation', 'OLD Arms Reduction', 'OLD Missile Defense',
       'OLD Spending', 'OLD Foreign Policy Narrative', nan, 'OLD Iran',
       'OLD Other', 'OLD Sequestration', 'US Nuclear and Missile Policy',
       'North Korea', 'Arms Reduction', 'Iran', 'China',
       'Nuclear Consequences', 'India', 'KSA', 'Russia', 'Pakistan',
       'Israel', 'Turkey'], dtype=object)

In [59]:
# Distinct second-level labels, in order of first appearance (NaN = unlabelled quote).
pd.unique(label_data["Submessages"])

array(['OLD State Level Nuclear Programs', 'OLD US Arsenal',
       'OLD Deployment and Testing', 'OLD Nukes Budget Campaign',
       'OLD Arms Reduction', 'OLD European/Russian Missile Defense',
       'OLD Domestic Programs', 'OLD  Politics', 'OLD Modernization',
       'OLD START Treaty', 'OLD Loose Nuclear Materials',
       'OLD Terrorist Threat', 'OLD Non-Proliferation',
       'OLD Russia Reset', 'OLD INF Treaty', nan, 'OLD NPT',
       'OLD Proliferation', 'OLD Tactical Nukes Treaty',
       'OLD Negotiations', 'OLD Sanctions',
       'OLD Iranian Domestic Politics', 'OLD Other', 'OLD CTBT',
       'OLD Missile Defense', 'OLD Covert Ops',
       'OLD Fissile Material Cut Off Treaty', 'OLD Iran Rapid Response',
       'OLD Spending', 'OLD Foreign Policy Narrative', 'OLD Iran',
       'OLD NSA Budget', 'OLD Sequestration', 'OLD AQ Khan Network',
       'Spending and Modernization', 'North Korea Deployments',
       'North Korea Negotiations', 'North Korea Nuclear Activity',
     

In [60]:
# Break the "YYYY-MM-DD" article date into string parts, then cast year and
# month to integers so they can be compared numerically (day stays a string).
label_data[["year", "month", "day"]] = label_data["artdate"].str.split("-", expand=True)
label_data = label_data.astype({"year": "int32", "month": "int32"})

# Keep only quotes dated on or after April 2021.
# NOTE(review): a plain "year > 2020 and month >= 4" test would also drop
# January-March of every later year; this assumes a single start date of
# 2021-04 is what is wanted -- confirm against the data's date range.
on_or_after_cutoff = (label_data["year"] > 2021) | (
    (label_data["year"] == 2021) & (label_data["month"] >= 4)
)
# .copy() detaches the result from the unfiltered frame so that adding columns
# to it later does not raise SettingWithCopyWarning.
label_data = label_data[on_or_after_cutoff].copy()
print(label_data.shape)
label_data.head()

(16614, 31)


Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,Legacy Quote Tag,...,Media Name,Media Medium,Journalist Name,Constituent Author,Article Issues,Custom Group,Media Group,year,month,day
19572,2021-04-02,34716952,"Pakistan, India peace move silences deadly Kas...",58664882,News,Neutral,,,,,...,Associated Press Newswires,,By AIJAZ HUSSAIN and SHEIKH SAALIQ,,"India, Pakistan",India,,2021,4,2
19573,2021-04-02,34752219,"Iran, major powers in the agreement to keep Te...",58487896,News,Neutral,,,,,...,Associated Press Newswires,,"No By-Line,",,Iran,Iran,,2021,4,2
19574,2021-04-03,34970673,Vandenberg likely to host training unit for ne...,58465753,News,Neutral,US Nuclear and Missile Policy,US Nuclear Arsenal,Neutral,,...,Defense News,,Rachel S . Cohen,,,,,2021,4,3
19575,2021-04-03,34970673,Vandenberg likely to host training unit for ne...,58465751,News,Neutral,US Nuclear and Missile Policy,US Nuclear Arsenal,Neutral,,...,Defense News,,Rachel S . Cohen,,,,,2021,4,3
19576,2021-04-03,34970673,Vandenberg likely to host training unit for ne...,58465752,News,Neutral,US Nuclear and Missile Policy,US Nuclear Arsenal,Neutral,,...,Defense News,,Rachel S . Cohen,,,,,2021,4,3


In [61]:
def re_schema_for_country(row):
    """Collapse a quote's ``Messages``/``Submessages`` labels into one new-schema label.

    Parameters
    ----------
    row : mapping
        Any object indexable by ``"Messages"`` and ``"Submessages"``, e.g. the
        row Series handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        * ``""`` when ``Messages`` is a float (missing labels show up as NaN),
          so the caller can filter unlabelled quotes out.
        * A country name or ``"US Nuclear and Missile Policy"`` when one is
          mentioned in ``Messages`` (or, for the second country group, in
          ``Submessages``).
        * ``"Treaties"``, ``"Arms Reduction"``, ``"US Nuclear and Missile Policy"``
          or ``"Nuclear consequences"`` according to ``Submessages``.
        * ``"Other"`` when nothing matches.
    """
    message = row["Messages"]
    submessage = row["Submessages"]
    # Submessages may be NaN (a float) even when Messages is text; substring
    # tests on a float raise TypeError, so only inspect it when it is a string.
    submessage_is_text = isinstance(submessage, str)

    if isinstance(message, float):
        return ""

    if isinstance(message, str):
        # India, Pakistan, Saudi Arabia (KSA), Turkey, then the US policy
        # bucket: decided by Messages alone, first match wins.
        message_only_labels = (
            "India",
            "Pakistan",
            "KSA",
            "Turkey",
            "US Nuclear and Missile Policy",
        )
        for label in message_only_labels:
            if label in message:
                return label

        # Iran/Russia/Israel/China/North Korea: may be named in either field.
        # Countries are tried in list order and, for each one, Messages is
        # checked before Submessages -- so an earlier-listed country found in
        # Submessages wins over a later-listed one found in Messages.
        for country in ("Iran", "Russia", "Israel", "China", "North Korea"):
            if country in message:
                return country
            if submessage_is_text and country in submessage:
                return country

    if submessage_is_text:
        # Treaties (exact Submessages names).
        treaty_names = (
            "JCPOA",
            "START",
            "INF",
            "NPT",
            "CTBT",
            "Tactical Nukes Treaty",
            "TPNW 2017",
        )
        if submessage in treaty_names:
            return "Treaties"

        # Other collapsing.
        if submessage == "Worldwide Nukes Policy":
            return "Arms Reduction"

        us_policy_names = (
            "NATO Arsenal",
            "US/NATO Missile Defense",
            "NNSA Budget",
            "Spending and Modernization",
        )
        if submessage in us_policy_names or "US Nuclear and Missile Policy" in submessage:
            return "US Nuclear and Missile Policy"

        consequence_names = (
            "Loose Fissile Materials",
            "Health Issues",
            "Actinides",
            "Nuclear Consequences",
            "Terrorist Threat",
            "Nuclear Survivors",
        )
        if submessage in consequence_names or "Fissile Material" in submessage:
            return "Nuclear consequences"

    return "Other"


In [62]:
# Label every quote row-by-row with its collapsed new-schema message.
label_data["New Message"] = label_data.apply(func=re_schema_for_country, axis="columns")
label_data.head(n=5)

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,Legacy Quote Tag,...,Media Medium,Journalist Name,Constituent Author,Article Issues,Custom Group,Media Group,year,month,day,New Message
19572,2021-04-02,34716952,"Pakistan, India peace move silences deadly Kas...",58664882,News,Neutral,,,,,...,,By AIJAZ HUSSAIN and SHEIKH SAALIQ,,"India, Pakistan",India,,2021,4,2,
19573,2021-04-02,34752219,"Iran, major powers in the agreement to keep Te...",58487896,News,Neutral,,,,,...,,"No By-Line,",,Iran,Iran,,2021,4,2,
19574,2021-04-03,34970673,Vandenberg likely to host training unit for ne...,58465753,News,Neutral,US Nuclear and Missile Policy,US Nuclear Arsenal,Neutral,,...,,Rachel S . Cohen,,,,,2021,4,3,US Nuclear and Missile Policy
19575,2021-04-03,34970673,Vandenberg likely to host training unit for ne...,58465751,News,Neutral,US Nuclear and Missile Policy,US Nuclear Arsenal,Neutral,,...,,Rachel S . Cohen,,,,,2021,4,3,US Nuclear and Missile Policy
19576,2021-04-03,34970673,Vandenberg likely to host training unit for ne...,58465752,News,Neutral,US Nuclear and Missile Policy,US Nuclear Arsenal,Neutral,,...,,Rachel S . Cohen,,,,,2021,4,3,US Nuclear and Missile Policy


In [63]:
# Number of quotes that landed in each new-schema label ("" = unlabelled).
label_data.groupby("New Message")[["Quote ID"]].count()

Unnamed: 0_level_0,Quote ID
New Message,Unnamed: 1_level_1
,355
Arms Reduction,1628
China,1498
India,30
Iran,5370
Israel,120
KSA,24
North Korea,2338
Nuclear consequences,333
Pakistan,91


In [64]:
# Sanity check: list any quotes that fell through every rule and became "Other".
label_data.loc[label_data["New Message"].eq("Other")]

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,Legacy Quote Tag,...,Media Medium,Journalist Name,Constituent Author,Article Issues,Custom Group,Media Group,year,month,day,New Message


In [66]:
# Discard quotes with no original Messages label (they were mapped to "").
label_data = label_data.loc[label_data["New Message"].ne("")]
label_data.head(n=5)

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,Legacy Quote Tag,...,Media Medium,Journalist Name,Constituent Author,Article Issues,Custom Group,Media Group,year,month,day,New Message
19574,2021-04-03,34970673,Vandenberg likely to host training unit for ne...,58465753,News,Neutral,US Nuclear and Missile Policy,US Nuclear Arsenal,Neutral,,...,,Rachel S . Cohen,,,,,2021,4,3,US Nuclear and Missile Policy
19575,2021-04-03,34970673,Vandenberg likely to host training unit for ne...,58465751,News,Neutral,US Nuclear and Missile Policy,US Nuclear Arsenal,Neutral,,...,,Rachel S . Cohen,,,,,2021,4,3,US Nuclear and Missile Policy
19576,2021-04-03,34970673,Vandenberg likely to host training unit for ne...,58465752,News,Neutral,US Nuclear and Missile Policy,US Nuclear Arsenal,Neutral,,...,,Rachel S . Cohen,,,,,2021,4,3,US Nuclear and Missile Policy
19577,2021-04-03,34970674,Iran opposes any 'step-by-step' easing of US s...,58671291,News,Neutral,Arms Reduction,Iran Negotiations,Negative,,...,,Tal Axelrod,,,Iran,Capitol Hill Media,2021,4,3,Iran
19578,2021-04-03,34970674,Iran opposes any 'step-by-step' easing of US s...,58465761,News,Neutral,Arms Reduction,Iran Sanctions,Neutral,,...,,Tal Axelrod,,,Iran,Capitol Hill Media,2021,4,3,Iran


In [67]:
# Save the relabelled quotes. The DataFrame index is written as an unnamed
# first column (pandas default), so it will come back as "Unnamed: 0" on re-read.
label_data.to_csv(path_or_buf="new_schema.csv")