In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [2]:
# Load the treaty -> most-common-message lookup table from the extension-less
# file 'treaty_message_table' (path is relative to the working directory).
treaty_message_table = pd.read_csv('treaty_message_table')

In [3]:
treaty_message_table

Unnamed: 0.1,Unnamed: 0,treaty,most common message
0,0,the nuclear nonproliferation treaty,OLD Iran
1,1,the nonproliferation treaty,OLD Iran
2,2,the north atlantic treaty,OLD Missile Defense
3,3,the new start treaty,OLD Arms Reduction
4,4,the intermediaterange nuclear forces treaty,OLD Arms Reduction
...,...,...,...
127,127,the arms trade treaty,OLD Missile Defense
128,128,the panama canal treaty,OLD Iran
129,129,the iran nuclear treaty,OLD Proliferation
130,130,the oslo treaty,OLD Proliferation


In [4]:
# Load the merged article/quote table from the extension-less file 'merged_df'
# (path is relative to the working directory).
# NOTE(review): the 'Unnamed: 0' column looks like an index written by an
# earlier to_csv call; the counting helpers further down rename that column to
# 'count', so confirm they no longer need it before dropping it here.
merged_df = pd.read_csv('merged_df')

In [5]:
merged_df.head()

Unnamed: 0.1,Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,...,Article Issues,Custom Group,Media Group,Content,Author,Published Date,First_10%_Content,First_10%_Content_Split,Treaty_Instances,Treaties2
0,0,2018-01-10,3759306,North Korea makes deals and threats,7023849,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Neutral,...,,,,North Korea makes deals and threats\r\n\r\nMed...,"Fifield, Anna",2018-01-10 00:00:00+00,SEOUL - North Korea's representatives assured ...,"['SEOUL', '-', 'North', ""Korea's"", 'representa...",[],
1,1,2018-01-10,3759306,North Korea makes deals and threats,7023842,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Negative,...,,,,North Korea makes deals and threats\r\n\r\nMed...,"Fifield, Anna",2018-01-10 00:00:00+00,SEOUL - North Korea's representatives assured ...,"['SEOUL', '-', 'North', ""Korea's"", 'representa...",[],
2,2,2018-01-10,3759306,North Korea makes deals and threats,7023839,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Negative,...,,,,North Korea makes deals and threats\r\n\r\nMed...,"Fifield, Anna",2018-01-10 00:00:00+00,SEOUL - North Korea's representatives assured ...,"['SEOUL', '-', 'North', ""Korea's"", 'representa...",[],
3,3,2018-01-10,3759306,North Korea makes deals and threats,7023833,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Negative,...,,,,North Korea makes deals and threats\r\n\r\nMed...,"Fifield, Anna",2018-01-10 00:00:00+00,SEOUL - North Korea's representatives assured ...,"['SEOUL', '-', 'North', ""Korea's"", 'representa...",[],
4,4,2018-01-10,3759306,North Korea makes deals and threats,7023846,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Positive,...,,,,North Korea makes deals and threats\r\n\r\nMed...,"Fifield, Anna",2018-01-10 00:00:00+00,SEOUL - North Korea's representatives assured ...,"['SEOUL', '-', 'North', ""Korea's"", 'representa...",[],


In [6]:
# Keep only the rows that mention a treaty (non-missing 'Treaties2').
treaty_articles = merged_df[merged_df['Treaties2'].notna()]

In [7]:
# Restrict to treaties that have an entry in the treaty -> message lookup table.
has_known_treaty = treaty_articles['Treaties2'].isin(treaty_message_table['treaty'])
treaty_articles = treaty_articles[has_known_treaty]

In [8]:
# Predict, for every treaty-mentioning row, the most common message of its
# treaty. A single vectorised lookup replaces the row-by-row loop, which rebuilt
# treaty_articles.reset_index() on every iteration (quadratic in the row count).
message_by_treaty = treaty_message_table.set_index('treaty')['most common message']
# map() aligns on the treaty name; it raises if a treaty appears more than once
# in the lookup table, mirroring the .item() failure of the original loop.
predicted_message = treaty_articles['Treaties2'].map(message_by_treaty).tolist()
    

In [9]:
# Side-by-side table of the labelled message and the treaty-based prediction,
# re-indexed 0..n-1 so it lines up with the predicted_message list.
true_predicted_message_table = (
    treaty_articles[['Messages']]
    .reset_index(drop=True)
    .rename(columns={'Messages': 'true_message'})
    .assign(predicted_message=predicted_message)
)

In [10]:
true_predicted_message_table

Unnamed: 0,true_message,predicted_message
0,OLD Arms Reduction,OLD Arms Reduction
1,OLD Arms Reduction,OLD Arms Reduction
2,OLD Arms Reduction,OLD Arms Reduction
3,OLD Proliferation,OLD Iran
4,OLD Proliferation,OLD Iran
...,...,...
10584,OLD Arms Reduction,OLD Arms Reduction
10585,OLD Proliferation,OLD Arms Reduction
10586,OLD Missile Defense,OLD Missile Defense
10587,OLD Arms Reduction,OLD Missile Defense


In [11]:
# Accuracy = share of rows whose predicted message equals the labelled message.
scored_messages = true_predicted_message_table
is_hit = scored_messages['true_message'] == scored_messages['predicted_message']
correct = sum(is_hit)
total = len(scored_messages)
accuracy = correct / total
accuracy

0.5761639437151761

# relationship between Treaty and Topic (submessages)

In [12]:
# Number of distinct treaties mentioned (missing values are not counted).
merged_df['Treaties2'].nunique()

134

In [13]:
# Two-column (treaty, submessage) slice of merged_df.
# NOTE(review): no later cell visible in this notebook reads this variable, and
# its name looks like a typo of "treaties" — candidate for removal; confirm.
submessages_treates = merged_df[['Treaties2', 'Submessages']]

In [14]:
merged_df['Submessages'].value_counts()

OLD Negotiations                        87980
OLD State Level Nuclear Programs        73337
OLD Sanctions                           29007
OLD Iranian Domestic Politics           23389
OLD Deployment and Testing              17811
OLD Russia Reset                        10396
OLD Domestic Programs                    7774
OLD European/Russian Missile Defense     7164
OLD  Politics                            5967
OLD US Arsenal                           4994
OLD Covert Ops                           4368
OLD Iran Rapid Response                  4208
OLD Iran                                 4190
OLD Loose Nuclear Materials              3064
OLD Arms Reduction                       2754
OLD START Treaty                         2681
OLD Modernization                        2160
OLD Nukes Budget Campaign                1620
OLD Terrorist Threat                     1565
OLD Foreign Policy Narrative             1533
OLD Other                                 779
OLD NPT                           

In [15]:
def submessage_treaty_table(treaty, topic_column, df=None):
    """Count the rows mentioning ``treaty`` under each value of ``topic_column``.

    Parameters
    ----------
    treaty : str
        Value matched exactly against the 'Treaties2' column.
    topic_column : str
        Name of the column to group by (e.g. 'Submessages').
    df : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``merged_df``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``topic_column`` and 'count', one row per topic value in
        sorted order. Rows whose topic is missing are not counted.
    """
    if df is None:
        df = merged_df
    mentions = df.loc[df['Treaties2'] == treaty]
    # size() counts rows per group directly. The previous version took count()
    # of every column and kept the first one, which only produced a 'count'
    # column when that first column was named 'Unnamed: 0' and had no gaps.
    return mentions.groupby(by=topic_column).size().reset_index(name='count')

In [16]:
submessage_treaty_table('the nuclear nonproliferation treaty', 'Submessages')

Unnamed: 0,Submessages,count
0,NPT,1
1,OLD Politics,11
2,OLD AQ Khan Network,2
3,OLD Arms Reduction,117
4,OLD CTBT,5
5,OLD Covert Ops,16
6,OLD Deployment and Testing,48
7,OLD Domestic Programs,6
8,OLD European/Russian Missile Defense,22
9,OLD Fissile Material Cut Off Treaty,3


In [17]:
# One row per treaty with the number of rows mentioning it, most frequent first.
# (An optional "more than 50 mentions" filter used to be sketched here as
# commented-out code; it was never applied.)
treaties_unique = merged_df['Treaties2'].value_counts().to_frame()
treaties_unique

Unnamed: 0,Treaties2
the nuclear nonproliferation treaty,4043
the nonproliferation treaty,879
the north atlantic treaty,741
the new start treaty,545
the intermediaterange nuclear forces treaty,543
...,...
the panama canal treaty,1
the iran nuclear treaty,1
the oslo treaty,1
the organization's founding washington treaty,1


# find proportion of articles with the given treaty in the different Submessages


In [18]:
def submessage_treaty_props(treaty, topic_column, df=None):
    """Share of the rows mentioning ``treaty`` that fall under each topic value.

    Parameters
    ----------
    treaty : str
        Value matched exactly against the 'Treaties2' column.
    topic_column : str
        Name of the column to group by (e.g. 'Submessages').
    df : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``merged_df``.

    Returns
    -------
    pandas.DataFrame
        Columns 'treaty', ``topic_column`` and 'props', one row per topic
        value. 'props' sums to 1 over the rows; the frame is empty when the
        treaty has no row with a non-missing topic.
    """
    if df is None:
        df = merged_df
    # size() counts rows per topic without depending on which column happens
    # to come first in the frame or on that column being free of gaps.
    counts = df.loc[df['Treaties2'] == treaty].groupby(by=topic_column).size()
    # Vectorised normalisation; for an empty selection this stays empty.
    table = (counts / counts.sum()).reset_index(name='props')
    table['treaty'] = treaty
    return table[['treaty', topic_column, 'props']]



In [19]:
# Treaty names ordered most -> least common: this is the index of the
# value_counts() frame, and value_counts() sorts by descending count by default.
treaties_list = treaties_unique.index

In [20]:
# Build one single-row table per treaty (row = treaty, columns = Submessages,
# values = share of that treaty's rows) and stack them with ONE concat.
# Concatenating inside the loop re-copied the growing frame on every pass and
# needed a duplicated "first treaty" step to seed it.
per_treaty_rows = []
for treaty_name in treaties_list:
    props_long = submessage_treaty_props(treaty_name, 'Submessages')
    per_treaty_rows.append(
        pd.pivot_table(props_long, values='props', index=['treaty'], columns=['Submessages'])
    )
treaty_submessage_table = pd.concat(per_treaty_rows)

In [21]:
# Show every row for this one display only. pd.set_option would change
# 'display.max_rows' for the rest of the session and so alter how every later
# table is rendered; option_context restores the previous value on exit.
# `display` is the function the Jupyter/IPython kernel provides for rich output.
with pd.option_context('display.max_rows', len(treaty_submessage_table)):
    display(treaty_submessage_table)

Submessages,NPT,OLD Politics,OLD AQ Khan Network,OLD Arms Reduction,OLD CTBT,OLD Covert Ops,OLD Deployment and Testing,OLD Domestic Programs,OLD European/Russian Missile Defense,OLD Fissile Material Cut Off Treaty,...,OLD Sanctions,OLD State Level Nuclear Programs,OLD Tactical Nukes Treaty,OLD Terrorist Threat,OLD US Arsenal,US Nuclear Arsenal,OLD INF Treaty,OLD Spending,OLD NSA Budget,OLD Sequestration
treaty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the nuclear nonproliferation treaty,0.000247,0.002721,0.000495,0.028939,0.001237,0.003957,0.011872,0.001484,0.005442,0.000742,...,0.054662,0.245115,0.003463,0.00371,0.009646,0.000247,,,,
the nonproliferation treaty,,0.003417,,0.053531,,0.003417,,0.003417,0.001139,,...,0.048975,0.216401,,,0.005695,,,,,
the north atlantic treaty,,0.096467,,0.004076,,0.002717,0.058424,0.008152,0.326087,,...,0.027174,0.142663,0.024457,0.001359,0.032609,,0.001359,0.017663,,
the new start treaty,,0.031193,,0.066055,0.005505,,0.007339,0.016514,0.13945,,...,0.00367,0.033028,0.00367,0.005505,0.102752,,,,0.023853,0.007339
the intermediaterange nuclear forces treaty,,0.037244,,0.178771,,,0.022346,0.050279,0.122905,,...,0.003724,0.117318,,,0.054004,,0.040968,,,
the treaty,,0.017045,,0.034091,0.071023,,0.028409,0.002841,0.025568,0.014205,...,0.028409,0.269886,0.053977,,0.045455,,,,,
the comprehensive test ban treaty,,0.002959,,0.065089,0.556213,,0.026627,0.005917,0.014793,,...,0.002959,0.147929,,0.005917,0.011834,,,,0.002959,
the antiballistic missile treaty,,0.043137,,0.086275,,,0.031373,0.168627,0.282353,,...,,0.035294,,,0.035294,,,,,
the inf treaty,,0.004878,,0.160976,,,0.004878,0.087805,0.180488,,...,0.004878,0.063415,0.004878,0.004878,0.068293,,0.204878,,,
the comprehensive nucleartestban treaty,,,,,0.08982,,0.053892,,,0.005988,...,,0.664671,,,0.11976,,,,,


In [22]:
# For each treaty (row), the Submessages column holding the largest share.
# idxmax(axis=1) does this in one call; the previous per-row
# pd.DataFrame(...).idxmax()[0] relied on `[0]` falling back to positional
# access on a label-indexed Series, which pandas has deprecated.
most_common_submessage = treaty_submessage_table.idxmax(axis=1).tolist()

In [23]:
# Attach each treaty's winning submessage as a new column.
# NOTE(review): this mutates treaty_submessage_table in place, so the cell that
# computes most_common_submessage should not be re-run after this one without
# rebuilding the table first — run the section top to bottom.
treaty_submessage_table['most common submessage'] = most_common_submessage

In [24]:
# Collapse to a plain two-column lookup (treaty, most common submessage) and
# drop the leftover 'Submessages' label from the column axis.
treaty_submessage_table = (
    treaty_submessage_table
    .reset_index()
    .loc[:, ['treaty', 'most common submessage']]
    .rename_axis(None, axis=1)
)

In [25]:
treaty_submessage_table

Unnamed: 0,treaty,most common submessage
0,the nuclear nonproliferation treaty,OLD Negotiations
1,the nonproliferation treaty,OLD Negotiations
2,the north atlantic treaty,OLD European/Russian Missile Defense
3,the new start treaty,OLD START Treaty
4,the intermediaterange nuclear forces treaty,OLD START Treaty
5,the treaty,OLD State Level Nuclear Programs
6,the comprehensive test ban treaty,OLD CTBT
7,the antiballistic missile treaty,OLD European/Russian Missile Defense
8,the inf treaty,OLD INF Treaty
9,the comprehensive nucleartestban treaty,OLD State Level Nuclear Programs


# accuracy of topic (submessages) classification based on treaty

In [26]:
# Start again from every row that mentions a treaty (non-missing 'Treaties2').
treaty_articles = merged_df[merged_df['Treaties2'].notna()]

In [27]:
# Restrict to treaties that have an entry in the treaty -> submessage lookup.
has_known_treaty = treaty_articles['Treaties2'].isin(treaty_submessage_table['treaty'])
treaty_articles = treaty_articles[has_known_treaty]

In [28]:
# Predict, for every treaty-mentioning row, the most common submessage of its
# treaty (looked up in treaty_submessage_table). A single vectorised lookup
# replaces the row-by-row loop, which rebuilt treaty_articles.reset_index() on
# every iteration (quadratic in the row count).
submessage_by_treaty = treaty_submessage_table.set_index('treaty')['most common submessage']
# map() aligns on the treaty name; it raises if a treaty appears more than once
# in the lookup table, mirroring the .item() failure of the original loop.
predicted_submessage = treaty_articles['Treaties2'].map(submessage_by_treaty).tolist()
    

In [29]:
# Side-by-side table of the labelled submessage and the treaty-based prediction,
# re-indexed 0..n-1 so it lines up with the predicted_submessage list.
true_predicted_submessage_table = (
    treaty_articles[['Submessages']]
    .reset_index(drop=True)
    .rename(columns={'Submessages': 'true_submessage'})
    .assign(predicted_submessage=predicted_submessage)
)

In [30]:
true_predicted_submessage_table

Unnamed: 0,true_submessage,predicted_submessage
0,OLD Arms Reduction,OLD Arms Reduction
1,OLD Arms Reduction,OLD Arms Reduction
2,OLD Arms Reduction,OLD Arms Reduction
3,OLD State Level Nuclear Programs,OLD Negotiations
4,OLD State Level Nuclear Programs,OLD Negotiations
...,...,...
10584,OLD START Treaty,OLD INF Treaty
10585,OLD State Level Nuclear Programs,OLD INF Treaty
10586,OLD Deployment and Testing,OLD European/Russian Missile Defense
10587,OLD INF Treaty,OLD European/Russian Missile Defense


In [31]:
# Accuracy = share of rows whose predicted submessage equals the labelled one.
scored_submessages = true_predicted_submessage_table
is_hit = scored_submessages['true_submessage'] == scored_submessages['predicted_submessage']
correct = sum(is_hit)
total = len(scored_submessages)
accuracy = correct / total
accuracy

0.4033430918878081

# Only data published from April 2020 onward

In [33]:
# Rows published on or after the start of April 2020.
# NOTE(review): 'Published Date' appears to be read as text (e.g.
# '2018-01-10 00:00:00+00'), so this is a lexicographic string comparison
# against a cutoff whose day is '00' (not a real calendar date). It behaves as
# intended only while every value keeps the same fixed-width format — confirm.
cutoff = '2020-04-00 00:00:00+00'
post_april_2020 = merged_df[merged_df['Published Date'] >= cutoff]

In [34]:
post_april_2020['Submessages'].unique()

array([nan, 'Loose Fissile Materials', 'US/NATO Missile Defense',
       'Conflict with North Korea', 'Health Issues',
       'OLD State Level Nuclear Programs', 'JCPOA', 'Iran Negotiations',
       'Spending and Modernization', 'US Nuclear Arsenal', 'NPT',
       'Worldwide Nukes Policy'], dtype=object)

Find the proportion of articles mentioning a given treaty that fall under each Submessage (data from April 2020 onward only).


In [35]:
def submessage_treaty_props2(treaty, topic_column, df=None):
    """Share of the rows mentioning ``treaty`` that fall under each topic value.

    Same computation as ``submessage_treaty_props`` but reading, by default,
    from the notebook-level ``post_april_2020`` frame.

    Parameters
    ----------
    treaty : str
        Value matched exactly against the 'Treaties2' column.
    topic_column : str
        Name of the column to group by (e.g. 'Submessages').
    df : pandas.DataFrame, optional
        Frame to read from. Defaults to ``post_april_2020``.

    Returns
    -------
    pandas.DataFrame
        Columns 'treaty', ``topic_column`` and 'props', one row per topic
        value. 'props' sums to 1 over the rows; the frame is empty when the
        treaty has no row with a non-missing topic.
    """
    if df is None:
        df = post_april_2020
    # size() counts rows per topic without depending on which column happens
    # to come first in the frame or on that column being free of gaps.
    counts = df.loc[df['Treaties2'] == treaty].groupby(by=topic_column).size()
    # Vectorised normalisation; for an empty selection this stays empty.
    table = (counts / counts.sum()).reset_index(name='props')
    table['treaty'] = treaty
    return table[['treaty', topic_column, 'props']]



In [36]:
# Build one single-row table per treaty from the post-April-2020 rows
# (row = treaty, columns = Submessages, values = share of that treaty's rows)
# and stack them with ONE concat. Concatenating inside the loop re-copied the
# growing frame on every pass and needed a duplicated seeding step.
per_treaty_rows2 = []
for treaty_name in treaties_list:
    props_long = submessage_treaty_props2(treaty_name, 'Submessages')
    per_treaty_rows2.append(
        pd.pivot_table(props_long, values='props', index=['treaty'], columns=['Submessages'])
    )
treaty_submessage_table2 = pd.concat(per_treaty_rows2)

In [37]:
# Show every row for this one display only. pd.set_option would leave
# 'display.max_rows' at len(treaty_submessage_table2) for the rest of the
# session, and with a very short table that truncates every later DataFrame
# display; option_context restores the previous value on exit.
# `display` is the function the Jupyter/IPython kernel provides for rich output.
with pd.option_context('display.max_rows', len(treaty_submessage_table2)):
    display(treaty_submessage_table2)

Submessages,NPT,US Nuclear Arsenal
treaty,Unnamed: 1_level_1,Unnamed: 2_level_1
the nuclear nonproliferation treaty,0.5,0.5


In [38]:
# For each treaty (row), the Submessages column holding the largest share;
# ties resolve to the first such column. idxmax(axis=1) does this in one call;
# the previous per-row pd.DataFrame(...).idxmax()[0] relied on `[0]` falling
# back to positional access on a label-indexed Series, which pandas has
# deprecated.
most_common_submessage2 = treaty_submessage_table2.idxmax(axis=1).tolist()

In [39]:
# Attach each treaty's winning submessage as a new column.
# NOTE(review): this mutates treaty_submessage_table2 in place, so the cell that
# computes most_common_submessage2 should not be re-run after this one without
# rebuilding the table first — run the section top to bottom.
treaty_submessage_table2['most common submessage'] = most_common_submessage2

In [40]:
# Collapse to a plain two-column lookup (treaty, most common submessage) and
# drop the leftover 'Submessages' label from the column axis.
treaty_submessage_table2 = (
    treaty_submessage_table2
    .reset_index()
    .loc[:, ['treaty', 'most common submessage']]
    .rename_axis(None, axis=1)
)

In [41]:
treaty_submessage_table2

Unnamed: 0,treaty,most common submessage
0,the nuclear nonproliferation treaty,NPT


Accuracy of topic (Submessages) classification based on treaty (data from April 2020 onward only).

In [42]:
# Post-April-2020 rows that mention a treaty (non-missing 'Treaties2').
treaty_articles2 = post_april_2020[post_april_2020['Treaties2'].notna()]

In [43]:
# Restrict to treaties that have an entry in the post-April-2020 lookup table.
has_known_treaty2 = treaty_articles2['Treaties2'].isin(treaty_submessage_table2['treaty'])
treaty_articles2 = treaty_articles2[has_known_treaty2]

In [44]:
# Predict, for every treaty-mentioning post-April-2020 row, the most common
# submessage of its treaty (looked up in treaty_submessage_table2). A single
# vectorised lookup replaces the row-by-row loop, which rebuilt
# treaty_articles2.reset_index() on every iteration.
submessage_by_treaty2 = treaty_submessage_table2.set_index('treaty')['most common submessage']
# map() aligns on the treaty name; it raises if a treaty appears more than once
# in the lookup table, mirroring the .item() failure of the original loop.
predicted_submessage2 = treaty_articles2['Treaties2'].map(submessage_by_treaty2).tolist()
    

In [45]:
# Side-by-side table of the labelled submessage and the treaty-based prediction
# for the post-April-2020 rows, re-indexed 0..n-1 to match the prediction list.
true_predicted_submessage_table2 = (
    treaty_articles2[['Submessages']]
    .reset_index(drop=True)
    .rename(columns={'Submessages': 'true_submessage'})
    .assign(predicted_submessage=predicted_submessage2)
)

In [46]:
true_predicted_submessage_table2

Unnamed: 0,true_submessage,predicted_submessage
0,US Nuclear Arsenal,NPT


In [47]:
# Accuracy on the post-April-2020 rows: share of rows whose predicted
# submessage equals the labelled one.
scored_submessages2 = true_predicted_submessage_table2
is_hit = scored_submessages2['true_submessage'] == scored_submessages2['predicted_submessage']
correct = sum(is_hit)
total = len(scored_submessages2)
accuracy = correct / total
accuracy

0.5

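Main findings
- Treaty mentions are a better predictor of the message topic (Messages) than of the finer-grained submessage topic (Submessages).
    - Accuracies when predicting each treaty's most common label:
        - presence of certain treaty (treaty, Messages) ~ 0.58 (entire dataset)
        - presence of certain treaty (treaty, Submessages) ~ 0.40 (entire dataset)
        - presence of certain treaty (treaty, Submessages) ~ 0.5 (articles from April 2020 onward)
    - Caveat: the April-2020-onward figure rests on a single treaty ("the nuclear nonproliferation treaty") whose labelled rows split evenly between NPT and US Nuclear Arsenal, so it is too small a sample to compare with the full-dataset numbers.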