# Topic Analysis #6 - Modeling (Grace Chen)

Grace Chen

## Load Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import collections

In [7]:
features_df = pd.read_csv("final_features_grace.csv")
features_df.head()

Unnamed: 0,Quote ID,Messages,most_frequent_country_3,most_frequent_country_full,most_frequent_headline,max_frequency_context,max_frequency_full,max_frequency_headline,majority
0,58465753,US Nuclear and Missile Policy,U.S.,U.S.,,1.0,1.0,0.0,U.S.
1,58465751,US Nuclear and Missile Policy,U.S.,U.S.,,1.0,1.0,0.0,U.S.
2,58465752,US Nuclear and Missile Policy,,U.S.,,0.0,1.0,0.0,
3,58671291,Iran,Iran,Iran,Iran,0.75,0.619048,0.5,Iran
4,58465761,Iran,Iran,Iran,Iran,0.75,0.619048,0.5,Iran


## One Hot Encoding 

In [9]:
ohe_df = pd.get_dummies(features_df[['most_frequent_country_3',
                            'most_frequent_country_full',
                            'most_frequent_headline',
                            'majority']],drop_first=True)
ohe_df.head()

Unnamed: 0,most_frequent_country_3_India,most_frequent_country_3_Iran,most_frequent_country_3_Israel,most_frequent_country_3_North Korea,most_frequent_country_3_Pakistan,most_frequent_country_3_Russia,most_frequent_country_3_Turkey,most_frequent_country_3_U.S.,most_frequent_country_full_India,most_frequent_country_full_Iran,...,most_frequent_headline_Turkey,most_frequent_headline_U.S.,majority_India,majority_Iran,majority_Israel,majority_North Korea,majority_Pakistan,majority_Russia,majority_Turkey,majority_U.S.
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [27]:
messages = features_df["Messages"].unique()
messages

array(['US Nuclear and Missile Policy', 'Iran', 'Treaties',
       'Nuclear consequences', 'North Korea', 'China', 'Russia',
       'Arms Reduction', 'Pakistan', 'Israel', 'India', 'Turkey'],
      dtype=object)

In [28]:
messages_int_form = np.arange(len(messages))
messages_int_form

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [29]:
message_convert_dict = dict(zip(messages, messages_int_form))

def replace(message):
    return message_convert_dict[message]

In [35]:
y = features_df["Messages"].apply(replace)
y

0        0
1        0
2        0
3        1
4        1
        ..
14202    6
14203    5
14204    5
14205    6
14206    6
Name: Messages, Length: 14207, dtype: int64

In [36]:
remaining_df = features_df[["Quote ID", "max_frequency_context", "max_frequency_full", "max_frequency_headline"]]
remaining_df.head()

Unnamed: 0,Quote ID,max_frequency_context,max_frequency_full,max_frequency_headline
0,58465753,1.0,1.0,0.0
1,58465751,1.0,1.0,0.0
2,58465752,0.0,1.0,0.0
3,58671291,0.75,0.619048,0.5
4,58465761,0.75,0.619048,0.5


In [42]:
X = pd.concat([remaining_df, ohe_df], axis = 1)
X = X.set_index("Quote ID")
X.head()

Unnamed: 0_level_0,max_frequency_context,max_frequency_full,max_frequency_headline,most_frequent_country_3_India,most_frequent_country_3_Iran,most_frequent_country_3_Israel,most_frequent_country_3_North Korea,most_frequent_country_3_Pakistan,most_frequent_country_3_Russia,most_frequent_country_3_Turkey,...,most_frequent_headline_Turkey,most_frequent_headline_U.S.,majority_India,majority_Iran,majority_Israel,majority_North Korea,majority_Pakistan,majority_Russia,majority_Turkey,majority_U.S.
Quote ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58465753,1.0,1.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
58465751,1.0,1.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
58465752,0.0,1.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58671291,0.75,0.619048,0.5,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
58465761,0.75,0.619048,0.5,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## Train Test Split

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [45]:
X_train

Unnamed: 0_level_0,max_frequency_context,max_frequency_full,max_frequency_headline,most_frequent_country_3_India,most_frequent_country_3_Iran,most_frequent_country_3_Israel,most_frequent_country_3_North Korea,most_frequent_country_3_Pakistan,most_frequent_country_3_Russia,most_frequent_country_3_Turkey,...,most_frequent_headline_Turkey,most_frequent_headline_U.S.,majority_India,majority_Iran,majority_Israel,majority_North Korea,majority_Pakistan,majority_Russia,majority_Turkey,majority_U.S.
Quote ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58607754,0.500000,0.630435,1.0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
71755613,0.500000,0.416667,0.5,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
58777441,0.500000,0.416667,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77042956,0.500000,0.600000,0.0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
58675086,0.500000,0.666667,1.0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73016172,0.750000,0.615385,0.5,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
71757883,1.000000,0.500000,0.0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
74392979,0.615385,0.600000,0.5,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
74079653,0.666667,0.526316,1.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [46]:
y_train

5718     1
9850     1
13159    0
13112    4
1697     1
        ..
5218     2
12252    4
1346     1
11646    5
3582     1
Name: Messages, Length: 11365, dtype: int64

## Decision Trees

In [48]:
from sklearn import tree

In [49]:
dt = tree.DecisionTreeClassifier()

In [51]:
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [59]:
train_predict = dt.predict(X_train)
acc_train = np.count_nonzero(train_predict == y_train) / len(y_train)
acc_train

0.8964364276286846

In [60]:
test_predict = dt.predict(X_test)
acc_test = np.count_nonzero(test_predict == y_test) / len(y_test)
acc_test

0.6759324419422942

## Random Forest

In [66]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
rf = RandomForestClassifier()

In [80]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [81]:
train_predict = rf.predict(X_train)
acc_train = np.count_nonzero(train_predict == y_train) / len(y_train)
acc_train

0.8964364276286846

In [82]:
test_predict = rf.predict(X_test)
acc_test = np.count_nonzero(test_predict == y_test) / len(y_test)
acc_test

0.687192118226601

## Logistic Regression

In [83]:
from sklearn.linear_model import LogisticRegression

In [85]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [86]:
train_predict = log_reg.predict(X_train)
acc_train = np.count_nonzero(train_predict == y_train) / len(y_train)
acc_train

0.6794544654641443

In [87]:
test_predict = log_reg.predict(X_test)
acc_test = np.count_nonzero(test_predict == y_test) / len(y_test)
acc_test

0.669247009148487