In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.stats import ttest_1samp
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
from sklearn import svm
import csv, sqlite3
%matplotlib inline

from GDELT_utils import GDELT_columns, usecols, dtype_dict, \
                        cameo_dict, map_cameo_to_text, \
                        state_dict, mem_usage, state_heat_map

In [3]:
con = sqlite3.connect("gdelt.db")

In [4]:
gdelt_sql = pd.read_sql_query("SELECT SQLDATE, Actor1CountryCode, Actor2CountryCode, \
                              Actor1Type1Code, Actor2Type1Code, \
                            Actor1Geo_CountryCode, Actor2Geo_CountryCode, \
                            EventRootCode, AVG(NumMentions), AVG(AvgTone) \
                          FROM gdelt \
                          WHERE SQLDATE > \"2017-05-01\" AND SQLDATE < \"2017-08-01\" \
                          GROUP BY SQLDATE, Actor1CountryCode, Actor2CountryCode, \
                          Actor1Type1Code, Actor2Type1Code, \
                          Actor1Geo_CountryCode, Actor2Geo_CountryCode, EventRootCode", con)
# aggregate data by
# Date Actor1Code Actor2Code Actor1Geo_CountryCode Actor2Geo_CountryCode EventRootCode -> Mean AvgTone

# Features Actor1Code Actor2Code Actor1Geo_CountryCode Actor2Geo_CountryCode NumMentions EventRootCode 

In [5]:
gdelt_sql.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7514833 entries, 0 to 7514832
Data columns (total 10 columns):
SQLDATE                  object
Actor1CountryCode        object
Actor2CountryCode        object
Actor1Type1Code          object
Actor2Type1Code          object
Actor1Geo_CountryCode    object
Actor2Geo_CountryCode    object
EventRootCode            object
AVG(NumMentions)         float64
AVG(AvgTone)             float64
dtypes: float64(2), object(8)
memory usage: 573.3+ MB


In [6]:
gdelt_sql['AVG(NumMentions)'].value_counts()

2.000000       1341170
10.000000       869253
1.000000        726505
4.000000        683948
6.000000        564844
5.000000        469075
3.000000        422756
8.000000        352061
20.000000       141490
12.000000       111735
7.000000        102068
2.500000         60906
16.000000        57205
9.000000         56854
15.000000        54729
1.500000         52123
30.000000        47712
3.500000         41762
18.000000        35898
7.500000         35746
4.500000         34621
14.000000        33388
5.500000         29664
24.000000        26790
11.000000        24702
6.500000         24417
40.000000        22842
3.333333         22189
13.000000        18548
2.666667         17189
                ...   
57.158730            1
1011.666667          1
48.241135            1
21.488095            1
73.823529            1
260.750000           1
20.036765            1
54.631579            1
34.083832            1
259.375000           1
9.327044             1
18.608187            1
17.246154  

In [172]:
gdelt_sql.shape

(7514833, 10)

In [173]:
gdelt_sql.to_csv("gdelt_agg_big.csv", chunksize=100, index=False)

In [174]:
gdelt_raw = pd.read_csv("gdelt_agg_big.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [175]:
gdelt = pd.read_csv("gdelt_agg_big.csv", dtype={"EventRootCode": "category",
                                               "Actor1CountryCode": "category", 
                                               "Actor2CountryCode": "category",
                                               "Actor1Geo_CountryCode": "category", 
                                               "Actor2Geo_CountryCode": "category",
                                               "Actor1Type1Code": "category",
                                               "Actor2Type1Code": "category"})

In [177]:
gdelt.columns

Index(['SQLDATE', 'Actor1CountryCode', 'Actor2CountryCode', 'Actor1Type1Code',
       'Actor2Type1Code', 'Actor1Geo_CountryCode', 'Actor2Geo_CountryCode',
       'EventRootCode', 'AVG(NumMentions)', 'AVG(AvgTone)'],
      dtype='object')

In [178]:
interest = ["SQLDATE",
            "AvgTone", "NumMentions", "EventRootCode",
            "Actor1CountryCode", "Actor2CountryCode",
            "Actor1Geo_CountryCode", "Actor2Geo_CountryCode"]

categories = ["EventRootCode",
            "Actor1CountryCode", "Actor2CountryCode",
            "Actor1Geo_CountryCode", "Actor2Geo_CountryCode"]

gdelt = gdelt.drop(labels=gdelt[(gdelt['EventRootCode'] == "--")].index)

for category_col in categories:
    gdelt[category_col] = gdelt[category_col].astype('category') \
                                    .cat.remove_unused_categories()
    
gdelt['SQLDATE'] = pd.to_datetime(gdelt['SQLDATE'])  

In [6]:
gdelt_sample = gdelt.sample(frac=.05)

In [10]:
gdelt_tones = gdelt_sample['AVG(AvgTone)']

In [18]:
gdelt_tones = gdelt_sample['AVG(AvgTone)']
def dim_reduce(column):
    gdelt_cntry = gdelt_sample[column]

    one_hot = pd.get_dummies(gdelt_cntry)
    
    one_hot_tone = pd.concat([gdelt_tones, one_hot], axis=1)

    avg_avgtone_mean = one_hot_tone['AVG(AvgTone)'].mean()
    avg_avgtone_std = one_hot_tone['AVG(AvgTone)'].std()

    country_tones = []
    country_info = []
    for column in one_hot.columns:
        temp = one_hot_tone[[column, 'AVG(AvgTone)']]
        country = temp[temp[column] == 1]
        if len(country) < 10:
            #print(country)
            continue
        column_means = temp.groupby(column).mean()['AVG(AvgTone)']
        country_tones.append(column_means)
        country_info.append((column, column_means[0] - column_means[1], np.absolute(column_means[0] - column_means[1]), 
                            (temp[column].sum()), ttest_1samp(country, avg_avgtone_mean).pvalue[1]))

    cntry_spec = pd.DataFrame(country_info, columns=["Country", "AvgTone_diff", "AvgTone_mag", "Num", "p-value"])

    low_decs = cntry_spec[((cntry_spec['p-value'] > 0.0001) & ((cntry_spec['AvgTone_mag'] < 1) & \
               (cntry_spec['AvgTone_diff'] < 0)))]['Country']

    low_incs = cntry_spec[((cntry_spec['p-value'] > 0.0001) & ((cntry_spec['AvgTone_mag'] < 1) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']

    low_p_value = cntry_spec[((cntry_spec['p-value'] > 0.0001) & ~((cntry_spec['AvgTone_mag'] < 1) & \
               (cntry_spec['AvgTone_diff'] < 0)) & ~((cntry_spec['AvgTone_mag'] < 1) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']
    
    return low_decs, low_incs, low_p_value

In [24]:
low_decs_A1CC, low_incs_A1CC, low_p_value_A1CC = dim_reduce('Actor1CountryCode')
low_decs_A2CC, low_incs_A2CC, low_p_value_A2CC = dim_reduce('Actor2CountryCode')
low_decs_A1GCC, low_incs_A1GCC, low_p_value_A1GCC = dim_reduce('Actor1Geo_CountryCode')
low_decs_A2GCC, low_incs_A2GCC, low_p_value_A2GCC = dim_reduce('Actor2Geo_CountryCode')

In [44]:
model_df = gdelt_sample.copy()

In [45]:
def map_missing(x, low_decs, low_incs, low_p_value, translation):
    #if x in l:
    #    return x
    if x == "nan":
        return "UNKNOWN"
    elif x in low_decs:
        return "LOW_DEC"
    elif x in low_incs:
        return "LOW_INCS"
    elif x in low_p_value:
        return "LOW_P_VALUE"
    else:
        return x

def unify_cats(data, category, low_decs, low_incs, low_p_value):
    low_decs_unique = low_decs.unique()
    low_decs_unique = low_incs.unique()
    low_p_value_unique = low_p_value.unique()
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_missing(x, low_decs_unique, 
                                                                             low_decs_unique, 
                                                                             low_p_value_unique, 
                                                                             "OTHER")) \
                                                .astype('category')

In [46]:
unify_cats(model_df, 'Actor1CountryCode', low_decs_A1CC, low_incs_A1CC, low_p_value_A1CC)
unify_cats(model_df, 'Actor2CountryCode', low_decs_A2CC, low_incs_A2CC, low_p_value_A2CC)
unify_cats(model_df, 'Actor1Geo_CountryCode', low_decs_A1GCC, low_incs_A1GCC, low_p_value_A1GCC)
unify_cats(model_df, 'Actor2Geo_CountryCode', low_decs_A2GCC, low_incs_A2GCC, low_p_value_A2GCC)

In [48]:
model_df['norm_NumMentions'] = (model_df['AVG(NumMentions)'] - model_df['AVG(NumMentions)'].mean())/ \
                                        model_df['AVG(NumMentions)'].std()

In [61]:
#model_df = model_df.drop(['Actor1CountryCode', 'Actor2CountryCode', 'Actor1Geo_CountryCode', 
#               'Actor2Geo_CountryCode', 'AVG(NumMentions)', 'SQLDATE'], axis=1)

Actor1CC_one_hot = pd.get_dummies(model_df['Actor1CountryCode_unify'], prefix="Actor1CountryCode")
Actor2CC_one_hot = pd.get_dummies(model_df['Actor2CountryCode_unify'], prefix="Actor2CountryCode")
Actor1CCGeo_one_hot = pd.get_dummies(model_df['Actor1Geo_CountryCode_unify'], prefix="Actor1Geo_CountryCode")
Actor2CCGeo_one_hot = pd.get_dummies(model_df['Actor2Geo_CountryCode_unify'], prefix="Actor2Geo_CountryCode")
EventRoot_one_hot = pd.get_dummies(model_df['EventRootCode'], prefix="EventRootCode")

one_hot_encoding = pd.concat([Actor1CC_one_hot, Actor2CC_one_hot, Actor1CCGeo_one_hot, 
                              Actor2CCGeo_one_hot, EventRoot_one_hot], axis=1)

model_df_hot = pd.concat([model_df, one_hot_encoding], axis=1).drop(['Actor1CountryCode_unify',
                                                                    'Actor2CountryCode_unify',
                                                                    'Actor1Geo_CountryCode_unify',
                                                                    'Actor2Geo_CountryCode_unify'], axis=1)

In [66]:
feature_columns = model_df_hot.drop(['AVG(AvgTone)'], axis=1).columns

In [65]:
train, test = train_test_split(model_df_hot, test_size=0.2, random_state=42)

In [68]:
regr = linear_model.LinearRegression()
Y = train['AVG(AvgTone)'].values.reshape(-1, 1)
X = train[feature_columns]
regr.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [69]:
regr.score(X, Y)

0.24057144216501747

In [72]:
from sklearn.linear_model import Ridge
regr = Ridge()
Y = train['AVG(AvgTone)'].values.reshape(-1, 1)
X = train[feature_columns]
regr.fit(X, Y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [73]:
regr.score(X, Y)

0.24050873555027785

In [6]:
def map_missing(x, l, translation):
    if x in l:
        return x
    elif x == "nan":
        return "UNKNOWN"
    else:
        return translation

def unify_rare_cats(data, category, cut_off):
    vc = data[category].value_counts()
    past_cut_off = (vc/len(data)) > cut_off
    remaining = list(vc[past_cut_off].index)
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_missing(x, remaining, "OTHER")) \
                                                .astype('category')

In [64]:
gdelt_sample_rem = gdelt_sample.drop(['Actor1Geo_CountryCode', 'Actor2Geo_CountryCode', 'SQLDATE'], axis=1)

In [66]:
unify_rare_cats(gdelt_sample, 'Actor1CountryCode', .005)
unify_rare_cats(gdelt_sample, 'Actor2CountryCode', .005)
unify_rare_cats(gdelt_sample, 'Actor1Geo_CountryCode', .005)
unify_rare_cats(gdelt_sample, 'Actor2Geo_CountryCode', .005)

unify_rare_cats(gdelt_sample_rem, 'Actor1CountryCode', .005)
unify_rare_cats(gdelt_sample_rem, 'Actor2CountryCode', .005)

In [67]:
gdelt_sample['norm_NumMentions'] = (gdelt_sample['AVG(NumMentions)'] - gdelt_sample['AVG(NumMentions)'].mean())/ \
                                        gdelt_sample['AVG(NumMentions)'].std()
    
gdelt_sample_rem['norm_NumMentions'] = (gdelt_sample_rem['AVG(NumMentions)'] - gdelt_sample_rem['AVG(NumMentions)'].mean())/ \
                                        gdelt_sample_rem['AVG(NumMentions)'].std()

In [68]:
Actor1CC_one_hot = pd.get_dummies(gdelt_sample['Actor1CountryCode_unify'], prefix="Actor1CountryCode")
Actor2CC_one_hot = pd.get_dummies(gdelt_sample['Actor2CountryCode_unify'], prefix="Actor2CountryCode")
Actor1CCGeo_one_hot = pd.get_dummies(gdelt_sample['Actor1Geo_CountryCode_unify'], prefix="Actor1Geo_CountryCode")
Actor2CCGeo_one_hot = pd.get_dummies(gdelt_sample['Actor2Geo_CountryCode_unify'], prefix="Actor2Geo_CountryCode")
EventRoot_one_hot = pd.get_dummies(gdelt_sample['EventRootCode'], prefix="EventRootCode")

one_hot_encoding = pd.concat([Actor1CC_one_hot, Actor2CC_one_hot, Actor1CCGeo_one_hot, 
                              Actor2CCGeo_one_hot, EventRoot_one_hot], axis=1)

gdelt_sample = pd.concat([gdelt_sample, one_hot_encoding], axis=1)

Actor1CC_one_hot_rem = pd.get_dummies(gdelt_sample_rem['Actor1CountryCode_unify'], prefix="Actor1CountryCode")
Actor2CC_one_hot_rem = pd.get_dummies(gdelt_sample_rem['Actor2CountryCode_unify'], prefix="Actor2CountryCode")
EventRoot_one_hot_rem = pd.get_dummies(gdelt_sample_rem['EventRootCode'], prefix="EventRootCode")

one_hot_encoding_rem = pd.concat([Actor1CC_one_hot_rem, Actor2CC_one_hot_rem, EventRoot_one_hot_rem], axis=1)

gdelt_sample_rem = pd.concat([gdelt_sample_rem, one_hot_encoding_rem], axis=1)

In [56]:
gdelt_sample.shape

(218524, 202)

In [57]:
gdelt_sample_rem.shape

(218524, 103)

In [72]:
model_columns = list(one_hot_encoding.columns)
model_columns.append('norm_NumMentions')
feature_columns = model_columns.copy()
model_columns.append('AVG(AvgTone)')

model_columns_rem = list(one_hot_encoding_rem.columns)
model_columns_rem.append('norm_NumMentions')
feature_columns_rem = model_columns_rem.copy()
model_columns_rem.append('AVG(AvgTone)')

In [74]:
gdelt_sample_m = gdelt_sample[model_columns].copy()

gdelt_sample_rem_m = gdelt_sample_rem[model_columns_rem].copy()

In [75]:
train, test = train_test_split(gdelt_sample_m, test_size=0.25, random_state=42)

train_rem, test_rem = train_test_split(gdelt_sample_rem_m, test_size=0.25, random_state=42)

In [76]:
train.shape

(163893, 186)

In [77]:
train_rem.shape

(163893, 96)

In [78]:
train, test = train_test_split(gdelt_sample_m, test_size=0.25, random_state=42)

regr = linear_model.LinearRegression()
Y = train['AVG(AvgTone)']
X = train[feature_columns]
regr.fit(X, Y)

print('\nR-squared:')
print(regr.score(X, Y))


R-squared:
0.21195525887786848


In [80]:
regr = linear_model.LinearRegression()
Y = train_rem['AVG(AvgTone)']
X = train_rem[feature_columns_rem]
regr.fit(X, Y)

print('\nR-squared:')
print(regr.score(X, Y))


R-squared:
0.19612340075559853


In [82]:
gbr = GradientBoostingRegressor(max_depth=2, verbose=True)
Y = train['AVG(AvgTone)']
X = train[feature_columns]
gbr.fit(X, Y)

print('\nR-squared:')
print(gbr.score(X, Y))

      Iter       Train Loss   Remaining Time 
         1          11.9990           20.34s
         2          11.8767           20.96s
         3          11.7423           21.06s
         4          11.6426           20.51s
         5          11.5372           20.56s
         6          11.4539           20.20s
         7          11.3692           20.15s
         8          11.2915           20.01s
         9          11.2241           19.67s
        10          11.1566           19.59s
        20          10.7166           17.05s
        30          10.4914           14.72s
        40          10.3520           12.51s
        50          10.2515           10.34s
        60          10.1728            8.22s
        70          10.1086            6.14s
        80          10.0549            4.07s
        90          10.0058            2.04s
       100           9.9647            0.00s

R-squared:
0.17984288931184345


In [83]:
gbr = GradientBoostingRegressor(max_depth=2, verbose=True)
Y = train_rem['AVG(AvgTone)']
X = train_rem[feature_columns_rem]
gbr.fit(X, Y)

print('\nR-squared:')
print(gbr.score(X, Y))

      Iter       Train Loss   Remaining Time 
         1          11.9990           11.26s
         2          11.8767           11.19s
         3          11.7423           11.22s
         4          11.6426           10.99s
         5          11.5372           11.07s
         6          11.4539           11.00s
         7          11.3697           10.88s
         8          11.2921           10.84s
         9          11.2248           10.82s
        10          11.1579           10.88s
        20          10.7201            9.95s
        30          10.4975            8.66s
        40          10.3661            7.26s
        50          10.2721            5.96s
        60          10.2009            4.71s
        70          10.1437            3.50s
        80          10.0983            2.31s
        90          10.0602            1.15s
       100          10.0275            0.00s

R-squared:
0.17466966269757112


In [85]:
gbr = GradientBoostingRegressor(max_depth=3, verbose=True, n_estimators=250)
Y = train_rem['AVG(AvgTone)']
X = train_rem[feature_columns_rem]
gbr.fit(X, Y)

print('\nR-squared:')
print(gbr.score(X, Y))

      Iter       Train Loss   Remaining Time 
         1          11.9388           56.12s
         2          11.7679           58.00s
         3          11.5662            1.01m
         4          11.4354           58.12s
         5          11.2817           58.91s
         6          11.1773           57.53s
         7          11.0588           57.94s
         8          10.9625           58.21s
         9          10.8883           57.18s
        10          10.8141           57.32s
        20          10.4229           52.19s
        30          10.2504           49.93s
        40          10.1417           47.97s
        50          10.0616           45.95s
        60          10.0019           45.79s
        70           9.9554           42.34s
        80           9.9176           38.86s
        90           9.8828           35.80s
       100           9.8542           33.00s
       200           9.6835           10.36s

R-squared:
0.20653569805723182


In [89]:
regr = Ridge()
Y = train_rem['AVG(AvgTone)']
X = train_rem[feature_columns_rem]
regr.fit(X, Y)

print('\nR-squared:')
print(regr.score(X, Y))

regr.coef_


R-squared:
0.19626564733644636


array([-0.8515251 ,  0.84879249,  0.42976662,  0.58719902,  0.24300694,
        0.77631464,  0.86328756,  0.11948364, -0.57047295,  0.71796266,
        0.43715595,  0.52781539,  0.27002181, -0.08290007,  0.69548257,
       -0.69176825, -0.75654196, -0.39573903,  0.29765846, -0.43138564,
        0.82775763,  0.51355615,  0.16058804,  0.23049128,  0.17102605,
        0.31995076, -0.36282578, -0.18070882,  0.28522125, -0.77060142,
       -1.0950767 , -0.61206347, -0.25294739, -0.4342424 , -1.14120839,
       -0.29421253, -0.51348242, -0.07056624,  0.18572966, -0.68173321,
        0.9156716 ,  0.67602895,  0.08100082,  0.65746073,  0.77851383,
        0.15983953, -0.51390072,  0.61131216,  0.52273344,  0.60329033,
        0.21400534, -0.01603856,  0.62005944, -0.73601223, -0.55837229,
       -0.09700976,  0.3706473 ,  0.93742817,  0.56769648, -0.0497552 ,
        0.06074978,  0.29155451, -0.49646468, -0.48462042, -0.88252197,
       -0.93761505, -0.61455413, -0.28073163, -0.53419266, -1.13

In [100]:
gdelt_sample.columns.to_series().groupby(gdelt_sample.dtypes)

TypeError: data type not understood

In [103]:
gdelt_sample = gdelt_sample.drop(['SQLDATE'], axis=1)

In [108]:
gdelt_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 218524 entries, 10869 to 2657273
Columns: 196 entries, Actor1CountryCode to EventRootCode_20
dtypes: category(9), float64(3), uint8(184)
memory usage: 47.8 MB


In [112]:
gdelt_sample['Actor1CountryCode'].dtype == 'category'

True

In [115]:
gdelt_sample.columns.drop(['Actor1CountryCode'])

Index(['Actor2CountryCode', 'Actor1Geo_CountryCode', 'Actor2Geo_CountryCode',
       'EventRootCode', 'AVG(NumMentions)', 'AVG(AvgTone)',
       'Actor1CountryCode_unify', 'Actor2CountryCode_unify',
       'Actor1Geo_CountryCode_unify', 'Actor2Geo_CountryCode_unify',
       ...
       'EventRootCode_11', 'EventRootCode_12', 'EventRootCode_13',
       'EventRootCode_14', 'EventRootCode_15', 'EventRootCode_16',
       'EventRootCode_17', 'EventRootCode_18', 'EventRootCode_19',
       'EventRootCode_20'],
      dtype='object', length=195)

In [None]:
# TODO reduce into quantile buckets
def quantile_reduce(sample, column, pvalue, mag, count_penalty):
    tones = sample['AVG(AvgTone)']
    cntry = sample[column]

    one_hot = pd.get_dummies(cntry)
    one_hot_tone = pd.concat([tones, one_hot], axis=1)

    avg_avgtone_mean = one_hot_tone['AVG(AvgTone)'].mean()

    country_info = []
    for column in one_hot.columns:
        temp = one_hot_tone[[column, 'AVG(AvgTone)']]
        country = temp[temp[column] == 1]
        columns_mean = None
        if len(country) < count_penalty: 
            column_means = [0, 0]
            country_info.append((column, 
                                 0, 
                                 0, 
                                 (temp[column].sum()),
                                 1)
        else:
            column_means = temp.groupby(column).mean()['AVG(AvgTone)']
            country_info.append((column, 
                                 column_means[0] - column_means[1], 
                                 np.absolute(column_means[0] - column_means[1]), 
                                 (temp[column].sum()),
                                 ttest_1samp(country, avg_avgtone_mean).pvalue[1]))

    cntry_spec = pd.DataFrame(country_info, columns=["Country", "AvgTone_diff", "AvgTone_mag", "Num", "p-value"])

    low_decs = cntry_spec[((cntry_spec['p-value'] > pvalue) & ((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] < 0)))]['Country']

    low_incs = cntry_spec[((cntry_spec['p-value'] > pvalue) & ((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']

    low_p_value = cntry_spec[((cntry_spec['p-value'] > pvalue) & ~((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] < 0)) & ~((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']
    
    return low_decs, low_incs, low_p_value

def map_quantile(x, low_decs, low_incs, low_p_value, translation):
    if x == "nan":
        return "UNKNOWN"
    elif x in low_decs:
        return "LOW_DEC"
    elif x in low_incs:
        return "LOW_INCS"
    elif x in low_p_value:
        return "LOW_P_VALUE"
    else:
        return x

def unify_quantile_cats(data, category, pvalue, mag, low_decs, low_incs, low_p_value):
    low_decs, low_incs, low_p_value = weak_reduce(data, category, pvalue, mag)
    low_decs_unique = low_decs.unique()
    low_decs_unique = low_incs.unique()
    low_p_value_unique = low_p_value.unique()
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_missing(x, low_decs_unique, 
                                                                             low_decs_unique, 
                                                                             low_p_value_unique, 
                                                                             "OTHER")) \
                                                .astype('category')

In [23]:
def weak_reduce(sample, column, pvalue, mag, count_penalty):
    tones = sample['AVG(AvgTone)']
    cntry = sample[column]

    one_hot = pd.get_dummies(cntry)
    one_hot_tone = pd.concat([tones, one_hot], axis=1)

    avg_avgtone_mean = one_hot_tone['AVG(AvgTone)'].mean()

    country_info = []
    for column in one_hot.columns:
        temp = one_hot_tone[[column, 'AVG(AvgTone)']]
        country = temp[temp[column] == 1]
        columns_mean = None
        if len(country) < count_penalty: 
            column_means = [0, 0]
            country_info.append((column, 
                                 0, 
                                 0, 
                                 (temp[column].sum()),
                                 1))
        else:
            column_means = temp.groupby(column).mean()['AVG(AvgTone)']
            country_info.append((column, 
                                 column_means[0] - column_means[1], 
                                 np.absolute(column_means[0] - column_means[1]), 
                                 (temp[column].sum()),
                                 ttest_1samp(country, avg_avgtone_mean).pvalue[1]))

    cntry_spec = pd.DataFrame(country_info, columns=["Country", "AvgTone_diff", "AvgTone_mag", "Num", "p-value"])

    low_decs = cntry_spec[((cntry_spec['p-value'] > pvalue) & ((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] < 0)))]['Country']

    low_incs = cntry_spec[((cntry_spec['p-value'] > pvalue) & ((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']

    low_p_value = cntry_spec[((cntry_spec['p-value'] > pvalue) & ~((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] < 0)) & ~((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']
    
    return low_decs, low_incs, low_p_value

def map_weak(x, low_decs, low_incs, low_p_value, translation):
    #if x in l:
    #    return x
    if x == "nan":
        return "UNKNOWN"
    elif x in low_decs:
        return "LOW_DEC"
    elif x in low_incs:
        return "LOW_INCS"
    elif x in low_p_value:
        return "LOW_P_VALUE"
    else:
        return x

def unify_weak_cats(data, category, pvalue, mag, count_penalty):
    low_decs, low_incs, low_p_value = weak_reduce(data, category, pvalue, mag)
    low_decs_unique = low_decs.unique()
    low_decs_unique = low_incs.unique()
    low_p_value_unique = low_p_value.unique()
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_missing(x, low_decs_unique, 
                                                                             low_decs_unique, 
                                                                             low_p_value_unique, 
                                                                             "OTHER")) \
                                                .astype('category')

In [1]:
def map_rare(x, l, translation):
    if x in l:
        return x
    elif x == "nan":
        return "UNKNOWN"
    else:
        return translation

def unify_rare_cats(data, category, cut_off):
    vc = data[category].value_counts()
    past_cut_off = (vc/len(data)) > cut_off
    remaining = list(vc[past_cut_off].index)
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_rare(x, remaining, "OTHER")) \
                                                .astype('category')
            
def map_unknown(x):
    if x == 'nan':
        return "UNKNOWN"
    else:
        return x

In [179]:
def init_sample(frac): 
    gdelt_sample = gdelt.sample(frac=frac) \
                        .drop(['SQLDATE'], axis=1)
    gdelt_sample['norm_NumMentions'] = (gdelt_sample['AVG(NumMentions)'] \
                                                - gdelt_sample['AVG(NumMentions)'].mean())/ \
                                        gdelt_sample['AVG(NumMentions)'].std()
    gdelt_sample = gdelt_sample.drop(['AVG(NumMentions)'], axis=1) 
    for category_col in gdelt_sample.columns:
        if hasattr(live_samp[column], 'cat'):
            gdelt_sample[category_col] = gdelt_sample[category_col] \
                                            .cat.remove_unused_categories()
    return gdelt_sample.copy()

In [26]:
list(init_sample(frac=.05)['EventRootCode'].unique())

['11',
 '04',
 '09',
 '05',
 '02',
 '19',
 '07',
 '10',
 '03',
 '18',
 '01',
 '13',
 '08',
 '12',
 '06',
 '14',
 '16',
 '17',
 '20',
 '15']

In [180]:
def pare(sample):
    return sample.drop(['Actor1Geo_CountryCode', 'Actor2Geo_CountryCode'], axis=1)

In [28]:
hasattr(gdelt_sample['Actor1CountryCode'], 'cat')

NameError: name 'gdelt_sample' is not defined

In [59]:
pd.util.hash_pandas_object(test_samp).sum()

4495320082356185253

In [63]:
"1" in {"2": "A"}

False

In [65]:
test = {"2": "a"}
test["2"]

'a'

In [184]:
test_samp.columns

Index(['Actor1CountryCode', 'Actor2CountryCode', 'Actor1Geo_CountryCode',
       'Actor2Geo_CountryCode', 'EventRootCode', 'AVG(AvgTone)',
       'norm_NumMentions'],
      dtype='object')

In [211]:
naive_cache = {}

# https://github.com/pandas-dev/pandas/issues/8814
def train_naive(sample, model):
    h = pd.util.hash_pandas_object(sample).sum()
    model_samp, feat_cols = None, None
    if h not in naive_cache:
        live_samp = sample.copy()

        cat_dummies = []
        drop_cols = []
        for column in live_samp.columns:
            if hasattr(live_samp[column], 'cat'):
                live_samp[column] = live_samp[column].cat.add_categories(['UNK'])
                live_samp[column].fillna('UNK')
                hot = pd.get_dummies(live_samp[column], prefix=column)
                cat_dummies.append(hot)
                drop_cols.append(column)

        live_samp = live_samp.drop(drop_cols, axis=1)            

        one_hot_enc = pd.concat(cat_dummies, axis=1)

        model_samp = pd.concat([live_samp, one_hot_enc], axis=1)
        feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
        
        naive_cache[h] = (model_samp, feat_cols)
    else:
        model_samp, feat_cols = naive_cache[h]
    
    train, test = train_test_split(model_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)

    train_score = model.score(X, Y)
    Y_test = test['AVG(AvgTone)']
    X_test = test[feat_cols]
    test_score = model.score(X_test, Y_test)
    
    return train_score, test_score, model, model_samp, train, test

In [85]:
pared_cache = {}

def train_pared(sample, model):
    h = pd.util.hash_pandas_object(sample).sum()
    model_samp, feat_cols = None, None
    if h not in pared_cache:
        live_samp = sample.copy()
        live_samp = pare(live_samp)

        cat_dummies = []
        drop_cols = []
        for column in live_samp.columns:
            if hasattr(live_samp[column], 'cat'):
                hot = pd.get_dummies(live_samp[column], prefix=column)
                cat_dummies.append(hot)
                drop_cols.append(column)

        live_samp = live_samp.drop(drop_cols, axis=1)            

        one_hot_enc = pd.concat(cat_dummies, axis=1)

        model_samp = pd.concat([live_samp, one_hot_enc], axis=1)
        feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
        
        pared_cache[h] = (model_samp, feat_cols)
    else:
        model_samp, feat_cols = naive_cache[h]        

    train, test = train_test_split(model_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)

    train_score = model.score(X, Y)
    Y_test = test['AVG(AvgTone)']
    X_test = test[feat_cols]
    test_score = model.score(X_test, Y_test)
    
    return train_score, test_score, model, model_samp, train, test

In [30]:
def train_naive_URARE(sample, model, cut_off=.005):
    live_samp = sample.copy()
    
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if hasattr(live_samp[column], 'cat'):
            unify_rare_cats(live_samp, column, cut_off)
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return train_score, test_score, model, model_samp, train, test

In [31]:
def train_pared_URARE(sample, model, cut_off=.005):
    live_samp = sample.copy()
    live_samp = pare(live_samp)
    
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if hasattr(live_samp[column], 'cat'):
            unify_rare_cats(live_samp, column, cut_off)
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return train_score, test_score, model, model_samp, train, test

In [32]:
def train_naive_UWEAK(sample, model, pvalue=0.0001, mag=1, count_penalty=10):
    live_samp = sample.copy()
    
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if hasattr(live_samp[column], 'cat'):
            unify_weak_cats(live_samp, column, pvalue, mag, count_penalty)
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return train_score, test_score, model, model_samp, train, test

In [33]:
def train_pared_UWEAK(sample, model, pvalue=0.0001, mag=1, count_penalty=10):
    live_samp = sample.copy()
    live_samp = pare(live_samp)
        
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if hasattr(live_samp[column], 'cat'):
            unify_weak_cats(live_samp, column, pvalue, mag, count_penalty)
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return train_score, test_score, model, model_samp, train, test

In [34]:
# can be used with pared and naive
def ICM_INT_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample['InternalEvent?'] = (cntry_sample['Actor2CountryCode'] == country).astype(int)
    cntry_sample = cntry_sample.drop(['Actor1CountryCode', 'Actor2CountryCode'], axis=1)
    return cntry_sample

In [35]:
# can be used with pared and naive
def ICM_SPEC_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample = cntry_sample.drop(['Actor1CountryCode'], axis=1)
    return cntry_sample    

In [36]:
# can only be used with naive
def ICM_INT_GINT_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample['InternalEvent?'] = (cntry_sample['Actor2CountryCode'] != country).astype(int)
    cntry_sample['Actor1AtHome?'] = (cntry_sample['Actor1Geo_CountryCode'] == country).astype(int)
    cntry_sample['Actor2AtActor1Home?'] = (cntry_sample['Actor2Geo_CountryCode'] == country).astype(int)
    cntry_sample = cntry_sample.drop(['Actor1CountryCode', 'Actor2CountryCode',
                                     'Actor1Geo_CountryCode', 'Actor2Geo_CountryCode']
                                     , axis=1)
    return cntry_sample    

In [37]:
# can only be used with naive
def ICM_INT_GSPEC_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample['InternalEvent?'] = (cntry_sample['Actor2CountryCode'] != country).astype(int)
    cntry_sample['Actor2AtActor1Home?'] = (cntry_sample['Actor2Geo_CountryCode'] == country).astype(int)
    cntry_sample = cntry_sample.drop(['Actor1CountryCode', 'Actor2CountryCode', 
                                      'Actor2Geo_CountryCode']
                                     , axis=1)
    return cntry_sample    

In [38]:
# can only be used with naive
def ICM_FULL_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample = cntry_sample.drop(['Actor1CountryCode'], axis=1)
    return cntry_sample    

In [39]:
# can only be used with naive
def ICM_PERSPEC_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample['Actor1AtHome?'] = (cntry_sample['Actor1Geo_CountryCode'] == country).astype(int)
    cntry_sample = cntry_sample.drop(['Actor1CountryCode', 'Actor1Geo_CountryCode']
                                     , axis=1)
    return cntry_sample

In [185]:
test_samp = init_sample(.1)
naive_samps = [#test_samp, 
               ICM_INT_sample(test_samp, 'USA'), ICM_SPEC_sample(test_samp, 'USA'),
              ICM_INT_GINT_sample(test_samp, 'USA'), ICM_INT_GSPEC_sample(test_samp, 'USA'),
              ICM_FULL_sample(test_samp, 'USA'), ICM_PERSPEC_sample(test_samp, 'USA')]
pared_samps = [#test_samp, 
    ICM_INT_sample(test_samp, 'USA'), ICM_SPEC_sample(test_samp, 'USA')]

In [111]:
for samp in naive_samps:
    print(samp.columns)

Index(['Actor1CountryCode', 'Actor2CountryCode', 'Actor1Geo_CountryCode',
       'Actor2Geo_CountryCode', 'EventRootCode', 'AVG(AvgTone)',
       'norm_NumMentions'],
      dtype='object')
Index(['Actor1Geo_CountryCode', 'Actor2Geo_CountryCode', 'EventRootCode',
       'AVG(AvgTone)', 'norm_NumMentions', 'InternalEvent?'],
      dtype='object')
Index(['Actor2CountryCode', 'Actor1Geo_CountryCode', 'Actor2Geo_CountryCode',
       'EventRootCode', 'AVG(AvgTone)', 'norm_NumMentions'],
      dtype='object')
Index(['EventRootCode', 'AVG(AvgTone)', 'norm_NumMentions', 'InternalEvent?',
       'Actor1AtHome?', 'Actor2AtActor1Home?'],
      dtype='object')
Index(['Actor1Geo_CountryCode', 'EventRootCode', 'AVG(AvgTone)',
       'norm_NumMentions', 'InternalEvent?', 'Actor2AtActor1Home?'],
      dtype='object')
Index(['Actor2CountryCode', 'Actor1Geo_CountryCode', 'Actor2Geo_CountryCode',
       'EventRootCode', 'AVG(AvgTone)', 'norm_NumMentions'],
      dtype='object')
Index(['Actor2CountryCode',

In [86]:
good1, good1_samp = None, None
regr = linear_model.LinearRegression()
for idx, samp in enumerate(naive_samps):
    regr = linear_model.LinearRegression()
    trs, tes, model, model_samp, _, _ = train_naive(samp, regr)
    print(idx, trs, tes)
    if idx == 3:
        good1, good1_samp = model, model_samp

0 0.22298525152863793 -10229902.521455964
1 0.2407581118726072 -252.84449704883204
2 0.1440042942614571 0.14268231685292976
3 0.1982143436425331 0.18045793529939902
4 0.2407581118726072 -252.84449704883204
5 0.21966379353406207 -393.6119756498297


In [88]:
good2, good2_samp = None, None
regr = linear_model.LinearRegression()
for idx, samp in enumerate(pared_samps):
    regr = linear_model.LinearRegression()
    trs, tes, model, model_samp, _, _ = train_pared(samp, regr)
    print(idx, trs, tes)
    if idx == 1:
        good2, good2_samp = model, model_samp

0 0.22298525152863793 -10229902.521455964
1 0.2407581118726072 -252.84449704883204


In [90]:
good1, good1_samp = None, None
dt = None
for idx, samp in enumerate(naive_samps):
    dt = tree.DecisionTreeRegressor(max_depth=10)
    trs, tes, model, model_samp, _, _ = train_naive(samp, dt)
    print(idx, trs, tes)
    if idx == 3:
        good1, good1_samp = model, model_samp

0 0.1783477185130503 0.13744424213990292
1 0.18150919714717773 0.1321105668682423
2 0.18892143167276432 0.09051210465680748
3 0.174488908638749 0.13433565886537624
4 0.18150919714717773 0.13322213520117399
5 0.17852878272762285 0.13029809613095888


In [91]:
good2, good2_samp = None, None
dt = None
for idx, samp in enumerate(pared_samps):
    dt = tree.DecisionTreeRegressor(max_depth=10)
    trs, tes, model, model_samp, _, _ = train_pared(samp, dt)
    print(idx, trs, tes)
    if idx == 1:
        good2, good2_samp = model, model_samp

0 0.1783477185130501 0.13692748295410295
1 0.18150919714717773 0.13035444294080967


In [51]:
good1, good1_samp = None, None
sv = svm.SVR()
for idx, samp in enumerate(naive_samps):
    sv = svm.SVR(max_iter=100, verbose=True)
    trs, tes, model, model_samp, _, _ = train_naive(samp, sv)
    print(idx, trs, tes)
    if idx == 3:
        good1, good1_samp = model, model_samp

[LibSVM]



0 -1.0737730560495269 -1.0829197912427508
[LibSVM]1 -0.4316803077223681 -0.4525066395461548


KeyboardInterrupt: 

In [50]:
good2, good2_samp = None, None
sv = svm.SVR()
for idx, samp in enumerate(pared_samps):
    sv = svm.SVR(max_iter=100, verbose=True)
    trs, tes, model, model_samp, _, _ = train_pared(samp, sv)
    print(idx, trs, tes)
    if idx == 1:
        good2, good2_samp = model, model_samp

[LibSVM]



0 -0.8138925795679746 -0.8207373363354736
[LibSVM]1 -0.9861201741513579 -1.014792952210204
[LibSVM]2 -1.0181769044433375 -1.0469360329805206


In [95]:
good1, good1_samp = None, None
rfc = None
for idx, samp in enumerate(naive_samps):
    rfc = RandomForestRegressor(max_depth=7, verbose=True)
    trs, tes, model, model_samp, _, _ = train_naive(samp, rfc)
    print(idx, trs, tes)
    if idx == 3:
        good1, good1_samp = model, model_samp

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


0 0.15252194930074403 0.14014698136623127


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


1 0.15372222301813332 0.13649468698567824


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


2 0.15232871866673103 0.12718204489079854


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


3 0.15214433079368583 0.13767346055927077


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


4 0.15458167916781818 0.13656856934667994
5 0.1534034299970254 0.13564947434752417


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [96]:
good2, good2_samp = None, None
rfc = None
for idx, samp in enumerate(pared_samps):
    rfc = RandomForestRegressor(max_depth=7, verbose=True)
    trs, tes, model, model_samp, _, _ = train_pared(samp, rfc)
    print(idx, trs, tes)
    if idx == 1:
        good2, good2_samp = model, model_samp

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


0 0.15283755851648906 0.13396835274818775


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.8s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


1 0.15337350943421746 0.13681772654912472


In [113]:
good1, good1_samp = None, None
gbr = None
for idx, samp in enumerate(naive_samps):
    gbr = GradientBoostingRegressor()
    trs, tes, model, model_samp, _, _ = train_naive(samp, gbr)
    print(idx, trs, tes)

0 0.18775209134502968 0.16907738989219456
1 0.19098335477552275 0.17125714024711935
2 0.14899043076301066 0.13957579670128462
3 0.17873784549162963 0.1619455108127853
4 0.19098335477552264 0.17124771660439264
5 0.18633844748041128 0.16573886486655176


In [114]:
good2, good2_samp = None, None
gbr = None
for idx, samp in enumerate(pared_samps):
    gbr = GradientBoostingRegressor()
    trs, tes, model, model_samp, _, _ = train_pared(samp, gbr)
    print(idx, trs, tes)

0 0.14899043076301066 0.13957579670128462
1 0.17349904736707877 0.15816923373293024


In [120]:
model_samp.columns

Index(['AVG(AvgTone)', 'norm_NumMentions', 'Actor2CountryCode_ABW',
       'Actor2CountryCode_AFG', 'Actor2CountryCode_AFR',
       'Actor2CountryCode_AGO', 'Actor2CountryCode_AIA',
       'Actor2CountryCode_ALB', 'Actor2CountryCode_ARE',
       'Actor2CountryCode_ARG',
       ...
       'EventRootCode_12', 'EventRootCode_13', 'EventRootCode_14',
       'EventRootCode_15', 'EventRootCode_16', 'EventRootCode_17',
       'EventRootCode_18', 'EventRootCode_19', 'EventRootCode_20',
       'EventRootCode_--'],
      dtype='object', length=243)

In [123]:
test_samp['Actor1CountryCode'].value_counts()

USA    38097
GBR    13437
CHN    11355
RUS    11321
EUR     7938
FRA     7930
DEU     7368
ISR     6470
SAU     6189
AUS     6095
IRN     5759
TUR     5737
CAN     5630
SYR     5051
AFR     5002
JPN     4606
PAK     4363
QAT     4127
ITA     4102
EGY     3709
PRK     3543
IND     3304
KOR     3283
PHL     3172
IRQ     3166
AFG     3068
ESP     2860
NGA     2819
PSE     2793
IRL     2721
       ...  
SAF       54
GNB       48
NRU       47
CAS       45
DMA       41
KIR       40
LIE       39
SUR       38
PLW       35
FSM       34
ABW       32
COM       32
CPV       32
CRB       28
MAC       24
TUV       22
AIA       22
EAF       20
STP       15
GEO       13
SMR        8
SHN        6
AND        2
SAM        1
WLF        1
ROM        1
SCN        1
CAU        1
LAM        0
PRI        0
Name: Actor1CountryCode, Length: 221, dtype: int64

In [122]:
test_samp['Actor1CountryCode'].value_counts().index

CategoricalIndex(['USA', 'GBR', 'CHN', 'RUS', 'EUR', 'FRA', 'DEU', 'ISR',
                  'SAU', 'AUS',
                  ...
                  'SMR', 'SHN', 'AND', 'SAM', 'WLF', 'ROM', 'SCN', 'CAU',
                  'LAM', 'PRI'],
                 categories=['ABW', 'AFG', 'AFR', 'AGO', 'AIA', 'ALB', 'ARE', 'ARG', ...], ordered=False, dtype='category', length=221)

In [193]:
countries = list(test_samp['Actor1CountryCode'].value_counts().index)
small_list = countries[:5]
ICM_INT_samps = []
ICM_SPEC_samps = []

for country in small_list:
    ICM_INT_samps.append(ICM_INT_sample(test_samp, country))
    ICM_SPEC_samps.append(ICM_SPEC_sample(test_samp, country))

In [198]:
test_samp["Actor1Type1Code"].unique()

[NaN, JUD, GOV, CVL, MIL, ..., UIS, INT, SET, DEV, NGM]
Length: 34
Categories (33, object): [JUD, GOV, CVL, MIL, ..., INT, SET, DEV, NGM]

In [194]:
countries

['USA',
 'GBR',
 'RUS',
 'CHN',
 'FRA',
 'ISR',
 'DEU',
 'AUS',
 'CAN',
 'EUR',
 'TUR',
 'SAU',
 'PAK',
 'IRN',
 'IND',
 'SYR',
 'NGA',
 'AFR',
 'JPN',
 'PHL',
 'ITA',
 'QAT',
 'AFG',
 'PRK',
 'EGY',
 'IRQ',
 'KOR',
 'PSE',
 'IRL',
 'ESP',
 'MEX',
 'MYS',
 'UKR',
 'JOR',
 'IDN',
 'ARE',
 'BEL',
 'POL',
 'NZL',
 'GRC',
 'VNM',
 'KEN',
 'NLD',
 'ZAF',
 'CHE',
 'GHA',
 'THA',
 'BGD',
 'BRA',
 'LBN',
 'VEN',
 'SGP',
 'LBY',
 'LKA',
 'SWE',
 'UGA',
 'AZE',
 'SDN',
 'BHR',
 'YEM',
 'CUB',
 'WST',
 'ZWE',
 'SOM',
 'NOR',
 'ARM',
 'COL',
 'TWN',
 'SEA',
 'DNK',
 'MMR',
 'PRT',
 'AUT',
 'HUN',
 'MDV',
 'VAT',
 'KWT',
 'KHM',
 'BLR',
 'ETH',
 'JAM',
 'SSD',
 'KAZ',
 'TZA',
 'CZE',
 'NPL',
 'FIN',
 'MAR',
 'PAN',
 'ARG',
 'PER',
 'ZMB',
 'CYP',
 'BGR',
 'CHL',
 'NMR',
 'TUN',
 'FJI',
 'OMN',
 'MCO',
 'RWA',
 'EST',
 'MLT',
 'LBR',
 'LTU',
 'BHS',
 'DZA',
 'ECU',
 'TTO',
 'MWI',
 'MLI',
 'GUY',
 'TCD',
 'COD',
 'BLZ',
 'HTI',
 'LUX',
 'LVA',
 'HRV',
 'UZB',
 'NAM',
 'NER',
 'SRB',
 'BWA',
 'KGZ',


In [213]:
good1, good1_samp = None, None
gbr = None
model_samp = None
for idx, samp in enumerate(ICM_INT_samps):
    gbr = GradientBoostingRegressor()
    trs, tes, model, model_samp, _, _ = train_naive(samp, gbr)
    print(idx, trs, tes)

0 0.24333228325996237 0.2262816802125384
1 0.24965027613334156 0.23185156875762714
2 0.17856855337558952 0.15920843211098767
3 0.21444089481469716 0.18064178662407737
4 0.21027126052482337 0.1438042799111917


In [214]:
list(model_samp.columns)

['AVG(AvgTone)',
 'norm_NumMentions',
 'InternalEvent?',
 'Actor1Type1Code_AGR',
 'Actor1Type1Code_BUS',
 'Actor1Type1Code_COP',
 'Actor1Type1Code_CRM',
 'Actor1Type1Code_CVL',
 'Actor1Type1Code_EDU',
 'Actor1Type1Code_ELI',
 'Actor1Type1Code_ENV',
 'Actor1Type1Code_GOV',
 'Actor1Type1Code_HLH',
 'Actor1Type1Code_HRI',
 'Actor1Type1Code_IGO',
 'Actor1Type1Code_IMG',
 'Actor1Type1Code_INS',
 'Actor1Type1Code_INT',
 'Actor1Type1Code_JUD',
 'Actor1Type1Code_LAB',
 'Actor1Type1Code_LEG',
 'Actor1Type1Code_MED',
 'Actor1Type1Code_MIL',
 'Actor1Type1Code_MNC',
 'Actor1Type1Code_NGM',
 'Actor1Type1Code_NGO',
 'Actor1Type1Code_OPP',
 'Actor1Type1Code_RAD',
 'Actor1Type1Code_REB',
 'Actor1Type1Code_REF',
 'Actor1Type1Code_SEP',
 'Actor1Type1Code_SET',
 'Actor1Type1Code_SPY',
 'Actor1Type1Code_UAF',
 'Actor1Type1Code_UIS',
 'Actor1Type1Code_DEV',
 'Actor1Type1Code_UNK',
 'Actor2Type1Code_AGR',
 'Actor2Type1Code_BUS',
 'Actor2Type1Code_COP',
 'Actor2Type1Code_CRM',
 'Actor2Type1Code_CVL',
 'Actor

In [196]:
good1, good1_samp = None, None
gbr = None
for idx, samp in enumerate(ICM_SPEC_samps):
    gbr = GradientBoostingRegressor()
    trs, tes, model, model_samp, _, _ = train_naive(samp, gbr)
    print(idx, trs, tes)

0 0.24500311564020785 0.22657179470550767
1 0.2529619848337147 0.23385525482250447
2 0.1857721200897093 0.16578582041024548
3 0.22260619485121225 0.18980500556437396
4 0.21680215723307228 0.1446967095226489


In [215]:
def country_info(sample, column, pvalue, mag, count_penalty):
    tones = sample['AVG(AvgTone)']
    cntry = sample[column]

    one_hot = pd.get_dummies(cntry)
    one_hot_tone = pd.concat([tones, one_hot], axis=1)

    avg_avgtone_mean = one_hot_tone['AVG(AvgTone)'].mean()

    country_info = []
    for column in one_hot.columns:
        temp = one_hot_tone[[column, 'AVG(AvgTone)']]
        country = temp[temp[column] == 1]
        columns_mean = None
        if len(country) < count_penalty: 
            column_means = [0, 0]
            country_info.append((column, 
                                 0, 
                                 0, 
                                 (temp[column].sum()),
                                 1))
        else:
            column_means = temp.groupby(column).mean()['AVG(AvgTone)']
            country_info.append((column, 
                                 column_means[0] - column_means[1], 
                                 np.absolute(column_means[0] - column_means[1]), 
                                 (temp[column].sum()),
                                 ttest_1samp(country, avg_avgtone_mean).pvalue[1]))

    cntry_spec = pd.DataFrame(country_info, columns=["Country", "AvgTone_diff", "AvgTone_mag", "Num", "p-value"])
    
    return cntry_spec

In [216]:
cntry_info = country_info(test_samp, 'Actor1CountryCode', .001, 1, 1000)

In [217]:
cntry_info.sort_values(by="AvgTone_mag", ascending=False)

Unnamed: 0,Country,AvgTone_diff,AvgTone_mag,Num,p-value
198,VEN,2.895562,2.895562,1886,4.489826e-202
170,SSD,2.195552,2.195552,1068,1.624059e-72
63,FIN,-2.026532,2.026532,1008,2.936085e-71
199,VNM,-1.884847,1.884847,2636,1.339151e-136
204,YEM,1.811866,1.811866,1631,1.144110e-83
138,NZL,-1.807352,1.807352,2650,2.530839e-154
104,LBY,1.783420,1.783420,1831,1.618481e-123
21,BLR,-1.779256,1.779256,1109,5.773632e-60
151,PSE,1.772175,1.772175,4322,1.063865e-198
163,SGP,-1.771658,1.771658,1870,3.116748e-94


In [218]:
high_mag = cntry_info.sort_values(by="AvgTone_mag", ascending=False)['Country'].iloc[:10].values

In [219]:
ICM_INT_samps = []
ICM_SPEC_samps = []

for country in high_mag:
    ICM_INT_samps.append(ICM_INT_sample(test_samp, country))
    ICM_SPEC_samps.append(ICM_SPEC_sample(test_samp, country))

In [220]:
good1, good1_samp = None, None
gbr = None
for idx, samp in enumerate(ICM_INT_samps):
    gbr = GradientBoostingRegressor()
    trs, tes, model, model_samp, _, _ = train_naive(samp, gbr)
    print(idx, trs, tes)

0 0.38755695126036005 0.19258362032559118
1 0.35593512198977695 0.07819305199186188
2 0.3886114118911289 0.16646019635859388
3 0.3489402000564874 0.28924872953878444
4 0.3460912158167547 0.10566933784584799
5 0.3022041056302651 0.07750937949119319
6 0.2742283496523297 0.13862907501102684
7 0.4341013268499114 0.17203490598634838
8 0.3445674903014083 0.26128010222859055
9 0.3488075650757112 0.1425286999661225


In [182]:
np.sort(model.feature_importances_)

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [183]:
good1, good1_samp = None, None
gbr = None
for idx, samp in enumerate(ICM_SPEC_samps):
    gbr = GradientBoostingRegressor()
    trs, tes, model, model_samp, _, _ = train_naive(samp, gbr)
    print(idx, trs, tes)

0 0.39212680046226167 0.11400759684024053
1 0.3370976096876699 0.07571608692342457
2 0.3481733162458482 0.14329152666508904
3 0.19959829018828878 0.07966490031154527
4 0.3674671210299214 0.2282376459634341
5 0.38584929886402664 0.15874273199721145
6 0.44001384582061615 0.2575568333114048
7 0.3814017455169927 0.2114692143538751
8 0.38897428095868425 0.15943424925519145
9 0.3288651711964845 0.11736104563768956


naive - Actor1CC, Actor2CC, Actor1GCC, Actor2GCC, EventRootCode, NumMentions
pared - Actor1CC, Actor2CC, EventRootCode, NumMentions

w/o infrequent columns naive 
w/o infrequent columns pared

w/o low effect countries naive 
w/o low effect countries pared

pared ICM-INT - individualized country models for Actor1CC - Actor2CC international or not
pared ICM-SPEC - individualized country models for Actor1CC - Actor2CC maintains value

naive ICM-INT/ICM-SPEC - as before but with Actor1GCC and Actor2GCC unchanged
naive ICM-INT-INT - both Actor1GCC/Actor2GCC code internal or international
naive ICM-INT-SPEC - Actor1GCC maintains location of Actor1, Actor2GCC codes international

Actor2AtActor1Home vs Actor2AtHome

quantile buckets countries naive
quantile buckets countries pared