Using a dataset of your choice, select an outcome variable and then pick four or five other variables (one to two categorical, three to four continuous) to act as the basis for features. Explore the variables using the univariate and bivariate methods you've learned so far. 

Next, based on what you learned via your data exploration, create ten new features. Explain the reasoning behind each one.

Finally, use filtering methods to select the five best features and justify your choices.

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.stats import ttest_1samp
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
import csv, sqlite3
%matplotlib inline

from GDELT_utils import GDELT_columns, usecols, dtype_dict, \
                        cameo_dict, map_cameo_to_text, \
                        state_dict, mem_usage, state_heat_map

In [2]:
con = sqlite3.connect("gdelt.db")

In [3]:
gdelt = pd.read_sql_query("SELECT SQLDATE, Actor1CountryCode, Actor2CountryCode, \
                            Actor1Geo_CountryCode, Actor2Geo_CountryCode, \
                            EventRootCode, AVG(NumMentions), AVG(AvgTone) \
                          FROM gdelt \
                          WHERE SQLDATE > \"2017-05-01\" AND SQLDATE < \"2017-08-01\" \
                          GROUP BY SQLDATE, Actor1CountryCode, Actor2CountryCode, \
                          Actor1Geo_CountryCode, Actor2Geo_CountryCode, EventRootCode", con)
# aggregate data by
# Date Actor1Code Actor2Code Actor1Geo_CountryCode Actor2Geo_CountryCode EventRootCode -> Mean AvgTone

# Features Actor1Code Actor2Code Actor1Geo_CountryCode Actor2Geo_CountryCode NumMentions EventRootCode 

In [4]:
interest = ["SQLDATE",
            "AvgTone", "NumMentions", "EventRootCode",
            "Actor1CountryCode", "Actor2CountryCode",
            "Actor1Geo_CountryCode", "Actor2Geo_CountryCode"]

categories = ["EventRootCode",
            "Actor1CountryCode", "Actor2CountryCode",
            "Actor1Geo_CountryCode", "Actor2Geo_CountryCode"]

for category_col in categories:
    gdelt[category_col] = gdelt[category_col].astype('category')
    
gdelt['SQLDATE'] = pd.to_datetime(gdelt['SQLDATE'])  

In [8]:
gdelt = gdelt.drop(labels=gdelt[(gdelt['EventRootCode'] == "--")].index)

In [63]:
gdelt_sample = gdelt.sample(frac=.05)

In [10]:
gdelt_tones = gdelt_sample['AVG(AvgTone)']

In [18]:
gdelt_tones = gdelt_sample['AVG(AvgTone)']
def dim_reduce(column):
    gdelt_cntry = gdelt_sample[column]

    one_hot = pd.get_dummies(gdelt_cntry)
    
    one_hot_tone = pd.concat([gdelt_tones, one_hot], axis=1)

    avg_avgtone_mean = one_hot_tone['AVG(AvgTone)'].mean()
    avg_avgtone_std = one_hot_tone['AVG(AvgTone)'].std()

    country_tones = []
    country_info = []
    for column in one_hot.columns:
        temp = one_hot_tone[[column, 'AVG(AvgTone)']]
        country = temp[temp[column] == 1]
        if len(country) < 10:
            #print(country)
            continue
        column_means = temp.groupby(column).mean()['AVG(AvgTone)']
        country_tones.append(column_means)
        country_info.append((column, column_means[0] - column_means[1], np.absolute(column_means[0] - column_means[1]), 
                            (temp[column].sum()), ttest_1samp(country, avg_avgtone_mean).pvalue[1]))

    cntry_spec = pd.DataFrame(country_info, columns=["Country", "AvgTone_diff", "AvgTone_mag", "Num", "p-value"])

    low_decs = cntry_spec[((cntry_spec['p-value'] > 0.0001) & ((cntry_spec['AvgTone_mag'] < 1) & \
               (cntry_spec['AvgTone_diff'] < 0)))]['Country']

    low_incs = cntry_spec[((cntry_spec['p-value'] > 0.0001) & ((cntry_spec['AvgTone_mag'] < 1) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']

    low_p_value = cntry_spec[((cntry_spec['p-value'] > 0.0001) & ~((cntry_spec['AvgTone_mag'] < 1) & \
               (cntry_spec['AvgTone_diff'] < 0)) & ~((cntry_spec['AvgTone_mag'] < 1) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']
    
    return low_decs, low_incs, low_p_value

In [24]:
low_decs_A1CC, low_incs_A1CC, low_p_value_A1CC = dim_reduce('Actor1CountryCode')
low_decs_A2CC, low_incs_A2CC, low_p_value_A2CC = dim_reduce('Actor2CountryCode')
low_decs_A1GCC, low_incs_A1GCC, low_p_value_A1GCC = dim_reduce('Actor1Geo_CountryCode')
low_decs_A2GCC, low_incs_A2GCC, low_p_value_A2GCC = dim_reduce('Actor2Geo_CountryCode')

In [44]:
model_df = gdelt_sample.copy()

In [45]:
def map_missing(x, low_decs, low_incs, low_p_value, translation):
    #if x in l:
    #    return x
    if x == "nan":
        return "UNKNOWN"
    elif x in low_decs:
        return "LOW_DEC"
    elif x in low_incs:
        return "LOW_INCS"
    elif x in low_p_value:
        return "LOW_P_VALUE"
    else:
        return x

def unify_cats(data, category, low_decs, low_incs, low_p_value):
    low_decs_unique = low_decs.unique()
    low_decs_unique = low_incs.unique()
    low_p_value_unique = low_p_value.unique()
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_missing(x, low_decs_unique, 
                                                                             low_decs_unique, 
                                                                             low_p_value_unique, 
                                                                             "OTHER")) \
                                                .astype('category')

In [46]:
unify_cats(model_df, 'Actor1CountryCode', low_decs_A1CC, low_incs_A1CC, low_p_value_A1CC)
unify_cats(model_df, 'Actor2CountryCode', low_decs_A2CC, low_incs_A2CC, low_p_value_A2CC)
unify_cats(model_df, 'Actor1Geo_CountryCode', low_decs_A1GCC, low_incs_A1GCC, low_p_value_A1GCC)
unify_cats(model_df, 'Actor2Geo_CountryCode', low_decs_A2GCC, low_incs_A2GCC, low_p_value_A2GCC)

In [48]:
model_df['norm_NumMentions'] = (model_df['AVG(NumMentions)'] - model_df['AVG(NumMentions)'].mean())/ \
                                        model_df['AVG(NumMentions)'].std()

In [61]:
#model_df = model_df.drop(['Actor1CountryCode', 'Actor2CountryCode', 'Actor1Geo_CountryCode', 
#               'Actor2Geo_CountryCode', 'AVG(NumMentions)', 'SQLDATE'], axis=1)

Actor1CC_one_hot = pd.get_dummies(model_df['Actor1CountryCode_unify'], prefix="Actor1CountryCode")
Actor2CC_one_hot = pd.get_dummies(model_df['Actor2CountryCode_unify'], prefix="Actor2CountryCode")
Actor1CCGeo_one_hot = pd.get_dummies(model_df['Actor1Geo_CountryCode_unify'], prefix="Actor1Geo_CountryCode")
Actor2CCGeo_one_hot = pd.get_dummies(model_df['Actor2Geo_CountryCode_unify'], prefix="Actor2Geo_CountryCode")
EventRoot_one_hot = pd.get_dummies(model_df['EventRootCode'], prefix="EventRootCode")

one_hot_encoding = pd.concat([Actor1CC_one_hot, Actor2CC_one_hot, Actor1CCGeo_one_hot, 
                              Actor2CCGeo_one_hot, EventRoot_one_hot], axis=1)

model_df_hot = pd.concat([model_df, one_hot_encoding], axis=1).drop(['Actor1CountryCode_unify',
                                                                    'Actor2CountryCode_unify',
                                                                    'Actor1Geo_CountryCode_unify',
                                                                    'Actor2Geo_CountryCode_unify'], axis=1)

In [66]:
feature_columns = model_df_hot.drop(['AVG(AvgTone)'], axis=1).columns

In [65]:
train, test = train_test_split(model_df_hot, test_size=0.2, random_state=42)

In [68]:
regr = linear_model.LinearRegression()
Y = train['AVG(AvgTone)'].values.reshape(-1, 1)
X = train[feature_columns]
regr.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [69]:
regr.score(X, Y)

0.24057144216501747

In [72]:
from sklearn.linear_model import Ridge
regr = Ridge()
Y = train['AVG(AvgTone)'].values.reshape(-1, 1)
X = train[feature_columns]
regr.fit(X, Y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [73]:
regr.score(X, Y)

0.24050873555027785

In [6]:
def map_missing(x, l, translation):
    if x in l:
        return x
    elif x == "nan":
        return "UNKNOWN"
    else:
        return translation

def unify_rare_cats(data, category, cut_off):
    vc = data[category].value_counts()
    past_cut_off = (vc/len(data)) > cut_off
    remaining = list(vc[past_cut_off].index)
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_missing(x, remaining, "OTHER")) \
                                                .astype('category')

In [64]:
gdelt_sample_rem = gdelt_sample.drop(['Actor1Geo_CountryCode', 'Actor2Geo_CountryCode', 'SQLDATE'], axis=1)

In [66]:
unify_rare_cats(gdelt_sample, 'Actor1CountryCode', .005)
unify_rare_cats(gdelt_sample, 'Actor2CountryCode', .005)
unify_rare_cats(gdelt_sample, 'Actor1Geo_CountryCode', .005)
unify_rare_cats(gdelt_sample, 'Actor2Geo_CountryCode', .005)

unify_rare_cats(gdelt_sample_rem, 'Actor1CountryCode', .005)
unify_rare_cats(gdelt_sample_rem, 'Actor2CountryCode', .005)

In [67]:
gdelt_sample['norm_NumMentions'] = (gdelt_sample['AVG(NumMentions)'] - gdelt_sample['AVG(NumMentions)'].mean())/ \
                                        gdelt_sample['AVG(NumMentions)'].std()
    
gdelt_sample_rem['norm_NumMentions'] = (gdelt_sample_rem['AVG(NumMentions)'] - gdelt_sample_rem['AVG(NumMentions)'].mean())/ \
                                        gdelt_sample_rem['AVG(NumMentions)'].std()

In [68]:
Actor1CC_one_hot = pd.get_dummies(gdelt_sample['Actor1CountryCode_unify'], prefix="Actor1CountryCode")
Actor2CC_one_hot = pd.get_dummies(gdelt_sample['Actor2CountryCode_unify'], prefix="Actor2CountryCode")
Actor1CCGeo_one_hot = pd.get_dummies(gdelt_sample['Actor1Geo_CountryCode_unify'], prefix="Actor1Geo_CountryCode")
Actor2CCGeo_one_hot = pd.get_dummies(gdelt_sample['Actor2Geo_CountryCode_unify'], prefix="Actor2Geo_CountryCode")
EventRoot_one_hot = pd.get_dummies(gdelt_sample['EventRootCode'], prefix="EventRootCode")

one_hot_encoding = pd.concat([Actor1CC_one_hot, Actor2CC_one_hot, Actor1CCGeo_one_hot, 
                              Actor2CCGeo_one_hot, EventRoot_one_hot], axis=1)

gdelt_sample = pd.concat([gdelt_sample, one_hot_encoding], axis=1)

Actor1CC_one_hot_rem = pd.get_dummies(gdelt_sample_rem['Actor1CountryCode_unify'], prefix="Actor1CountryCode")
Actor2CC_one_hot_rem = pd.get_dummies(gdelt_sample_rem['Actor2CountryCode_unify'], prefix="Actor2CountryCode")
EventRoot_one_hot_rem = pd.get_dummies(gdelt_sample_rem['EventRootCode'], prefix="EventRootCode")

one_hot_encoding_rem = pd.concat([Actor1CC_one_hot_rem, Actor2CC_one_hot_rem, EventRoot_one_hot_rem], axis=1)

gdelt_sample_rem = pd.concat([gdelt_sample_rem, one_hot_encoding_rem], axis=1)

In [56]:
gdelt_sample.shape

(218524, 202)

In [57]:
gdelt_sample_rem.shape

(218524, 103)

In [72]:
model_columns = list(one_hot_encoding.columns)
model_columns.append('norm_NumMentions')
feature_columns = model_columns.copy()
model_columns.append('AVG(AvgTone)')

model_columns_rem = list(one_hot_encoding_rem.columns)
model_columns_rem.append('norm_NumMentions')
feature_columns_rem = model_columns_rem.copy()
model_columns_rem.append('AVG(AvgTone)')

In [74]:
gdelt_sample_m = gdelt_sample[model_columns].copy()

gdelt_sample_rem_m = gdelt_sample_rem[model_columns_rem].copy()

In [75]:
train, test = train_test_split(gdelt_sample_m, test_size=0.25, random_state=42)

train_rem, test_rem = train_test_split(gdelt_sample_rem_m, test_size=0.25, random_state=42)

In [76]:
train.shape

(163893, 186)

In [77]:
train_rem.shape

(163893, 96)

In [78]:
regr = linear_model.LinearRegression()
Y = train['AVG(AvgTone)']
X = train[feature_columns]
regr.fit(X, Y)

print('\nR-squared:')
print(regr.score(X, Y))


R-squared:
0.21195525887786848


In [80]:
regr = linear_model.LinearRegression()
Y = train_rem['AVG(AvgTone)']
X = train_rem[feature_columns_rem]
regr.fit(X, Y)

print('\nR-squared:')
print(regr.score(X, Y))


R-squared:
0.19612340075559853


In [82]:
gbr = GradientBoostingRegressor(max_depth=2, verbose=True)
Y = train['AVG(AvgTone)']
X = train[feature_columns]
gbr.fit(X, Y)

print('\nR-squared:')
print(gbr.score(X, Y))

      Iter       Train Loss   Remaining Time 
         1          11.9990           20.34s
         2          11.8767           20.96s
         3          11.7423           21.06s
         4          11.6426           20.51s
         5          11.5372           20.56s
         6          11.4539           20.20s
         7          11.3692           20.15s
         8          11.2915           20.01s
         9          11.2241           19.67s
        10          11.1566           19.59s
        20          10.7166           17.05s
        30          10.4914           14.72s
        40          10.3520           12.51s
        50          10.2515           10.34s
        60          10.1728            8.22s
        70          10.1086            6.14s
        80          10.0549            4.07s
        90          10.0058            2.04s
       100           9.9647            0.00s

R-squared:
0.17984288931184345


In [83]:
gbr = GradientBoostingRegressor(max_depth=2, verbose=True)
Y = train_rem['AVG(AvgTone)']
X = train_rem[feature_columns_rem]
gbr.fit(X, Y)

print('\nR-squared:')
print(gbr.score(X, Y))

      Iter       Train Loss   Remaining Time 
         1          11.9990           11.26s
         2          11.8767           11.19s
         3          11.7423           11.22s
         4          11.6426           10.99s
         5          11.5372           11.07s
         6          11.4539           11.00s
         7          11.3697           10.88s
         8          11.2921           10.84s
         9          11.2248           10.82s
        10          11.1579           10.88s
        20          10.7201            9.95s
        30          10.4975            8.66s
        40          10.3661            7.26s
        50          10.2721            5.96s
        60          10.2009            4.71s
        70          10.1437            3.50s
        80          10.0983            2.31s
        90          10.0602            1.15s
       100          10.0275            0.00s

R-squared:
0.17466966269757112


In [85]:
gbr = GradientBoostingRegressor(max_depth=3, verbose=True, n_estimators=250)
Y = train_rem['AVG(AvgTone)']
X = train_rem[feature_columns_rem]
gbr.fit(X, Y)

print('\nR-squared:')
print(gbr.score(X, Y))

      Iter       Train Loss   Remaining Time 
         1          11.9388           56.12s
         2          11.7679           58.00s
         3          11.5662            1.01m
         4          11.4354           58.12s
         5          11.2817           58.91s
         6          11.1773           57.53s
         7          11.0588           57.94s
         8          10.9625           58.21s
         9          10.8883           57.18s
        10          10.8141           57.32s
        20          10.4229           52.19s
        30          10.2504           49.93s
        40          10.1417           47.97s
        50          10.0616           45.95s
        60          10.0019           45.79s
        70           9.9554           42.34s
        80           9.9176           38.86s
        90           9.8828           35.80s
       100           9.8542           33.00s
       200           9.6835           10.36s

R-squared:
0.20653569805723182


In [89]:
regr = Ridge()
Y = train_rem['AVG(AvgTone)']
X = train_rem[feature_columns_rem]
regr.fit(X, Y)

print('\nR-squared:')
print(regr.score(X, Y))

regr.coef_


R-squared:
0.19626564733644636


array([-0.8515251 ,  0.84879249,  0.42976662,  0.58719902,  0.24300694,
        0.77631464,  0.86328756,  0.11948364, -0.57047295,  0.71796266,
        0.43715595,  0.52781539,  0.27002181, -0.08290007,  0.69548257,
       -0.69176825, -0.75654196, -0.39573903,  0.29765846, -0.43138564,
        0.82775763,  0.51355615,  0.16058804,  0.23049128,  0.17102605,
        0.31995076, -0.36282578, -0.18070882,  0.28522125, -0.77060142,
       -1.0950767 , -0.61206347, -0.25294739, -0.4342424 , -1.14120839,
       -0.29421253, -0.51348242, -0.07056624,  0.18572966, -0.68173321,
        0.9156716 ,  0.67602895,  0.08100082,  0.65746073,  0.77851383,
        0.15983953, -0.51390072,  0.61131216,  0.52273344,  0.60329033,
        0.21400534, -0.01603856,  0.62005944, -0.73601223, -0.55837229,
       -0.09700976,  0.3706473 ,  0.93742817,  0.56769648, -0.0497552 ,
        0.06074978,  0.29155451, -0.49646468, -0.48462042, -0.88252197,
       -0.93761505, -0.61455413, -0.28073163, -0.53419266, -1.13

In [100]:
gdelt_sample.columns.to_series().groupby(gdelt_sample.dtypes)

TypeError: data type not understood

In [103]:
gdelt_sample = gdelt_sample.drop(['SQLDATE'], axis=1)

In [108]:
gdelt_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 218524 entries, 10869 to 2657273
Columns: 196 entries, Actor1CountryCode to EventRootCode_20
dtypes: category(9), float64(3), uint8(184)
memory usage: 47.8 MB


In [112]:
gdelt_sample['Actor1CountryCode'].dtype == 'category'

True

In [115]:
gdelt_sample.columns.drop(['Actor1CountryCode'])

Index(['Actor2CountryCode', 'Actor1Geo_CountryCode', 'Actor2Geo_CountryCode',
       'EventRootCode', 'AVG(NumMentions)', 'AVG(AvgTone)',
       'Actor1CountryCode_unify', 'Actor2CountryCode_unify',
       'Actor1Geo_CountryCode_unify', 'Actor2Geo_CountryCode_unify',
       ...
       'EventRootCode_11', 'EventRootCode_12', 'EventRootCode_13',
       'EventRootCode_14', 'EventRootCode_15', 'EventRootCode_16',
       'EventRootCode_17', 'EventRootCode_18', 'EventRootCode_19',
       'EventRootCode_20'],
      dtype='object', length=195)

In [None]:
# TODO reduce into quantile buckets
def quantile_reduce(sample, column, pvalue, mag, count_penalty):
    tones = sample['AVG(AvgTone)']
    cntry = sample[column]

    one_hot = pd.get_dummies(cntry)
    one_hot_tone = pd.concat([tones, one_hot], axis=1)

    avg_avgtone_mean = one_hot_tone['AVG(AvgTone)'].mean()

    country_info = []
    for column in one_hot.columns:
        temp = one_hot_tone[[column, 'AVG(AvgTone)']]
        country = temp[temp[column] == 1]
        columns_mean = None
        if len(country) < count_penalty: 
            column_means = [0, 0]
            country_info.append((column, 
                                 0, 
                                 0, 
                                 (temp[column].sum()),
                                 1)
        else:
            column_means = temp.groupby(column).mean()['AVG(AvgTone)']
            country_info.append((column, 
                                 column_means[0] - column_means[1], 
                                 np.absolute(column_means[0] - column_means[1]), 
                                 (temp[column].sum()),
                                 ttest_1samp(country, avg_avgtone_mean).pvalue[1]))

    cntry_spec = pd.DataFrame(country_info, columns=["Country", "AvgTone_diff", "AvgTone_mag", "Num", "p-value"])

    low_decs = cntry_spec[((cntry_spec['p-value'] > pvalue) & ((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] < 0)))]['Country']

    low_incs = cntry_spec[((cntry_spec['p-value'] > pvalue) & ((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']

    low_p_value = cntry_spec[((cntry_spec['p-value'] > pvalue) & ~((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] < 0)) & ~((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']
    
    return low_decs, low_incs, low_p_value

def map_quantile(x, low_decs, low_incs, low_p_value, translation):
    if x == "nan":
        return "UNKNOWN"
    elif x in low_decs:
        return "LOW_DEC"
    elif x in low_incs:
        return "LOW_INCS"
    elif x in low_p_value:
        return "LOW_P_VALUE"
    else:
        return x

def unify_quantile_cats(data, category, pvalue, mag, low_decs, low_incs, low_p_value):
    low_decs, low_incs, low_p_value = weak_reduce(data, category, pvalue, mag)
    low_decs_unique = low_decs.unique()
    low_decs_unique = low_incs.unique()
    low_p_value_unique = low_p_value.unique()
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_missing(x, low_decs_unique, 
                                                                             low_decs_unique, 
                                                                             low_p_value_unique, 
                                                                             "OTHER")) \
                                                .astype('category')

In [None]:
def weak_reduce(sample, column, pvalue, mag, count_penalty):
    tones = sample['AVG(AvgTone)']
    cntry = sample[column]

    one_hot = pd.get_dummies(cntry)
    one_hot_tone = pd.concat([tones, one_hot], axis=1)

    avg_avgtone_mean = one_hot_tone['AVG(AvgTone)'].mean()

    country_info = []
    for column in one_hot.columns:
        temp = one_hot_tone[[column, 'AVG(AvgTone)']]
        country = temp[temp[column] == 1]
        columns_mean = None
        if len(country) < count_penalty: 
            column_means = [0, 0]
            country_info.append((column, 
                                 0, 
                                 0, 
                                 (temp[column].sum()),
                                 1)
        else:
            column_means = temp.groupby(column).mean()['AVG(AvgTone)']
            country_info.append((column, 
                                 column_means[0] - column_means[1], 
                                 np.absolute(column_means[0] - column_means[1]), 
                                 (temp[column].sum()),
                                 ttest_1samp(country, avg_avgtone_mean).pvalue[1]))

    cntry_spec = pd.DataFrame(country_info, columns=["Country", "AvgTone_diff", "AvgTone_mag", "Num", "p-value"])

    low_decs = cntry_spec[((cntry_spec['p-value'] > pvalue) & ((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] < 0)))]['Country']

    low_incs = cntry_spec[((cntry_spec['p-value'] > pvalue) & ((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']

    low_p_value = cntry_spec[((cntry_spec['p-value'] > pvalue) & ~((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] < 0)) & ~((cntry_spec['AvgTone_mag'] < mag) & \
               (cntry_spec['AvgTone_diff'] > 0)))]['Country']
    
    return low_decs, low_incs, low_p_value

def map_weak(x, low_decs, low_incs, low_p_value, translation):
    #if x in l:
    #    return x
    if x == "nan":
        return "UNKNOWN"
    elif x in low_decs:
        return "LOW_DEC"
    elif x in low_incs:
        return "LOW_INCS"
    elif x in low_p_value:
        return "LOW_P_VALUE"
    else:
        return x

def unify_weak_cats(data, category, pvalue, mag, count_penalty):
    low_decs, low_incs, low_p_value = weak_reduce(data, category, pvalue, mag)
    low_decs_unique = low_decs.unique()
    low_decs_unique = low_incs.unique()
    low_p_value_unique = low_p_value.unique()
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_missing(x, low_decs_unique, 
                                                                             low_decs_unique, 
                                                                             low_p_value_unique, 
                                                                             "OTHER")) \
                                                .astype('category')

In [None]:
def map_rare(x, l, translation):
    if x in l:
        return x
    elif x == "nan":
        return "UNKNOWN"
    else:
        return translation

def unify_rare_cats(data, category, cut_off):
    vc = data[category].value_counts()
    past_cut_off = (vc/len(data)) > cut_off
    remaining = list(vc[past_cut_off].index)
    data[(category + '_unify')] = data[category].astype(str) \
                                                .apply(lambda x: map_missing(x, remaining, "OTHER")) \
                                                .astype('category')
            
def map_unknown(x):
    if x == 'nan':
        return "UNKNOWN"
    else:
        return x

In [116]:
def init_sample(frac): 
    gdelt_sample = gdelt.sample(frac=frac)
                        .drop(['SQLDATE'], axis=1)
    gdelt_sample['norm_NumMentions'] = (gdelt_sample['AVG(NumMentions)'] \
                                                - gdelt_sample['AVG(NumMentions)'].mean())/ \
                                        gdelt_sample['AVG(NumMentions)'].std()
    gdelt_sample = gdelt_sample.drop(['AVG(NumMentions)'], axis=1) 
    return gdelt_sample.copy()

In [None]:
def pare(sample):
    return sample.drop(['Actor1Geo_CountryCode', 'Actor2Geo_CountryCode'])

In [None]:
def train_naive(sample, model):
    live_samp = sample.copy()
    
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if live_samp[column].dtype == 'category':
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return model, model_samp, train, test

In [None]:
def train_pared(sample, model):
    live_samp = sample.copy()
    live_samp = pare(live_samp)
    
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if live_samp[column].dtype == 'category':
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return model, model_samp, train, test

In [None]:
def train_naive_URARE(sample, model, cut_off=.005):
    live_samp = sample.copy()
    
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if live_samp[column].dtype == 'category':
            unify_rare_cats(live_samp, column, cut_off)
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return model, model_samp, train, test

In [None]:
def train_pared_URARE(sample, model, cut_off=.005):
    live_samp = sample.copy()
    live_samp = pare(live_samp)
    
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if live_samp[column].dtype == 'category':
            unify_rare_cats(live_samp, column, cut_off)
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return model, model_samp, train, test

In [None]:
def train_naive_UWEAK(sample, model, pvalue=0.0001, mag=1, count_penalty=10):
    live_samp = sample.copy()
    
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if live_samp[column].dtype == 'category':
            unify_weak_cats(live_samp, column, pvalue, mag, count_penalty)
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return model, model_samp, train, test

In [None]:
def train_pared_UWEAK(sample, model, pvalue=0.0001, mag=1, count_penalty=10):
    live_samp = sample.copy()
    live_samp = pare(live_samp)
        
    cat_dummies = []
    drop_cols = []
    for column in live_samp.columns:
        if live_samp[column].dtype == 'category':
            unify_weak_cats(live_samp, column, pvalue, mag, count_penalty)
            hot = pd.get_dummies(live_samp[column], prefix=column)
            cat_dummies.append(hot)
            drop_cols.append(column)
            
    live_samp.drop(drop_cols, axis=1)            
    
    one_hot_enc = pd.concat(cat_dummies, axis=1)
    
    model_samp = pd.concat([live_samp, one_hot_enc])
    feat_cols = model_samp.columns.drop(['AVG(AvgTone)'])
    
    train, test = train_test_split(live_samp, test_size=0.25, random_state=42)
    
    Y = train['AVG(AvgTone)']
    X = train[feat_cols]
    model.fit(X, Y)
    
    return model, model_samp, train, test

In [None]:
# can be used with pared and naive
def ICM_INT_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample['InternalEvent?'] = (cntry_sample['Actor2CountryCode'] == country).astype(int)
    cntry_sample = cntry_sample.drop(['Actor1CountryCode', 'Actor2CountryCode'], axis=1)
    return cntry_sample

In [None]:
# can be used with pared and naive
def ICM_SPEC_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample = cntry_sample.drop(['Actor1CountryCode'], axis=1)
    return cntry_sample    

In [None]:
# can only be used with naive
def ICM_INT_GINT_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample['InternalEvent?'] = (cntry_sample['Actor2CountryCode'] != country).astype(int)
    cntry_sample['Actor1AtHome?'] = (cntry_sample['Actor1Geo_CountryCode'] == country).astype(int)
    cntry_sample['Actor2AtActor1Home?'] = (cntry_sample['Actor2Geo_CountryCode'] == country).astype(int)
    cntry_sample = cntry_sample.drop(['Actor1CountryCode', 'Actor2CountryCode',
                                     'Actor1Geo_CountryCode', 'Actor2Geo_CountryCode']
                                     , axis=1)
    return cntry_sample    

In [None]:
# can only be used with naive
def ICM_INT_GSPEC_sample(sample, country):
    cntry_sample = sample[sample['Actor1CountryCode'] == country].copy()
    cntry_sample['InternalEvent?'] = (cntry_sample['Actor2CountryCode'] != country).astype(int)
    cntry_sample['Actor2AtActor1Home?'] = (cntry_sample['Actor2Geo_CountryCode'] == country).astype(int)
    cntry_sample = cntry_sample.drop(['Actor1CountryCode', 'Actor2CountryCode', 
                                      'Actor2Geo_CountryCode']
                                     , axis=1)
    return cntry_sample    

naive - Actor1CC, Actor2CC, Actor1GCC, Actor2GCC, EventRootCode, NumMentions
pared - Actor1CC, Actor2CC, EventRootCode, NumMentions

w/o infrequent columns naive 
w/o infrequent columns pared

w/o low effect countries naive 
w/o low effect countries pared

pared ICM-INT - individualized country models for Actor1CC - Actor2CC international or not
pared ICM-SPEC - individualized country models for Actor1CC - Actor2CC maintains value

naive ICM-INT/ICM-SPEC - as before but with Actor1GCC and Actor2GCC unchanged
naive ICM-INT-INT - both Actor1GCC/Actor2GCC code internal or international
naive ICM-INT-SPEC - Actor1GCC maintains location of Actor1, Actor2GCC codes international

Actor2AtActor1Home vs Actor2AtHome

quantile buckets countries naive
quantile buckets countries pared