In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.stats import ttest_1samp
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn import tree
from sklearn import svm
import csv, sqlite3
%matplotlib inline

import time

from GDELT_utils import GDELT_columns, usecols, dtype_dict, \
                        cameo_dict, map_cameo_to_text, \
                        state_dict, mem_usage, state_heat_map, \
                        init_sample, \
                        ICM_INT_sample, ICM_SPEC_sample, \
                        train_naive, train_pared, \
                        train_pared_URARE, train_pared_UWEAK, \
                        country_info

In [2]:
# Load the aggregated GDELT extract. The code/actor columns are read as pandas
# categoricals, the two AVG(...) aggregates as float64, and SQLDATE is parsed
# into datetimes.
gdelt = pd.read_csv(
    "gdelt_agg_big.csv",
    dtype={
        **dict.fromkeys(
            [
                "EventRootCode",
                "Actor1CountryCode",
                "Actor2CountryCode",
                "Actor1Geo_CountryCode",
                "Actor2Geo_CountryCode",
                "Actor1Type1Code",
                "Actor2Type1Code",
            ],
            "category",
        ),
        "AVG(NumMentions)": "float64",
        "AVG(AvgTone)": "float64",
    },
    parse_dates=["SQLDATE"],
)
# Discard every row whose EventRootCode is the literal "--" placeholder.
gdelt = gdelt.drop(index=gdelt[gdelt["EventRootCode"] == "--"].index)

In [3]:
# Base sample taken from the full frame (second argument 0.01 — presumably the
# sampling fraction; confirm against GDELT_utils.init_sample).
test_samp = init_sample(gdelt, 0.01)

# Each list holds the base sample followed by its ICM_INT and ICM_SPEC variants
# for 'USA'. The helper calls are repeated for the second list rather than
# shared, so the two lists hold separately built objects.
naive_samps = [
    test_samp,
    ICM_INT_sample(test_samp, 'USA'),
    ICM_SPEC_sample(test_samp, 'USA'),
]
pared_samps = [
    test_samp,
    ICM_INT_sample(test_samp, 'USA'),
    ICM_SPEC_sample(test_samp, 'USA'),
]

In [4]:
# Fit ordinary least squares on every naive sample via train_naive and print
# (sample index, trs, tes) — the first two values train_naive returns.
good1, good1_samp = None, None
for idx, samp in enumerate(naive_samps):
    # A fresh estimator per sample; the extra instance that used to be created
    # before the loop was never used and has been dropped.
    regr = linear_model.LinearRegression()
    trs, tes, model, model_samp, _, _ = train_naive(samp, regr)
    print(idx, trs, tes)
    # NOTE(review): this used to test `idx == 3`, which enumerate() never
    # yields for the three-element naive_samps list, so good1 always stayed
    # None. Aligned with the pared cells, which keep index 1 — confirm that
    # this is the sample intended here.
    if idx == 1:
        good1, good1_samp = model, model_samp

0 0.29383519699434846 -1.4092146273752219e+17
1 0.34047090791280743 -6.936461032318225e+17
2 0.3718275602793095 -3.9566541260495775e+20


In [5]:
# Ordinary least squares over the pared samples: train_pared is called once per
# sample and its first two return values are printed next to the sample index.
good2 = good2_samp = None
for idx, samp in enumerate(pared_samps):
    regr = linear_model.LinearRegression()  # new, unfitted estimator each pass
    trs, tes, model, model_samp, _, _ = train_pared(samp, regr)
    print(idx, trs, tes)
    if idx != 1:
        continue
    # Only the result for index 1 is retained for later use.
    good2, good2_samp = model, model_samp

0 0.2741580966982158 -8.115183237732866e+16
1 0.24656641837275095 0.1547296336727716
2 0.30752949439066357 -3.239808676232731e+20


In [6]:
# Fit a depth-limited decision tree on every naive sample via train_naive and
# print (sample index, trs, tes) — the first two values train_naive returns.
good1, good1_samp = None, None
for idx, samp in enumerate(naive_samps):
    # Fresh estimator per sample; max_depth=10 bounds the tree depth.
    dt = tree.DecisionTreeRegressor(max_depth=10)
    trs, tes, model, model_samp, _, _ = train_naive(samp, dt)
    print(idx, trs, tes)
    # NOTE(review): this used to test `idx == 3`, which enumerate() never
    # yields for the three-element naive_samps list, so good1 always stayed
    # None. Aligned with the pared cells, which keep index 1 — confirm that
    # this is the sample intended here.
    if idx == 1:
        good1, good1_samp = model, model_samp

0 0.19628313861021374 0.17333226567732452
1 0.24202593211236478 0.09858590555389501
2 0.24261801641111336 0.12471991794039627


In [7]:
# Depth-limited decision tree over the pared samples: train_pared is called once
# per sample and its first two return values are printed next to the index.
good2 = good2_samp = None
for idx, samp in enumerate(pared_samps):
    dt = tree.DecisionTreeRegressor(max_depth=10)  # rebuilt for every sample
    trs, tes, model, model_samp, _, _ = train_pared(samp, dt)
    print(idx, trs, tes)
    if idx != 1:
        continue
    # Only the result for index 1 is retained for later use.
    good2, good2_samp = model, model_samp

0 0.19628313861021374 0.1714599469965984
1 0.24202593211236467 0.10263845201368149
2 0.24261801641111336 0.11067761063503523


In [8]:
# Fit a support-vector regressor on every naive sample via train_naive and
# print (sample index, trs, tes) — the first two values train_naive returns.
good1, good1_samp = None, None
for idx, samp in enumerate(naive_samps):
    # Fresh estimator per sample. max_iter=100 caps the solver iterations and
    # verbose=True enables the solver's own log output. The default SVR()
    # that used to be built before the loop was never used and is dropped.
    sv = svm.SVR(max_iter=100, verbose=True)
    trs, tes, model, model_samp, _, _ = train_naive(samp, sv)
    print(idx, trs, tes)
    # NOTE(review): this used to test `idx == 3`, which enumerate() never
    # yields for the three-element naive_samps list, so good1 always stayed
    # None. Aligned with the pared cells, which keep index 1 — confirm that
    # this is the sample intended here.
    if idx == 1:
        good1, good1_samp = model, model_samp

[LibSVM]



0 -0.454002971516422 -0.4223393579632764
[LibSVM]1 -0.44213988964612394 -0.5115034390914042
[LibSVM]2 -0.3823166895529788 -0.44419578495544143


In [9]:
# Support-vector regression over the pared samples: train_pared is called once
# per sample and its first two return values are printed next to the index.
good2 = good2_samp = None
for idx, samp in enumerate(pared_samps):
    # Solver limited to 100 iterations, with its own logging switched on.
    sv = svm.SVR(max_iter=100, verbose=True)
    trs, tes, model, model_samp, _, _ = train_pared(samp, sv)
    print(idx, trs, tes)
    if idx != 1:
        continue
    # Only the result for index 1 is retained for later use.
    good2, good2_samp = model, model_samp

[LibSVM]



0 -0.454002971516422 -0.4223393579632764
[LibSVM]1 -0.44213988964612394 -0.5115034390914042
[LibSVM]2 -0.3823166895529788 -0.44419578495544143


In [10]:
# Fit a depth-limited random forest on every naive sample via train_naive and
# print (sample index, trs, tes) — the first two values train_naive returns.
good1, good1_samp = None, None
for idx, samp in enumerate(naive_samps):
    # Fresh estimator per sample. random_state pins the forest's internal
    # randomness so the printed scores are repeatable from run to run
    # (any randomness inside train_naive itself is not controlled here).
    rfc = RandomForestRegressor(max_depth=7, random_state=0)
    trs, tes, model, model_samp, _, _ = train_naive(samp, rfc)
    print(idx, trs, tes)
    # NOTE(review): this used to test `idx == 3`, which enumerate() never
    # yields for the three-element naive_samps list, so good1 always stayed
    # None. Aligned with the pared cells, which keep index 1 — confirm that
    # this is the sample intended here.
    if idx == 1:
        good1, good1_samp = model, model_samp

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


0 0.1655744505235709 0.16198411512635835


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


1 0.21574198239401157 0.1468327130307373
2 0.20140454001248886 0.13759870508931737


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [11]:
# Fit a depth-limited random forest on every pared sample via train_pared and
# print (sample index, trs, tes) — the first two values train_pared returns.
good2, good2_samp = None, None
for idx, samp in enumerate(pared_samps):
    # Fresh estimator per sample. random_state pins the forest's internal
    # randomness so the printed scores are repeatable from run to run
    # (any randomness inside train_pared itself is not controlled here).
    rfc = RandomForestRegressor(max_depth=7, random_state=0)
    trs, tes, model, model_samp, _, _ = train_pared(samp, rfc)
    print(idx, trs, tes)
    # Only the result for index 1 is retained for later use.
    if idx == 1:
        good2, good2_samp = model, model_samp

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    8.5s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


0 0.16053020778765137 0.15710196510214047


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


1 0.20581983798824355 0.13761248176145313
2 0.22066372691273872 0.14977551370050413


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [17]:
# Fit a default gradient-boosting regressor on every naive sample via
# train_naive and print (sample index, trs, tes, elapsed seconds).
# good1/good1_samp are reset here and no model is retained by this cell.
good1, good1_samp = None, None
for idx, samp in enumerate(naive_samps):
    gbr = GradientBoostingRegressor()
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    trs, tes, model, model_samp, _, _ = train_naive(samp, gbr)
    print(idx, trs, tes, time.perf_counter() - start_time)

0 0.23693303395193144 0.2307851373215999 56.33863711357117
1 0.29205056078009284 0.18654139472884368 2.336773633956909
2 0.29598994599450845 0.1983531069384752 2.7521584033966064


In [19]:
# Fit a default gradient-boosting regressor on every pared sample via
# train_pared and print (sample index, trs, tes, elapsed seconds).
# good2/good2_samp are reset here and no model is retained by this cell.
good2, good2_samp = None, None
for idx, samp in enumerate(pared_samps):
    gbr = GradientBoostingRegressor()
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    trs, tes, model, model_samp, _, _ = train_pared(samp, gbr)
    print(idx, trs, tes, time.perf_counter() - start_time)

0 0.23693303395193144 0.23064582472679152 54.702860832214355
1 0.29205056078009284 0.18614356662086418 1.967256784439087
2 0.29598994599450845 0.19967377650362672 2.687342643737793


In [14]:
# Actor1 country codes in the order value_counts() returns them (most frequent
# first by default); only the first five are used below.
countries = test_samp['Actor1CountryCode'].value_counts().index.tolist()
small_list = countries[:5]

# Build the ICM_INT and ICM_SPEC variants of the base sample for each of those
# countries, keeping the two result lists aligned with small_list.
ICM_INT_samps, ICM_SPEC_samps = [], []
for country in small_list:
    ICM_INT_samps.append(ICM_INT_sample(test_samp, country))
    ICM_SPEC_samps.append(ICM_SPEC_sample(test_samp, country))