Using a dataset of your choice, select an outcome variable and then pick four or five other variables (one to two categorical, three to four continuous) to act as the basis for features. Explore the variables using the univariate and bivariate methods you've learned so far. 

Next, based on what you learned via your data exploration, create ten new features. Explain the reasoning behind each one.

Finally, use filtering methods to select the five best features and justify your choices.

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import preprocessing
%matplotlib inline

from GDELT_utils import GDELT_columns, usecols, dtype_dict, \
                        cameo_dict, map_cameo_to_text, \
                        state_dict, mem_usage, state_heat_map
        
path = "D:/GDELT-files-trim/"
nov_2016 = [path + "20161101.export.CSV.pkl", path + "20161111.export.CSV.pkl", path + "20161121.export.CSV.pkl",
            path + "20161102.export.CSV.pkl", path + "20161112.export.CSV.pkl", path + "20161122.export.CSV.pkl",
            path + "20161103.export.CSV.pkl", path + "20161113.export.CSV.pkl", path + "20161123.export.CSV.pkl",
            path + "20161104.export.CSV.pkl", path + "20161114.export.CSV.pkl", path + "20161124.export.CSV.pkl",
            path + "20161105.export.CSV.pkl", path + "20161115.export.CSV.pkl", path + "20161125.export.CSV.pkl",
            path + "20161106.export.CSV.pkl", path + "20161116.export.CSV.pkl", path + "20161126.export.CSV.pkl",
            path + "20161107.export.CSV.pkl", path + "20161117.export.CSV.pkl", path + "20161127.export.CSV.pkl",
            path + "20161108.export.CSV.pkl", path + "20161118.export.CSV.pkl", path + "20161128.export.CSV.pkl",
            path + "20161109.export.CSV.pkl", path + "20161119.export.CSV.pkl", path + "20161129.export.CSV.pkl",
            path + "20161110.export.CSV.pkl", path + "20161120.export.CSV.pkl", path + "20161130.export.CSV.pkl"]

path = "D:/GDELT-files/events/"
nov_2016 = [path + "20161101.export.CSV.zip", path + "20161111.export.CSV.zip", path + "20161121.export.CSV.zip",
            path + "20161102.export.CSV.zip", path + "20161112.export.CSV.zip", path + "20161122.export.CSV.zip",
            path + "20161103.export.CSV.zip", path + "20161113.export.CSV.zip", path + "20161123.export.CSV.zip",
            path + "20161104.export.CSV.zip", path + "20161114.export.CSV.zip", path + "20161124.export.CSV.zip",
            path + "20161105.export.CSV.zip", path + "20161115.export.CSV.zip", path + "20161125.export.CSV.zip",
            path + "20161106.export.CSV.zip", path + "20161116.export.CSV.zip", path + "20161126.export.CSV.zip",
            path + "20161107.export.CSV.zip", path + "20161117.export.CSV.zip", path + "20161127.export.CSV.zip",
            path + "20161108.export.CSV.zip", path + "20161118.export.CSV.zip", path + "20161128.export.CSV.zip",
            path + "20161109.export.CSV.zip", path + "20161119.export.CSV.zip", path + "20161129.export.CSV.zip",
            path + "20161110.export.CSV.zip", path + "20161120.export.CSV.zip", path + "20161130.export.CSV.zip"]

In [25]:
interest = ["GLOBALEVENTID", "SQLDATE",
            "AvgTone", "QuadClass", "GoldsteinScale",
            "NumMentions", "IsRootEvent", "EventRootCode",
            #"EventBaseCode", "EventCode",
            #"Actor1Code", "Actor2Code",
            "Actor1CountryCode", "Actor2CountryCode",
            "Actor1KnownGroupCode", "Actor2KnownGroupCode",
            #"Actor1EthnicCode", "Actor2EthnicCode",
            "Actor1Religion1Code", "Actor2Religion1Code",
            #"Actor1Religion2Code", "Actor2Religion2Code",
            "Actor1Type1Code", "Actor2Type1Code",
            #"Actor1Type2Code", "Actor2Type2Code",
            #"Actor1Type3Code", "Actor2Type3Code",
            "Actor1Geo_CountryCode", "Actor2Geo_CountryCode", "ActionGeo_CountryCode",
            #"Actor1Geo_ADM1Code", "Actor2Geo_ADM1Code", "ActionGeo_ADM1Code"
           ]

# aggregate data by
# Date Actor1Code Actor2Code Actor1Geo_CountryCode Actor2Geo_CountryCode EventRootCode -> Mean AvgTone

# sample on the dataframe / don't use all possible data available

# Features Actor1Code Actor2Code Actor1Geo_CountryCode Actor2Geo_CountryCode NumMentions EventRootCode 

# AvgTone/NumMentions
# AvgTone

# standardize data around mean 0 

# QuadClass -> Not Conflict (1, 2) or Conflict (3, 4)
# QuadClass -> Not Violence (1, 2, 3) or Violence (4)
# TomorrowQuadClass -> Shift quad class back a day
# NormNumMentions -> normalized for the day
# NormNumMentions -> normalized by rolling average
# NormNumMentions -> normalized by the year
# GoldsteinScale + NumMentions
# GoldsteinScale + AvgTone
# AvgTone + NumMentions
# IsRootEvent + NumMentions -> Weighted Importance

# try to predict without cameo code

# split data in missing/non-missing and compare distributions of that data

# which features are generally missing record values

# GoldsteinScale is a relationship with Cameo Code and 
# Cameo Code maps 1:1 onto quadclass

# Try to predict AvgTone
# Cameo Code 

In [26]:
categories = [ "EventRootCode",
            "Actor1CountryCode", "Actor2CountryCode",
            "Actor1KnownGroupCode", "Actor2KnownGroupCode",
            "Actor1Religion1Code", "Actor2Religion1Code",
            "Actor1Type1Code", "Actor2Type1Code",
            "Actor1Geo_CountryCode", "Actor2Geo_CountryCode", "ActionGeo_CountryCode",
           ]

In [27]:
dfs = []
for file in nov_2016:
    df = pd.read_csv(file, sep='\t', names=GDELT_columns, 
                        usecols=interest, dtype=dtype_dict, 
                        parse_dates=['SQLDATE'])
    dfs.append(df)

In [28]:
dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226375 entries, 0 to 226374
Data columns (total 19 columns):
GLOBALEVENTID            226375 non-null uint32
SQLDATE                  226375 non-null datetime64[ns]
Actor1CountryCode        125537 non-null category
Actor1KnownGroupCode     2838 non-null category
Actor1Religion1Code      3680 non-null category
Actor1Type1Code          100103 non-null category
Actor2CountryCode        99005 non-null category
Actor2KnownGroupCode     2185 non-null category
Actor2Religion1Code      3671 non-null category
Actor2Type1Code          81567 non-null category
IsRootEvent              226375 non-null bool
EventRootCode            226375 non-null category
QuadClass                226375 non-null uint8
GoldsteinScale           226370 non-null float32
NumMentions              226375 non-null uint16
AvgTone                  226375 non-null float32
Actor1Geo_CountryCode    200003 non-null category
Actor2Geo_CountryCode    160515 non-null category
Action

In [29]:
nov_2016 = pd.concat(dfs)

In [30]:
nov_2016.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6032301 entries, 0 to 234621
Data columns (total 19 columns):
GLOBALEVENTID            uint32
SQLDATE                  datetime64[ns]
Actor1CountryCode        object
Actor1KnownGroupCode     object
Actor1Religion1Code      object
Actor1Type1Code          object
Actor2CountryCode        object
Actor2KnownGroupCode     object
Actor2Religion1Code      object
Actor2Type1Code          object
IsRootEvent              bool
EventRootCode            object
QuadClass                uint8
GoldsteinScale           float32
NumMentions              uint16
AvgTone                  float32
Actor1Geo_CountryCode    object
Actor2Geo_CountryCode    object
ActionGeo_CountryCode    object
dtypes: bool(1), datetime64[ns](1), float32(2), object(12), uint16(1), uint32(1), uint8(1)
memory usage: 3.2 GB


In [31]:
for category_col in categories:
    nov_2016[category_col] = nov_2016[category_col].astype('category')

In [47]:
nov_2016.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6032301 entries, 0 to 234621
Data columns (total 19 columns):
GLOBALEVENTID            uint32
SQLDATE                  datetime64[ns]
Actor1CountryCode        category
Actor1KnownGroupCode     category
Actor1Religion1Code      category
Actor1Type1Code          category
Actor2CountryCode        category
Actor2KnownGroupCode     category
Actor2Religion1Code      category
Actor2Type1Code          category
IsRootEvent              bool
EventRootCode            category
QuadClass                uint8
GoldsteinScale           float32
NumMentions              uint16
AvgTone                  float32
Actor1Geo_CountryCode    category
Actor2Geo_CountryCode    category
ActionGeo_CountryCode    category
dtypes: bool(1), category(12), datetime64[ns](1), float32(2), uint16(1), uint32(1), uint8(1)
memory usage: 282.0 MB


In [48]:
nov_2016.shape

(6032301, 19)

In [49]:
nov_2016.corr()

Unnamed: 0,GLOBALEVENTID,IsRootEvent,QuadClass,GoldsteinScale,NumMentions,AvgTone
GLOBALEVENTID,1.0,-0.003546,-0.002889,0.005619,-0.003755,-0.00718
IsRootEvent,-0.003546,1.0,0.008951,-0.011832,0.040962,0.049632
QuadClass,-0.002889,0.008951,1.0,-0.774402,0.008736,-0.358945
GoldsteinScale,0.005619,-0.011832,-0.774402,1.0,-0.013464,0.347634
NumMentions,-0.003755,0.040962,0.008736,-0.013464,1.0,-0.030566
AvgTone,-0.00718,0.049632,-0.358945,0.347634,-0.030566,1.0


In [50]:
nov_2016.groupby('QuadClass').mean()

Unnamed: 0_level_0,GLOBALEVENTID,IsRootEvent,GoldsteinScale,NumMentions,AvgTone
QuadClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,599266200.0,0.641602,2.445738,14.243583,-0.894462
2,599251900.0,0.635813,5.223845,14.244955,-1.607522
3,599209100.0,0.613754,-3.498103,14.489701,-2.973222
4,599267200.0,0.67194,-7.971389,16.707975,-4.698138


In [51]:
nov_2016.groupby('QuadClass').corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,AvgTone,GLOBALEVENTID,GoldsteinScale,IsRootEvent,NumMentions
QuadClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,AvgTone,1.0,-0.011323,0.15231,0.109211,-0.026461
1,GLOBALEVENTID,-0.011323,1.0,0.009457,-0.00368,-0.003557
1,GoldsteinScale,0.15231,0.009457,1.0,0.004957,-0.010022
1,IsRootEvent,0.109211,-0.00368,0.004957,1.0,0.043104
1,NumMentions,-0.026461,-0.003557,-0.010022,0.043104,1.0
2,AvgTone,1.0,0.003155,0.16998,0.065522,-0.031981
2,GLOBALEVENTID,0.003155,1.0,0.021554,-0.00252,-0.000478
2,GoldsteinScale,0.16998,0.021554,1.0,-0.040317,-0.016452
2,IsRootEvent,0.065522,-0.00252,-0.040317,1.0,0.027455
2,NumMentions,-0.031981,-0.000478,-0.016452,0.027455,1.0


In [None]:
groupby Day EventRootCode ActionGeo_CountryCode

In [52]:
nov_2016['Conflict'] = nov_2016['QuadClass'] > 2

In [53]:
nov_2016['Conflict'].value_counts()

False    4419418
True     1612883
Name: Conflict, dtype: int64

In [54]:
nov_2016['Violence'] = nov_2016['QuadClass'] == 4

In [55]:
nov_2016['Violence'].value_counts()

False    5196452
True      835849
Name: Violence, dtype: int64

In [56]:
nov_2016['GoldsteinTone'] = nov_2016['GoldsteinScale'] + nov_2016['AvgTone']

In [57]:
nov_2016.groupby('QuadClass').mean()

Unnamed: 0_level_0,GLOBALEVENTID,IsRootEvent,GoldsteinScale,NumMentions,AvgTone,Conflict,Violence,GoldsteinTone
QuadClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,599266200.0,0.641602,2.445738,14.243583,-0.894462,0.0,0.0,1.551282
2,599251900.0,0.635813,5.223845,14.244955,-1.607522,0.0,0.0,3.616324
3,599209100.0,0.613754,-3.498103,14.489701,-2.973222,1.0,0.0,-6.471289
4,599267200.0,0.67194,-7.971389,16.707975,-4.698138,1.0,1.0,-12.669526


In [21]:
nov_2016["NumMentions"]

TypeError: list indices must be integers or slices, not str

In [32]:
nov_2016['norm_NumMentions'] = (nov_2016['NumMentions'] - nov_2016['NumMentions'].mean())/ \
                                nov_2016['NumMentions'].std()

In [33]:
nov_2016['norm_NumMentions']

0        -0.130907
1        -0.056927
2        -0.130907
3        -0.081587
4        -0.081587
5        -0.155568
6        -0.056927
7         1.669284
8        -0.056927
9         1.163751
10       -0.143237
11       -0.032267
12       -0.130907
13       -0.143237
14       -0.143237
15       -0.130907
16       -0.106247
17       -0.019937
18        0.411616
19       -0.056927
20        0.128024
21       -0.106247
22        0.448606
23       -0.155568
24        0.929479
25        0.337636
26        0.633558
27        0.115694
28        0.189675
29       -0.081587
            ...   
234592   -0.130907
234593   -0.130907
234594   -0.081587
234595   -0.130907
234596   -0.106247
234597   -0.130907
234598   -0.155568
234599   -0.056927
234600   -0.130907
234601   -0.155568
234602   -0.106247
234603   -0.081587
234604   -0.081587
234605   -0.081587
234606   -0.155568
234607   -0.130907
234608   -0.056927
234609   -0.155568
234610   -0.167898
234611   -0.155568
234612   -0.130907
234613   -0.

In [None]:
agg_rows = ["SQLDATE", "Actor1Type1Code", "Actor1Type2Code",
                 "Actor1Geo_CountryCode", "Actor2Geo_CountryCode",
                 "EventRootCode"]

In [46]:
# aggregate data by
# Date Actor1Code Actor2Code Actor1Geo_CountryCode 
# Actor2Geo_CountryCode EventRootCode -> Mean AvgTone
nov_2016.groupby(["SQLDATE", "Actor1Type1Code", "Actor2Type1Code",
                 "Actor1Geo_CountryCode", "Actor2Geo_CountryCode",
                 "EventRootCode"])[["NumMentions", "AvgTone"]].mean()

Unnamed: 0_level_0,NumMentions,AvgTone
SQLDATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-11-04,7.314815,-1.597514
2006-11-05,13.537037,-2.479386
2006-11-06,23.553191,-1.826972
2006-11-07,8.250000,-0.200365
2006-11-08,28.241379,-0.955149
2006-11-09,6.076923,-1.545390
2006-11-10,6.500000,-0.487219
2006-11-11,3.962963,1.784659
2006-11-12,5.680000,-0.381812
2006-11-13,16.883333,-0.648310


In [36]:
nov_2016['weighted_NumMentions'] = nov_2016['IsRootEvent'] * 1.2 * nov_2016['NumMentions'] 

In [37]:
nov_2016['weighted_NumMentions'].head()

0     0.0
1    12.0
2     0.0
3     9.6
4     9.6
Name: weighted_NumMentions, dtype: float64

In [38]:
nov_2016['norm_weight_mentions'] = (nov_2016['weighted_NumMentions'] - nov_2016['weighted_NumMentions'].mean())/ \
                                    nov_2016['weighted_NumMentions'].std()

In [39]:
nov_2016['norm_weight_mentions']

0        -0.145423
1        -0.012868
2        -0.145423
3        -0.039379
4        -0.039379
5        -0.118912
6        -0.145423
7         1.842905
8        -0.145423
9        -0.145423
10       -0.105657
11       -0.145423
12       -0.145423
13       -0.145423
14       -0.145423
15       -0.145423
16       -0.145423
17        0.026899
18       -0.145423
19       -0.012868
20        0.185965
21       -0.065890
22       -0.145423
23       -0.145423
24        1.047574
25        0.411309
26        0.729441
27        0.172709
28       -0.145423
29       -0.145423
            ...   
234592   -0.145423
234593   -0.145423
234594   -0.145423
234595   -0.145423
234596   -0.145423
234597   -0.145423
234598   -0.145423
234599   -0.145423
234600   -0.092401
234601   -0.118912
234602   -0.145423
234603   -0.039379
234604   -0.039379
234605   -0.145423
234606   -0.145423
234607   -0.092401
234608   -0.145423
234609   -0.118912
234610   -0.132168
234611   -0.118912
234612   -0.145423
234613   -0.