# Initial imports and declarations
Will be expanded later to include a bevy of imports for various processing, data exploration, and modelling.

In [1]:
#core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

#post-modelling metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

In [2]:
df = pd.read_csv("protests.csv")

---
# Data Structures 
Includes function declarations, lists, dictionaries, etc. that are used later in the program.

In [3]:
response_drops = [
    '1_accomodation', '1_arrests', '1_beatings', '1_crowd dispersal', '1_ignore', '1_killings', '1_shootings',
    '2_accomodation', '2_arrests', '2_beatings', '2_crowd dispersal', '2_ignore', '2_killings', '2_shootings', 
    '3_accomodation', '3_arrests', '3_beatings', '3_crowd dispersal', '3_ignore', '3_killings', '3_shootings', 
    '4_accomodation', '4_arrests', '4_beatings', '4_crowd dispersal', '4_killings', '4_shootings', 
    '5_.', '5_accomodation', '5_arrests', '5_beatings', '5_crowd dispersal', '5_killings', '5_shootings', 
    '6_accomodation', '6_arrests', '6_beatings', '6_crowd dispersal', '6_killings', 
    '7_.', '7_accomodation', '7_arrests', '7_beatings', '7_killings'
]

demand_drops = [
    'demand1_labor wage dispute', 'demand1_land farm issue', 'demand1_police brutality', 'demand1_political behavior, process', 'demand1_price increases, tax policy', 'demand1_removal of politician', 'demand1_social restrictions', 
    'demand2_labor wage dispute', 'demand2_land farm issue', 'demand2_police brutality', 'demand2_political behavior, process', 'demand2_price increases, tax policy', 'demand2_removal of politician', 'demand2_social restrictions', 
    'demand3_labor wage dispute', 'demand3_land farm issue', 'demand3_police brutality', 'demand3_political behavior, process', 'demand3_price increases, tax policy', 'demand3_removal of politician', 'demand3_social restrictions', 
    'demand4_.', 'demand4_labor wage dispute', 'demand4_land farm issue', 'demand4_police brutality', 'demand4_political behavior, process', 'demand4_price increases, tax policy', 'demand4_removal of politician'
]

time_drops = ['startday', 'startmonth', 'startyear', 'endday', 'endmonth', 'endyear']

other_drops = [
    'id', #Not useful to prediction.
    'ccode', #Not useful to prediction.
    'protest', #All values are 1.  Is this dataset the subset of another?
    'protestnumber', # of protests per country might be useful but not in the context of incremental numbers that it's being given
    'location', #Not extremely useable given how it's already being broken by region.
    'participants_category', #Too many null values to be of great value.
]

demands = ['protesterdemand1', 'protesterdemand2', 'protesterdemand3', 'protesterdemand4']

response = ["stateresponse1", "stateresponse2", "stateresponse3", "stateresponse4", "stateresponse5", "stateresponse6", "stateresponse7"]

targets = ['y_accomodation', 'y_arrests', 'y_beatings', 'y_crowd dispersal', 'y_ignore', 'y_killings', 'y_shootings']

In [4]:
def parse_texts(x):
    x = x.lower()
    
    if x == "dozens":
        return 50
    elif x == "hundreds":
        return 500
    elif x == "thousands":
        return 5000
    elif x == "tens of thousands":
        return 50000
    elif "hundreds of thousands" in x:
        return 250000
    elif "millions" in x:
        return 2000000
    elif "million" in x:
        return 1000000
    
    
    elif "about " in x:
        return x[6:]
    elif "more than " in x:
        return x[10:]
    
    
    elif "several" in x:
        if "dozen" in x:
            return 50
        elif "hundred" in x:
            return 500
        elif "thousand" in x:
            return 5000
    
    
    elif "hundreds" in x:
        return 500
    elif "thousands" in x:
        return 5000
    
    else:
        return x
    
    
def strip_chars(x):
    banned_chars = "+s><,"
    x = "".join([c for c in x if c not in banned_chars])
    
    try:
        x = int(x)
    finally:
        return x


    
def avg_hyphen(x):
    accepted_chars = "1234567890-"
    ind = 0

    x = "".join([c for c in x if c in accepted_chars])
    
    for i in range(len(x)):
        if x[i] == "-":
            ind = i
    
    lower = x[:ind]
    upper = x[ind+1:]
    
    if (lower == "") or (upper==""):
        return np.nan
    
    return (int(lower) + int(upper)) /2
    
    
    
def map_participants(x):
    while type(x) == str:
        x = parse_texts(x)
        if type(x) == str:
            x = strip_chars(x)
        if type(x) == str:
            x = avg_hyphen(x)
        if type(x) == str:
            x = np.nan
    return x

---
# Data Cleaning
Contains blocks of code for known cleaning problems derived from any previous data exploration.

To-do:
1. Dummify and verticalize protestor demands.
2. Rectify the participants_category, participants columns
3. Get "protest length" as a feature
4. drop id, ccode, protestnumber(?), sources(?)

In [5]:
#General/Miscellaneous Cleaning

df.dropna(subset=["notes"], inplace=True) #If there are no notes, then we will not be able to predict the outcome very well.
df.dropna(subset=["participants"], inplace=True) #Participants had very few NaN values
df.dropna(subset=["sources"], inplace=True) #Sources had very few NaN values


#Miscellaneous useless feature cleaning.  See the list declaration [other_drops] in DATA STRUCTURES for additional information.
df.drop(columns=other_drops, inplace=True)


#For the 500 or so values containing NaN in protestor identity:
df.fillna(value={"protesteridentity":"unspecified"}, inplace=True)

In [6]:
#For fixing the time values such that a length of time (in days) for the protest is established as a feature, and other time features are dropped.
#Critically, the year the protest initially occured is retained in another column.

month_days = {1:0, 2:31, 3:59, 4:90, 5:120, 6:151, 7:181, 8:212, 9:243, 10:273, 11:304, 12:334}
df["protest_length"] = 0

for i in range(len(df)):
    yearday_start = month_days[df["startmonth"].iloc[i]] + df["startday"].iloc[i]
    yearday_end = month_days[df["endmonth"].iloc[i]] + df["endday"].iloc[i]
    
    difference = (yearday_end - yearday_start) + (365 * (df["endyear"].iloc[i] - df["startyear"].iloc[i]))
    
    if difference != 0:
        df["protest_length"].iloc[i] = difference
    else:
        df["protest_length"].iloc[i] = 1 #accounts for same-day protests


#Now that the length is obtained, the additional time columns can be dropped.
df.drop(columns=time_drops, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [7]:
#For fixing the Participants feature such that we have a numerical value.
#For more information, see the function map_participants() in DATA STRUCTURES.
df["participants"] = df["participants"].map(map_participants)

df.dropna(subset=["participants"], inplace=True) #150 null values remain

In [8]:
#For translating the vertical state response & the protester demands values laterally.


df = pd.get_dummies(data=df, prefix=["1", "2", "3", "4", "5", "6", "7"], columns=response)
df = pd.get_dummies(data=df, prefix=["demand1", "demand2", "demand3", "demand4"], columns=demands)


#Combining the disparate dummies into unified response columns.  
#Unfortunately there was a certain amount of manual labor involved in this due to how finicky pandas is.
df["demand_labor_wage_dispute"] = df['demand1_labor wage dispute'] + df['demand2_labor wage dispute'] + df['demand3_labor wage dispute'] + df['demand4_labor wage dispute']
df["demand_land_farm_issue"] = df['demand1_land farm issue'] + df['demand2_land farm issue'] + df['demand3_land farm issue'] + df['demand4_land farm issue']
df["demand_police_brutality"] = df['demand1_police brutality'] + df['demand2_police brutality'] + df['demand3_police brutality'] + df['demand4_police brutality']
df["demand_political_behavior_or_process"] = df['demand1_political behavior, process'] + df['demand2_political behavior, process'] + df['demand3_political behavior, process'] + df['demand4_political behavior, process']
df["demand_price_hike_or_tax_policy"] = df['demand1_price increases, tax policy'] + df['demand2_price increases, tax policy'] + df['demand3_price increases, tax policy'] + df['demand4_price increases, tax policy']
df["demand_removal_of_politician"] = df['demand1_removal of politician'] + df['demand2_removal of politician'] + df['demand3_removal of politician'] + df['demand4_removal of politician']
df["demand_social_restrictions"] = df['demand1_social restrictions'] + df['demand2_social restrictions'] + df['demand3_social restrictions']

df["y_accomodation"] = df['1_accomodation'] + df['2_accomodation'] + df['3_accomodation'] + df['4_accomodation'] + df['5_accomodation'] + df['6_accomodation'] + df['7_accomodation']
df["y_arrests"] = df['1_arrests'] + df['2_arrests'] + df['3_arrests'] + df['4_arrests'] + df['5_arrests'] + df['6_arrests'] + df['7_arrests']
df["y_beatings"] = df['1_beatings'] + df['2_beatings'] + df['3_beatings'] + df['4_beatings'] + df['5_beatings'] + df['6_beatings'] + df['7_beatings']
df["y_crowd_dispersal"] = df['1_crowd dispersal'] + df['2_crowd dispersal'] + df['3_crowd dispersal'] + df['4_crowd dispersal'] + df['5_crowd dispersal'] + df['6_crowd dispersal']
df["y_ignore"] = df['1_ignore'] + df['2_ignore'] + df['3_ignore']
df["y_killings"] = df['1_killings'] + df['2_killings'] + df['3_killings'] + df['4_killings'] + df['5_killings'] + df['6_killings'] + df['7_killings']
df["y_shootings"] = df['1_shootings'] + df['2_shootings'] + df['3_shootings'] + df['4_shootings'] + df['5_shootings']



#Getting rid of the disparate dummies now that we have unified responses.
#Dropping Oceania since not relevant/out of scope

df.drop(columns=response_drops, inplace=True)
df.drop(columns=demand_drops, inplace=True)
df = df[df['region']!='Oceania']

---
# Data exploration & analysis
Find problems to address here and then address them in the data cleaning section.  Or, create graphs or other data exploration methods here.

In [9]:
df["country"].value_counts()

United Kingdom           543
France                   527
Ireland                  428
Germany                  360
Kenya                    348
                        ... 
Serbia and Montenegro      2
Cape Verde                 2
Laos                       2
South Sudan                1
Qatar                      1
Name: country, Length: 165, dtype: int64

In [10]:
df.columns

Index(['country', 'year', 'region', 'protesterviolence', 'participants',
       'protesteridentity', 'sources', 'notes', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions', 'y_accomodation', 'y_arrests',
       'y_beatings', 'y_crowd_dispersal', 'y_ignore', 'y_killings',
       'y_shootings'],
      dtype='object')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14264 entries, 0 to 16312
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   country                               14264 non-null  object 
 1   year                                  14264 non-null  int64  
 2   region                                14264 non-null  object 
 3   protesterviolence                     14264 non-null  float64
 4   participants                          14264 non-null  float64
 5   protesteridentity                     14264 non-null  object 
 6   sources                               14264 non-null  object 
 7   notes                                 14264 non-null  object 
 8   protest_length                        14264 non-null  float64
 9   demand_labor_wage_dispute             14264 non-null  uint8  
 10  demand_land_farm_issue                14264 non-null  uint8  
 11  demand_police_b

---
# Models

Three core objectives - 
1. FeatureUnion to run a model that takes in multiple data types (that will be nested in)
2. Function that cycles through possible state responses (that will be nested in)
3. Function that cycles through possible regions or countries for possible state responses
4. And finally (4) which is permitting for time cut offs.

---

In [12]:
    X = df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions', 'notes']]

    y = df['y_ignore']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    get_numeric_data = FunctionTransformer(lambda df: df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions']], validate=False)

    get_text_data = FunctionTransformer(lambda df: df['notes'], validate=False)
    
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english'))
            ]))
         ])),
    ('xg', XGBClassifier())
    ])

In [13]:
pipe.get_params()

{'memory': None,
 'steps': [('features',
   FeatureUnion(transformer_list=[('numeric_features',
                                   Pipeline(steps=[('selector',
                                                    FunctionTransformer(func=<function <lambda> at 0x7fe62923bd30>)),
                                                   ('ss', StandardScaler())])),
                                  ('text_features',
                                   Pipeline(steps=[('selector',
                                                    FunctionTransformer(func=<function <lambda> at 0x7fe6202fa1f0>)),
                                                   ('cvec',
                                                    CountVectorizer(stop_words='english'))]))])),
  ('xg',
   XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                 colsample_bynode=None, colsample_bytree=None, gamma=None,
                 gpu_id=None, importance_type='gain', interaction_constraints=None,
          

In [14]:
possible_responses = ['y_accomodation', 'y_arrests', 'y_beatings', 'y_crowd_dispersal', 'y_ignore', 'y_killings', 'y_shootings']

region_list = dict(df["region"].value_counts()).keys()

country_list = dict(df["country"].value_counts()).keys()

### Logistic Regression

In [15]:
# https://stackoverflow.com/questions/47745288/how-to-featureunion-numerical-and-text-features-in-python-sklearn-properly
# Since we are working with both numeric and text data, we had to incorporate FeatureUnion to handle two seaparate data types.

def protest_by_the_response(df, response):
    X = df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions', 'notes']]

    y = df[response]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    get_numeric_data = FunctionTransformer(lambda df: df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions']], validate=False)

    get_text_data = FunctionTransformer(lambda df: df['notes'], validate=False)
    
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english'))
            ]))
         ])),
    ('log', LogisticRegression(max_iter=5000))
    ])
    
    params = {
        'log__penalty' : ['l2', 'l1'],
#         'log__C' : [0.001, 0.01, 0,1, 1, 5],
#         'features__text_features__cvec__max_df': [0.90, 0.95],
#         'features__text_features__cvec__max_features': [None, 1000, 3000, 5000],
        'log__solver' : ['liblinear']
        
    }
    
    gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=0)

    gs.fit(X_train, y_train)

    baseline = max([df[response].mean(), 1-df[response].mean()])
    
    reg_response_dict = {
        "Response": response,
        "Training score": gs.score(X_train, y_train),
        "Testing score": gs.score(X_test, y_test),
        "Baseline": baseline
    }
    
    return reg_response_dict

def responses_by_location(loc_df):
    return [protest_by_the_response(loc_df, response) for response in possible_responses]

def df_by_region(df, region):
    return df[df["region"]==region]

def df_by_country(df, country):
    return df[df["country"]==country]

for region in region_list:
    #Instead of printing, append lists of dictionaries to a dataframe
    print(f"Region: {region}")
    print(responses_by_location(df_by_region(df, region)))
    print()

Region: Europe




[{'Response': 'y_accomodation', 'Training score': 0.9952619843924192, 'Testing score': 0.9189640768588136, 'Baseline': 0.9220480668756531}, {'Response': 'y_arrests', 'Training score': 0.9779821627647715, 'Testing score': 0.9189640768588136, 'Baseline': 0.8854754440961338}, {'Response': 'y_beatings', 'Training score': 0.9913600891861761, 'Testing score': 0.9649122807017544, 'Baseline': 0.967816091954023}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9738015607580826, 'Testing score': 0.8863826232247285, 'Baseline': 0.7684430512016719}, {'Response': 'y_ignore', 'Training score': 0.9885730211817169, 'Testing score': 0.8295739348370927, 'Baseline': 0.6764890282131661}, {'Response': 'y_killings', 'Training score': 0.999721293199554, 'Testing score': 0.9933166248955723, 'Baseline': 0.9928944618599791}, {'Response': 'y_shootings', 'Training score': 0.9988851727982163, 'Testing score': 0.9916457811194653, 'Baseline': 0.9914315569487984}]

Region: Africa




[{'Response': 'y_accomodation', 'Training score': 0.9977954144620811, 'Testing score': 0.8890356671070013, 'Baseline': 0.8733884297520661}, {'Response': 'y_arrests', 'Training score': 0.972663139329806, 'Testing score': 0.9009247027741083, 'Baseline': 0.8320661157024793}, {'Response': 'y_beatings', 'Training score': 0.9805996472663139, 'Testing score': 0.9379128137384413, 'Baseline': 0.9226446280991736}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9977954144620811, 'Testing score': 0.8137384412153237, 'Baseline': 0.5639669421487603}, {'Response': 'y_ignore', 'Training score': 0.9748677248677249, 'Testing score': 0.7820343461030383, 'Baseline': 0.5857851239669422}, {'Response': 'y_killings', 'Training score': 0.9991181657848325, 'Testing score': 0.9273447820343461, 'Baseline': 0.8981818181818182}, {'Response': 'y_shootings', 'Training score': 0.9986772486772487, 'Testing score': 0.9075297225891678, 'Baseline': 0.871404958677686}]

Region: Asia




[{'Response': 'y_accomodation', 'Training score': 0.9439295644114921, 'Testing score': 0.8833333333333333, 'Baseline': 0.8756080611535789}, {'Response': 'y_arrests', 'Training score': 0.9295644114921223, 'Testing score': 0.8472222222222222, 'Baseline': 0.8485059068797776}, {'Response': 'y_beatings', 'Training score': 0.9768303985171455, 'Testing score': 0.9333333333333333, 'Baseline': 0.9204308547602502}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9647822057460612, 'Testing score': 0.7819444444444444, 'Baseline': 0.7029186935371786}, {'Response': 'y_ignore', 'Training score': 0.9629286376274329, 'Testing score': 0.7277777777777777, 'Baseline': 0.5059068797776234}, {'Response': 'y_killings', 'Training score': 0.9745134383688601, 'Testing score': 0.9402777777777778, 'Baseline': 0.9444058373870744}, {'Response': 'y_shootings', 'Training score': 0.9772937905468025, 'Testing score': 0.9486111111111111, 'Baseline': 0.9503127171646977}]

Region: South America




[{'Response': 'y_accomodation', 'Training score': 0.9525021204410518, 'Testing score': 0.8854961832061069, 'Baseline': 0.8867684478371501}, {'Response': 'y_arrests', 'Training score': 0.9745547073791349, 'Testing score': 0.8651399491094147, 'Baseline': 0.8842239185750637}, {'Response': 'y_beatings', 'Training score': 0.993214588634436, 'Testing score': 0.9821882951653944, 'Baseline': 0.9872773536895675}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9100932994062765, 'Testing score': 0.8040712468193384, 'Baseline': 0.6762086513994912}, {'Response': 'y_ignore', 'Training score': 0.8736217133163698, 'Testing score': 0.7557251908396947, 'Baseline': 0.5674300254452926}, {'Response': 'y_killings', 'Training score': 0.9711620016963528, 'Testing score': 0.9312977099236641, 'Baseline': 0.9472010178117048}, {'Response': 'y_shootings', 'Training score': 0.9957591178965225, 'Testing score': 0.9465648854961832, 'Baseline': 0.9713740458015268}]

Region: MENA




[{'Response': 'y_accomodation', 'Training score': 0.9889025893958077, 'Testing score': 0.8966789667896679, 'Baseline': 0.9168207024029575}, {'Response': 'y_arrests', 'Training score': 0.9852034525277436, 'Testing score': 0.8560885608856088, 'Baseline': 0.8428835489833642}, {'Response': 'y_beatings', 'Training score': 0.969173859432799, 'Testing score': 0.8966789667896679, 'Baseline': 0.9131238447319778}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9815043156596794, 'Testing score': 0.7564575645756457, 'Baseline': 0.6913123844731978}, {'Response': 'y_ignore', 'Training score': 0.93711467324291, 'Testing score': 0.7822878228782287, 'Baseline': 0.5600739371534196}, {'Response': 'y_killings', 'Training score': 0.9938347718865598, 'Testing score': 0.8856088560885609, 'Baseline': 0.8826247689463955}, {'Response': 'y_shootings', 'Training score': 0.9889025893958077, 'Testing score': 0.8671586715867159, 'Baseline': 0.8706099815157117}]

Region: North America
[{'Response': 'y_accomoda

---

## Random Forest

In [16]:
def protest_by_the_response(df, response):
    X = df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions', 'notes']]

    y = df[response]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    get_numeric_data = FunctionTransformer(lambda df: df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions']], validate=False)

    get_text_data = FunctionTransformer(lambda df: df['notes'], validate=False)
    
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english'))
            ]))
         ])),
    ('rf', RandomForestClassifier())
    ])
    
    params = {
        'rf__ccp_alpha' : [0.001, 0.01, 0.1, 1, 5],
#         'rf__n_estimators' : [100, 300, 500],
#         'rf__max_depth' : [None, 1, 2, 3],
#         'rf__min_samples_split' : [2, 3, 4],
#         'rf__min_samples_leaf' : [1, 2, 3]
    }
    
    gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=0)

    gs.fit(X_train, y_train)

    baseline = max([df[response].mean(), 1-df[response].mean()])
    
    reg_response_dict = {
        "Response": response,
        "Training score": gs.score(X_train, y_train),
        "Testing score": gs.score(X_test, y_test),
        "Baseline": baseline
    }
    
    return reg_response_dict

def responses_by_location(loc_df):
    return [protest_by_the_response(loc_df, response) for response in possible_responses]

def df_by_region(df, region):
    return df[df["region"]==region]

def df_by_country(df, country):
    return df[df["country"]==country]

for region in region_list:
    #Instead of printing, append lists of dictionaries to a dataframe
    print(f"Region: {region}")
    print(responses_by_location(df_by_region(df, region)))
    print()

Region: Europe




[{'Response': 'y_accomodation', 'Training score': 0.9292084726867336, 'Testing score': 0.9298245614035088, 'Baseline': 0.9220480668756531}, {'Response': 'y_arrests', 'Training score': 0.899108138238573, 'Testing score': 0.8847117794486216, 'Baseline': 0.8854754440961338}, {'Response': 'y_beatings', 'Training score': 0.967391304347826, 'Testing score': 0.9699248120300752, 'Baseline': 0.967816091954023}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9041248606465998, 'Testing score': 0.8671679197994987, 'Baseline': 0.7684430512016719}, {'Response': 'y_ignore', 'Training score': 0.8729096989966555, 'Testing score': 0.8387635756056808, 'Baseline': 0.6764890282131661}, {'Response': 'y_killings', 'Training score': 0.9933110367892977, 'Testing score': 0.9916457811194653, 'Baseline': 0.9928944618599791}, {'Response': 'y_shootings', 'Training score': 0.9908026755852842, 'Testing score': 0.9933166248955723, 'Baseline': 0.9914315569487984}]

Region: Africa




[{'Response': 'y_accomodation', 'Training score': 0.9215167548500882, 'Testing score': 0.8824306472919419, 'Baseline': 0.8733884297520661}, {'Response': 'y_arrests', 'Training score': 0.9021164021164021, 'Testing score': 0.8295904887714664, 'Baseline': 0.8320661157024793}, {'Response': 'y_beatings', 'Training score': 0.9347442680776014, 'Testing score': 0.9194187582562747, 'Baseline': 0.9226446280991736}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9210758377425045, 'Testing score': 0.8309114927344782, 'Baseline': 0.5639669421487603}, {'Response': 'y_ignore', 'Training score': 0.9241622574955908, 'Testing score': 0.8203434610303831, 'Baseline': 0.5857851239669422}, {'Response': 'y_killings', 'Training score': 0.9320987654320988, 'Testing score': 0.9194187582562747, 'Baseline': 0.8981818181818182}, {'Response': 'y_shootings', 'Training score': 0.9188712522045855, 'Testing score': 0.8745046235138706, 'Baseline': 0.871404958677686}]

Region: Asia




[{'Response': 'y_accomodation', 'Training score': 0.9036144578313253, 'Testing score': 0.8736111111111111, 'Baseline': 0.8756080611535789}, {'Response': 'y_arrests', 'Training score': 0.8725671918443003, 'Testing score': 0.875, 'Baseline': 0.8485059068797776}, {'Response': 'y_beatings', 'Training score': 0.928174235403151, 'Testing score': 0.9277777777777778, 'Baseline': 0.9204308547602502}, {'Response': 'y_crowd_dispersal', 'Training score': 0.891566265060241, 'Testing score': 0.7958333333333333, 'Baseline': 0.7029186935371786}, {'Response': 'y_ignore', 'Training score': 0.8883225208526413, 'Testing score': 0.7569444444444444, 'Baseline': 0.5059068797776234}, {'Response': 'y_killings', 'Training score': 0.9531974050046339, 'Testing score': 0.9402777777777778, 'Baseline': 0.9444058373870744}, {'Response': 'y_shootings', 'Training score': 0.9531974050046339, 'Testing score': 0.9416666666666667, 'Baseline': 0.9503127171646977}]

Region: South America




[{'Response': 'y_accomodation', 'Training score': 0.9219677692960135, 'Testing score': 0.8905852417302799, 'Baseline': 0.8867684478371501}, {'Response': 'y_arrests', 'Training score': 0.9372349448685326, 'Testing score': 0.8702290076335878, 'Baseline': 0.8842239185750637}, {'Response': 'y_beatings', 'Training score': 0.9881255301102629, 'Testing score': 0.9923664122137404, 'Baseline': 0.9872773536895675}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9321458863443596, 'Testing score': 0.8473282442748091, 'Baseline': 0.6762086513994912}, {'Response': 'y_ignore', 'Training score': 0.9296013570822731, 'Testing score': 0.8244274809160306, 'Baseline': 0.5674300254452926}, {'Response': 'y_killings', 'Training score': 0.9626802374893978, 'Testing score': 0.9465648854961832, 'Baseline': 0.9472010178117048}, {'Response': 'y_shootings', 'Training score': 0.9770992366412213, 'Testing score': 0.9694656488549618, 'Baseline': 0.9713740458015268}]

Region: MENA




[{'Response': 'y_accomodation', 'Training score': 0.969173859432799, 'Testing score': 0.9188191881918819, 'Baseline': 0.9168207024029575}, {'Response': 'y_arrests', 'Training score': 0.967940813810111, 'Testing score': 0.8745387453874539, 'Baseline': 0.8428835489833642}, {'Response': 'y_beatings', 'Training score': 0.9099876695437731, 'Testing score': 0.922509225092251, 'Baseline': 0.9131238447319778}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9630086313193588, 'Testing score': 0.7896678966789668, 'Baseline': 0.6913123844731978}, {'Response': 'y_ignore', 'Training score': 0.9926017262638718, 'Testing score': 0.7859778597785978, 'Baseline': 0.5600739371534196}, {'Response': 'y_killings', 'Training score': 0.9630086313193588, 'Testing score': 0.8634686346863468, 'Baseline': 0.8826247689463955}, {'Response': 'y_shootings', 'Training score': 0.9568434032059187, 'Testing score': 0.8450184501845018, 'Baseline': 0.8706099815157117}]

Region: North America
[{'Response': 'y_accomoda

## SVC

In [17]:
def protest_by_the_response(df, response):
    X = df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions', 'notes']]

    y = df[response]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    get_numeric_data = FunctionTransformer(lambda df: df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions']], validate=False)

    get_text_data = FunctionTransformer(lambda df: df['notes'], validate=False)
    
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english'))
            ]))
         ])),
    ('svc', SVC())
    ])
    
    params = {
        'svc__C' : [0.001, 0.01, 0.1, 1, 5],
        'svc__degree' :[2,3,4],
    }
    
    gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=0)

    gs.fit(X_train, y_train)

    baseline = max([df[response].mean(), 1-df[response].mean()])
    
    reg_response_dict = {
        "Response": response,
        "Training score": gs.score(X_train, y_train),
        "Testing score": gs.score(X_test, y_test),
        "Baseline": baseline
    }
    
    return reg_response_dict

def responses_by_location(loc_df):
    return [protest_by_the_response(loc_df, response) for response in possible_responses]

def df_by_region(df, region):
    return df[df["region"]==region]

def df_by_country(df, country):
    return df[df["country"]==country]

for region in region_list:
    #Instead of printing, append lists of dictionaries to a dataframe
    print(f"Region: {region}")
    print(responses_by_location(df_by_region(df, region)))
    print()

Region: Europe




[{'Response': 'y_accomodation', 'Training score': 0.9952619843924192, 'Testing score': 0.9398496240601504, 'Baseline': 0.9220480668756531}, {'Response': 'y_arrests', 'Training score': 0.9896878483835005, 'Testing score': 0.9214703425229741, 'Baseline': 0.8854754440961338}, {'Response': 'y_beatings', 'Training score': 0.9969342251950948, 'Testing score': 0.9690893901420217, 'Baseline': 0.967816091954023}, {'Response': 'y_crowd_dispersal', 'Training score': 0.991917502787068, 'Testing score': 0.8964076858813701, 'Baseline': 0.7684430512016719}, {'Response': 'y_ignore', 'Training score': 0.9835562987736901, 'Testing score': 0.8437761069340016, 'Baseline': 0.6764890282131661}, {'Response': 'y_killings', 'Training score': 0.9927536231884058, 'Testing score': 0.9933166248955723, 'Baseline': 0.9928944618599791}, {'Response': 'y_shootings', 'Training score': 0.9983277591973244, 'Testing score': 0.9908103592314118, 'Baseline': 0.9914315569487984}]

Region: Africa




[{'Response': 'y_accomodation', 'Training score': 0.9951499118165785, 'Testing score': 0.8863936591809776, 'Baseline': 0.8733884297520661}, {'Response': 'y_arrests', 'Training score': 0.9916225749559083, 'Testing score': 0.9101717305151915, 'Baseline': 0.8320661157024793}, {'Response': 'y_beatings', 'Training score': 0.996031746031746, 'Testing score': 0.9326287978863936, 'Baseline': 0.9226446280991736}, {'Response': 'y_crowd_dispersal', 'Training score': 0.941358024691358, 'Testing score': 0.8243064729194187, 'Baseline': 0.5639669421487603}, {'Response': 'y_ignore', 'Training score': 0.9947089947089947, 'Testing score': 0.7833553500660502, 'Baseline': 0.5857851239669422}, {'Response': 'y_killings', 'Training score': 0.9964726631393298, 'Testing score': 0.9220607661822986, 'Baseline': 0.8981818181818182}, {'Response': 'y_shootings', 'Training score': 0.9955908289241623, 'Testing score': 0.8877146631439894, 'Baseline': 0.871404958677686}]

Region: Asia




[{'Response': 'y_accomodation', 'Training score': 0.9666357738646896, 'Testing score': 0.8972222222222223, 'Baseline': 0.8756080611535789}, {'Response': 'y_arrests', 'Training score': 0.9527340129749768, 'Testing score': 0.8597222222222223, 'Baseline': 0.8485059068797776}, {'Response': 'y_beatings', 'Training score': 0.9777571825764597, 'Testing score': 0.9375, 'Baseline': 0.9204308547602502}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9596848934198332, 'Testing score': 0.7958333333333333, 'Baseline': 0.7029186935371786}, {'Response': 'y_ignore', 'Training score': 0.9337349397590361, 'Testing score': 0.8041666666666667, 'Baseline': 0.5059068797776234}, {'Response': 'y_killings', 'Training score': 0.9443929564411492, 'Testing score': 0.9444444444444444, 'Baseline': 0.9444058373870744}, {'Response': 'y_shootings', 'Training score': 0.9485634847080631, 'Testing score': 0.9555555555555556, 'Baseline': 0.9503127171646977}]

Region: South America




[{'Response': 'y_accomodation', 'Training score': 0.9491094147582697, 'Testing score': 0.9007633587786259, 'Baseline': 0.8867684478371501}, {'Response': 'y_arrests', 'Training score': 0.9618320610687023, 'Testing score': 0.8982188295165394, 'Baseline': 0.8842239185750637}, {'Response': 'y_beatings', 'Training score': 0.9864291772688719, 'Testing score': 0.989821882951654, 'Baseline': 0.9872773536895675}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9016115351993215, 'Testing score': 0.8091603053435115, 'Baseline': 0.6762086513994912}, {'Response': 'y_ignore', 'Training score': 0.910941475826972, 'Testing score': 0.7837150127226463, 'Baseline': 0.5674300254452926}, {'Response': 'y_killings', 'Training score': 0.9397794741306191, 'Testing score': 0.9694656488549618, 'Baseline': 0.9472010178117048}, {'Response': 'y_shootings', 'Training score': 0.9711620016963528, 'Testing score': 0.9720101781170484, 'Baseline': 0.9713740458015268}]

Region: MENA




[{'Response': 'y_accomodation', 'Training score': 0.9161528976572133, 'Testing score': 0.922509225092251, 'Baseline': 0.9168207024029575}, {'Response': 'y_arrests', 'Training score': 0.9815043156596794, 'Testing score': 0.8523985239852399, 'Baseline': 0.8428835489833642}, {'Response': 'y_beatings', 'Training score': 0.9802712700369913, 'Testing score': 0.8966789667896679, 'Baseline': 0.9131238447319778}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9728729963008631, 'Testing score': 0.8007380073800738, 'Baseline': 0.6913123844731978}, {'Response': 'y_ignore', 'Training score': 0.9778051787916153, 'Testing score': 0.7712177121771218, 'Baseline': 0.5600739371534196}, {'Response': 'y_killings', 'Training score': 0.9901356350184957, 'Testing score': 0.8892988929889298, 'Baseline': 0.8826247689463955}, {'Response': 'y_shootings', 'Training score': 0.9753390875462392, 'Testing score': 0.9003690036900369, 'Baseline': 0.8706099815157117}]

Region: North America
[{'Response': 'y_accomo

## XGB Classifier

In [18]:
# 'xg__objective': 'binary:logistic',
#  'xg__base_score': None,
#  'xg__booster': None,
#  'xg__colsample_bylevel': None,
#  'xg__colsample_bynode': None,
#  'xg__colsample_bytree': None,
#  'xg__gamma': None,
#  'xg__gpu_id': None,
#  'xg__importance_type': 'gain',
#  'xg__interaction_constraints': None,
#  'xg__learning_rate': None,
#  'xg__max_delta_step': None,
#  'xg__max_depth': None,
#  'xg__min_child_weight': None,
#  'xg__missing': nan,
#  'xg__monotone_constraints': None,
#  'xg__n_estimators': 100,
#  'xg__n_jobs': None,
#  'xg__num_parallel_tree': None,
#  'xg__random_state': None,
#  'xg__reg_alpha': None,
#  'xg__reg_lambda': None,
#  'xg__scale_pos_weight': None,
#  'xg__subsample': None,
#  'xg__tree_method': None,
#  'xg__validate_parameters': None,
#  'xg__verbosity': None}


In [19]:
def protest_by_the_response(df, response):
    X = df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions', 'notes']]

    y = df[response]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    get_numeric_data = FunctionTransformer(lambda df: df[['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions']], validate=False)

    get_text_data = FunctionTransformer(lambda df: df['notes'], validate=False)
    
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english'))
            ]))
         ])),
    ('xg', XGBClassifier())
    ])
    
    params = {
        'xg__gamma' : [0.001, 0.01, 0.1, 1, 5],
        'xg__max_depth' :[None, 2, 3],
        'xg__learning_rate' : [0.001, 0.01, 0.1, 1, 5]
    }
    
    gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=0)

    gs.fit(X_train, y_train)

    baseline = max([df[response].mean(), 1-df[response].mean()])
    
    reg_response_dict = {
        "Response": response,
        "Training score": gs.score(X_train, y_train),
        "Testing score": gs.score(X_test, y_test),
        "Baseline": baseline
    }
    
    return reg_response_dict

def responses_by_location(loc_df):
    return [protest_by_the_response(loc_df, response) for response in possible_responses]

def df_by_region(df, region):
    return df[df["region"]==region]

def df_by_country(df, country):
    return df[df["country"]==country]

for region in region_list:
    #Instead of printing, append lists of dictionaries to a dataframe
    print(f"Region: {region}")
    print(responses_by_location(df_by_region(df, region)))
    print()

Region: Europe




[{'Response': 'y_accomodation', 'Training score': 0.9431438127090301, 'Testing score': 0.9289891395154553, 'Baseline': 0.9220480668756531}, {'Response': 'y_arrests', 'Training score': 0.9526198439241917, 'Testing score': 0.9373433583959899, 'Baseline': 0.8854754440961338}, {'Response': 'y_beatings', 'Training score': 0.9986064659977704, 'Testing score': 0.9715956558061821, 'Baseline': 0.967816091954023}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9629319955406912, 'Testing score': 0.8897243107769424, 'Baseline': 0.7684430512016719}, {'Response': 'y_ignore', 'Training score': 0.9311594202898551, 'Testing score': 0.8529657477025898, 'Baseline': 0.6764890282131661}, {'Response': 'y_killings', 'Training score': 0.9955406911928651, 'Testing score': 0.9866332497911445, 'Baseline': 0.9928944618599791}, {'Response': 'y_shootings', 'Training score': 0.999721293199554, 'Testing score': 0.9908103592314118, 'Baseline': 0.9914315569487984}]

Region: Africa




[{'Response': 'y_accomodation', 'Training score': 0.9761904761904762, 'Testing score': 0.8877146631439894, 'Baseline': 0.8733884297520661}, {'Response': 'y_arrests', 'Training score': 0.9722222222222222, 'Testing score': 0.9075297225891678, 'Baseline': 0.8320661157024793}, {'Response': 'y_beatings', 'Training score': 0.9854497354497355, 'Testing score': 0.9299867899603699, 'Baseline': 0.9226446280991736}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9651675485008818, 'Testing score': 0.8243064729194187, 'Baseline': 0.5639669421487603}, {'Response': 'y_ignore', 'Training score': 0.8619929453262787, 'Testing score': 0.8150594451783355, 'Baseline': 0.5857851239669422}, {'Response': 'y_killings', 'Training score': 1.0, 'Testing score': 0.9247027741083224, 'Baseline': 0.8981818181818182}, {'Response': 'y_shootings', 'Training score': 0.9554673721340388, 'Testing score': 0.9009247027741083, 'Baseline': 0.871404958677686}]

Region: Asia




[{'Response': 'y_accomodation', 'Training score': 0.9443929564411492, 'Testing score': 0.8694444444444445, 'Baseline': 0.8756080611535789}, {'Response': 'y_arrests', 'Training score': 0.8739573679332715, 'Testing score': 0.8625, 'Baseline': 0.8485059068797776}, {'Response': 'y_beatings', 'Training score': 0.968952734012975, 'Testing score': 0.9333333333333333, 'Baseline': 0.9204308547602502}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9253938832252085, 'Testing score': 0.7861111111111111, 'Baseline': 0.7029186935371786}, {'Response': 'y_ignore', 'Training score': 0.8243744207599629, 'Testing score': 0.775, 'Baseline': 0.5059068797776234}, {'Response': 'y_killings', 'Training score': 0.9670991658943466, 'Testing score': 0.9513888888888888, 'Baseline': 0.9444058373870744}, {'Response': 'y_shootings', 'Training score': 0.9541241890639481, 'Testing score': 0.9430555555555555, 'Baseline': 0.9503127171646977}]

Region: South America




[{'Response': 'y_accomodation', 'Training score': 0.9380831212892281, 'Testing score': 0.8982188295165394, 'Baseline': 0.8867684478371501}, {'Response': 'y_arrests', 'Training score': 0.9499575911789653, 'Testing score': 0.9134860050890585, 'Baseline': 0.8842239185750637}, {'Response': 'y_beatings', 'Training score': 0.9881255301102629, 'Testing score': 0.9847328244274809, 'Baseline': 0.9872773536895675}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9448685326547922, 'Testing score': 0.8524173027989822, 'Baseline': 0.6762086513994912}, {'Response': 'y_ignore', 'Training score': 0.905852417302799, 'Testing score': 0.7659033078880407, 'Baseline': 0.5674300254452926}, {'Response': 'y_killings', 'Training score': 0.9669211195928753, 'Testing score': 0.9465648854961832, 'Baseline': 0.9472010178117048}, {'Response': 'y_shootings', 'Training score': 0.9940627650551315, 'Testing score': 0.9796437659033079, 'Baseline': 0.9713740458015268}]

Region: MENA




[{'Response': 'y_accomodation', 'Training score': 0.9704069050554871, 'Testing score': 0.9114391143911439, 'Baseline': 0.9168207024029575}, {'Response': 'y_arrests', 'Training score': 0.8939580764488286, 'Testing score': 0.8634686346863468, 'Baseline': 0.8428835489833642}, {'Response': 'y_beatings', 'Training score': 0.9605425400739828, 'Testing score': 0.9188191881918819, 'Baseline': 0.9131238447319778}, {'Response': 'y_crowd_dispersal', 'Training score': 0.9753390875462392, 'Testing score': 0.8413284132841329, 'Baseline': 0.6913123844731978}, {'Response': 'y_ignore', 'Training score': 0.87422934648582, 'Testing score': 0.8154981549815498, 'Baseline': 0.5600739371534196}, {'Response': 'y_killings', 'Training score': 0.9815043156596794, 'Testing score': 0.8929889298892989, 'Baseline': 0.8826247689463955}, {'Response': 'y_shootings', 'Training score': 0.9013563501849569, 'Testing score': 0.8708487084870848, 'Baseline': 0.8706099815157117}]

Region: North America
[{'Response': 'y_accomod

KeyboardInterrupt: 

## FNN

In [44]:
def protest_by_the_response(df, response):
    X = df[['notes']]
    y = df[response]

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    def fnn(hl_1=32, hl_2=16, d1=0.5, d2=0.5):
        model = Sequential()
        
        model.add(Dense(hl_1, activation='relu', input_shape=(X_train.shape[1],)))
        model.add(Dropout(d1))
        
        model.add(Dense(hl_2, activation='relu'))
        model.add(Dropout(d2))
        
        model.add(Dense(1, activation='sigmoid'))
        
        model.compile(loss='bce', optimizer='adam')
        
        return model
    
    fnn_model = KerasClassifier(fnn, epochs=10, batch_size=512, verbose=0)
    
    pipe = Pipeline([
        ('cvec', CountVectorizer(stop_words='english')),
        ('fnn', fnn_model)])
    
    params = {
        'fnn__epochs' :[10, 30, 50]
    }
    
    gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=0)

    gs.fit(X_train, y_train)

    baseline = max([df[response].mean(), 1-df[response].mean()])
    
    reg_response_dict = {
        "Response": response,
        "Training score": gs.score(X_train, y_train),
        "Testing score": gs.score(X_test, y_test),
        "Baseline": baseline
    }
    
    return reg_response_dict

def responses_by_location(loc_df):
    return [protest_by_the_response(loc_df, response) for response in possible_responses]

def df_by_region(df, region):
    return df[df["region"]==region]

def df_by_country(df, country):
    return df[df["country"]==country]

for region in region_list:
    #Instead of printing, append lists of dictionaries to a dataframe
    print(f"Region: {region}")
    print(responses_by_location(df_by_region(df, region)))
    print()

Region: Europe


ValueError: Dimensions 1 and 3588 are not compatible

In [37]:
X = df[['notes']]
y = df['y_ignore']

X_train, X_test, y_train, y_test = train_test_split(X, y)

def fnn(hl_1=32):
    model = Sequential()

    model.add(Dense(h1_1, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(0.5))

    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='bce', optimizer='adam')

    return model

fnn_model = KerasClassifier(fnn, epochs=10, batch_size=512, verbose=0)


pipe = Pipeline([(
'cvec', CountVectorizer(stop_words='english')),
('fnn', fnn_model)])

In [38]:
pipe.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer(stop_words='english')),
  ('fnn',
   <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier at 0x7fe6107cde80>)],
 'verbose': False,
 'cvec': CountVectorizer(stop_words='english'),
 'fnn': <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier at 0x7fe6107cde80>,
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': 'english',
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'fnn__epochs': 10,
 'fnn__batch_size': 512,
 'fnn__verbose': 0,
 'fnn__build_fn': <function __main__.fnn(hl_1=32)>}