In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from collections import Counter

pd.options.mode.chained_assignment = 'warn'

UNREST_COLUMNS = ["EVENT_ID_CNTY", 
                    "EVENT_DATE", 
                    "EVENT_TYPE", 
                    "REGION", 
                "FATALITIES",
                "TIMESTAMP"]

CASES_COLUMNS = ["iso_code",
                "continent",
                "location", 
                "date", 
                "total_cases", 
                "new_cases", 
                "total_deaths", 
                "reproduction_rate", 
                "hosp_patients", 
                "positive_rate", 
                "stringency_index", 
                "population",
                "median_age",
                "gdp_per_capita",
                "life_expectancy",
            ]

def serialize(dataFrame, column):
    return [x for x in range(len(dataFrame.loc[:, column].unique()))]

def replaceDict(dataFrame, column):
    vals = serialize(dataFrame, column)
    return dict(zip(dataFrame.loc[:, column].unique(), vals))

def multiSearch(df, column, searchTerms):
    if type(searchTerms) is list:
        return df.query(' | '.join(
            [f'{column} == "{term}"' for term in searchTerms]
        ))
    elif type(searchTerms) is str:
        return df.query(f'{column} == "{searchTerms}"')
    else:
        return df.query(f'{column} == {searchTerms}')

def multiContains(df, column, searchTerms):
    if type(searchTerms) is list:
        return df[df.loc[:, column].str.contains('|'.join(searchTerms))]
    else:
        return df[df.loc[:, column].str.contains(searchTerms)]

#Create the training data set of merged PD's and the result
def retrieveTrainingData(isoCodes=None):
    unrest_df = pd.read_csv("./coronavirus_Oct31.csv")
    unrest_df = unrest_df[unrest_df.loc[:, "EVENT_TYPE"] != 'Strategic developments']
 
    covid_cases_df = pd.read_csv("./owid-covid-data.csv")
    covid_cases_df = covid_cases_df[covid_cases_df.loc[:, "iso_code"] != "OWID_WRL"]
    covid_cases_df.dropna() 

    classes = unrest_df.EVENT_TYPE.unique()
    print(classes)

    unrest_df = unrest_df.loc[:, unrest_df.columns.intersection(UNREST_COLUMNS)]
    covid_cases_df = covid_cases_df.loc[:, covid_cases_df.columns.intersection(CASES_COLUMNS)]

    if (isoCodes == None):
      unrest = unrest_df
      cases = covid_cases_df
    else:
      unrest = multiContains(unrest_df, "EVENT_ID_CNTY", isoCodes)
      cases = multiSearch(covid_cases_df, 'iso_code', isoCodes)

    unrest.loc[:, "EVENT_DATE"] = pd.to_datetime(unrest.loc[:, "EVENT_DATE"])
    cases.loc[:, "date"] = pd.to_datetime(cases.loc[:, "date"]) 

    merge = unrest.merge(cases, how="inner", left_on="EVENT_DATE", right_on="date")

    merge = merge.drop(['EVENT_ID_CNTY'], axis=1)
    merge = merge.drop_duplicates()

    issueType = merge['EVENT_TYPE']
    issueType = issueType.replace(replaceDict(unrest_df, "EVENT_TYPE"))

    merge = merge.drop(['EVENT_TYPE', 'EVENT_DATE', 'REGION', 'iso_code', 'continent', 'location', 'date', 'TIMESTAMP', "FATALITIES"], axis=1).fillna(0)

    return merge, issueType, classes

data, target, classes = retrieveTrainingData()

print(Counter(target))
print(len(target))

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

print(Counter(y_train))
#the data is fairly imbalanced so use random overfitting first to incrase ratios of minor classes
# over_sampler = RandomOverSampler(sampling_strategy={2:78712, 4:78712})
# under_sampler = RandomUnderSampler({0:111928, 1:105964})

# X_train, y_train = over_sampler.fit_resample(X_train, y_train)
# X_train, y_train = under_sampler.fit_resample(X_train, y_train)

resampler = SMOTEENN()
X_train, y_train = resampler.fit_resample(X_train, y_train)

print(Counter(y_train))

model = MLPClassifier(hidden_layer_sizes=5, alpha=1.0/5).fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("training\n", classification_report(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))

print("test\n", classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))


['Protests' 'Riots' 'Battles' 'Violence against civilians'
 'Explosions/Remote violence']
Counter({0: 983573, 1: 265025, 3: 109838, 2: 15964, 4: 1869})
1376269
Counter({0: 786862, 1: 212039, 3: 87819, 2: 12785, 4: 1510})
Counter({4: 660170, 2: 435824, 0: 221379, 3: 142334, 1: 78468})
training
               precision    recall  f1-score   support

           0       0.93      0.02      0.03    221379
           1       0.00      0.00      0.00     78468
           2       0.00      0.00      0.00    435824
           3       0.00      0.00      0.00    142334
           4       0.43      1.00      0.60    660170

    accuracy                           0.43   1538175
   macro avg       0.27      0.20      0.13   1538175
weighted avg       0.32      0.43      0.26   1538175

[[  3863      0      0      0 217516]
 [     2      0      0      0  78466]
 [     0      0      0      0 435824]
 [     0      0      0      0 142334]
 [   281      0      0      0 659889]]
test
               preci

In [None]:
import matplotlib.pyplot as plt

# plt.rc()