In [36]:
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score
from sklearn.linear_model import RidgeClassifier


UNREST_COLUMNS = ["EVENT_ID_CNTY", 
                    "EVENT_DATE", 
                    "EVENT_TYPE", 
                    "REGION", 
                "FATALITIES",
                "TIMESTAMP"]

CASES_COLUMNS = ["iso_code",
                "continent",
                "location", 
                "date", 
                "total_cases", 
                "new_cases", 
                "total_deaths", 
                "reproduction_rate", 
                "hosp_patients", 
                "positive_rate", 
                "stringency_index", 
                "population",
                "median_age",
                "gdp_per_capita",
                "life_expectancy",
            ]

#Serialize the unique values in a dataframe column
def serialize(dataFrame, column):
    return [x for x in range(len(dataFrame[column].unique()))]

#Replace a column in a dataframe with a serialized version
def replaceDict(dataFrame, column):
    vals = serialize(dataFrame, column)
    return dict(zip(dataFrame[column].unique(), vals))

#Return rows where the specified column equals any one of the search terms
def multiSearch(df, column, searchTerms):
    if type(searchTerms) is list:
        return df.query(' | '.join(
            [f'{column} == "{term}"' for term in searchTerms]
        ))
    elif type(searchTerms) is str:
        return df.query(f'{column} == "{searchTerms}"')
    else:
        return df.query(f'{column} == {searchTerms}')

#Return rows where the specified column contains any one of the search terms
def multiContains(df, column, searchTerms):
    if type(searchTerms) is list:
        return df[df[column].str.contains('|'.join(searchTerms))]
    else:
        return df[df[column].str.contains(searchTerms)]

#Create the training data set of merged PD's and the result
def retrieveTrainingData(isoCodes):
    #Read in data from files
    unrest_df = pd.read_csv("./coronavirus_Oct31.csv")
    #Remove rows with event type of strategic developments
    unrest_df = unrest_df[unrest_df.EVENT_TYPE != 'Strategic developments']
 
    covid_cases_df = pd.read_csv("./owid-covid-data.csv")

    #Filter columns that wont be used as features
    unrest_df = unrest_df[unrest_df.columns.intersection(UNREST_COLUMNS)]
    covid_cases_df = covid_cases_df[covid_cases_df.columns.intersection(CASES_COLUMNS)]

    #Get data based on the input iso country codes
    unrest = multiContains(unrest_df, "EVENT_ID_CNTY", isoCodes)
    cases = multiSearch(covid_cases_df, 'iso_code', isoCodes)

    #Convert "date" type columns to dates
    unrest.EVENT_DATE = pd.to_datetime(unrest.EVENT_DATE)
    cases.date = pd.to_datetime(cases.date) 

    #Merge the two datasets with an inner join on the date fields
    merge = unrest.merge(cases, how="inner", left_on="EVENT_DATE", right_on="date")

    #Drop the iso code to avoid duplicates
    merge = merge.drop(['EVENT_ID_CNTY'], axis=1)
    #Drop remaining duplicates
    merge = merge.drop_duplicates()

    #Get the list of event types in this particular set of data
    issueType = merge['EVENT_TYPE']
    #Serialize the data and return it as the expected values for training
    issueType = issueType.replace(replaceDict(unrest_df, "EVENT_TYPE"))

    #Drop remaining unneeded data
    merge = merge.drop(['EVENT_TYPE', 'EVENT_DATE', 'REGION', 'iso_code', 'continent', 'location', 'date', 'TIMESTAMP'], axis=1).fillna(0)

    #Return
    return merge, issueType

trainingData, issueType = retrieveTrainingData(['AFG','ARG', 'BDG'])

#Normalize data between -1 and 1
trainingDataNormal = min_max_scaler.fit_transform(trainingData)
trainingDataNormal = pd.DataFrame(trainingDataNormal)

generalizedTest, issueType2 = retrieveTrainingData(['BOL'])

#Normalize data between -1 and 1
generalizedTestNormal = min_max_scaler.fit_transform(generalizedTest)
generalizedTestNormal = pd.DataFrame(generalizedTestNormal)


#Create Decision Tree Classifier
dTree = DecisionTreeClassifier(max_depth=5)
dTree.fit(trainingDataNormal, issueType)
pred = dTree.predict(trainingDataNormal)
pred2 = dTree.predict(generalizedTestNormal)
print("Decision tree")
print(precision_score(issueType, pred, average="micro"))
print(precision_score(issueType2, pred2, average="micro"))

#Create Ridge regression Classifier
ridge = RidgeClassifier()
ridge.fit(trainingDataNormal, issueType)
ridgePred = ridge.predict(trainingDataNormal)
ridgePredGen = ridge.predict(generalizedTestNormal)
print("Ridge")
print(precision_score(issueType, ridgePred, average="micro"))
print(precision_score(issueType2, ridgePredGen, average="micro"))


['Protests' 'Riots' 'Battles' 'Violence against civilians'
 'Explosions/Remote violence']
['Protests' 'Riots' 'Battles' 'Violence against civilians'
 'Explosions/Remote violence']
['Protests' 'Riots' 'Battles' 'Violence against civilians'
 'Explosions/Remote violence']
Decision tree
0.9046920821114369
0.803030303030303
Ridge
0.8914956011730205
0.803030303030303
