In [17]:
import pandas as pd
from sklearn import preprocessing
from sklearn import svm

def serialize(dataFrame, column):
    """Return one consecutive integer code per distinct value of a column.

    Args:
        dataFrame: Frame that holds ``column``.
        column: Label of the column whose distinct values are counted.

    Returns:
        list[int]: ``[0, 1, ..., k - 1]`` where ``k`` is the number of
        entries returned by ``dataFrame[column].unique()``.
    """
    distinct_values = dataFrame[column].unique()
    return list(range(len(distinct_values)))

def replaceDict(dataFrame, column):
    """Build a ``value -> integer code`` mapping for a column.

    Codes are assigned in order of first appearance, i.e. the order
    produced by ``dataFrame[column].unique()``. The distinct values are
    computed once and numbered with ``enumerate`` instead of being
    computed twice and zipped against a separately built range.

    Args:
        dataFrame: Frame that holds ``column``.
        column: Label of the column to encode.

    Returns:
        dict: Maps each distinct value to its integer code (0-based).
    """
    return {value: code for code, value in enumerate(dataFrame[column].unique())}



def multiSearch(df, column, searchTerms):
    """Return the rows of ``df`` whose ``column`` equals a search term.

    The filter is a boolean mask rather than a string handed to
    ``DataFrame.query``. Interpolating terms into a query expression broke
    on terms containing a double quote, on column names that are not valid
    identifiers, and on an empty list (``query('')`` raises).

    Args:
        df: Frame to filter.
        column: Label of the column to compare.
        searchTerms: Either a collection (list, tuple, set, frozenset) of
            accepted values, or a single scalar value. Values are compared
            with their own type; they are not converted to strings.

    Returns:
        DataFrame: The matching rows. An empty collection matches nothing.
    """
    values = df[column]
    if isinstance(searchTerms, (list, tuple, set, frozenset)):
        # Membership test == "equals any of the terms".
        return df[values.isin(list(searchTerms))]
    return df[values == searchTerms]

def multiContains(df, column, searchTerms):
    """Return the rows of ``df`` whose ``column`` text matches a search term.

    Terms are regular-expression patterns, as before. Several terms are
    combined with ``|`` so a row matches when any one of them is found.

    Args:
        df: Frame to filter.
        column: Label of a string column.
        searchTerms: A single pattern string, or a collection (list, tuple,
            set, frozenset) of pattern strings.

    Returns:
        DataFrame: The matching rows. Missing values in ``column`` never
        match, and an empty collection matches nothing.
    """
    if isinstance(searchTerms, (list, tuple, set, frozenset)):
        terms = list(searchTerms)
        if not terms:
            # '|'.join([]) is '', which would match every row.
            return df.iloc[:0]
        pattern = '|'.join(terms)
    else:
        pattern = searchTerms
    # na=False keeps the mask strictly boolean when the column holds NaN;
    # otherwise indexing with the mask raises.
    return df[df[column].str.contains(pattern, na=False)]


# Columns kept from the unrest events CSV.
UNREST_COLUMNS = [
    "EVENT_ID_CNTY",
    "EVENT_DATE",
    "EVENT_TYPE",
    "REGION",
    "FATALITIES",
]

# Columns kept from the COVID cases CSV.
CASES_COLUMNS = [
    "iso_code",
    "continent",
    "location",
    "date",
    "total_cases",
    "new_cases",
    "total_deaths",
    "reproduction_rate",
    "hosp_patients",
    "positive_rate",
    "stringency_index",
    "population",
    "median_age",
    "gdp_per_capita",
    "life_expectancy",
]

# Load both raw tables and keep only the columns listed in the constants.
unrest_df = pd.read_csv("./coronavirus_Oct31.csv")
covid_cases_df = pd.read_csv("./owid-covid-data.csv")

unrest_df = unrest_df[unrest_df.columns.intersection(UNREST_COLUMNS)]
covid_cases_df = covid_cases_df[covid_cases_df.columns.intersection(CASES_COLUMNS)]

# AFG + IRL subsets (not used further down in this cell).
irl_afg = multiSearch(covid_cases_df, 'iso_code', ['AFG', 'IRL'])
irl_afg_unrest = multiContains(unrest_df, "EVENT_ID_CNTY", ['AFG', 'IRL'])

# AFG-only subsets used to build the model input.
unrest_afg = multiContains(unrest_df, "EVENT_ID_CNTY", "AFG")
cases_afg = multiSearch(covid_cases_df, 'iso_code', "AFG")

# Parse dates into new frames instead of assigning an attribute on a
# filtered frame, which triggers SettingWithCopyWarning.
unrest_afg = unrest_afg.assign(EVENT_DATE=pd.to_datetime(unrest_afg["EVENT_DATE"]))
cases_afg = cases_afg.assign(date=pd.to_datetime(cases_afg["date"]))

# One row per unrest event that has case data for the same date.
merge = unrest_afg.merge(cases_afg, how="inner", left_on="EVENT_DATE", right_on="date")

classification = serialize(unrest_df, "EVENT_TYPE")

# Take the labels from the merged frame, not from unrest_afg: the inner join
# can drop events without a matching date, so only the merged rows are
# guaranteed to line up one-to-one with the feature rows below.
# The encoding is built from the full unrest table, so every label has a code.
issueType = merge['EVENT_TYPE'].map(replaceDict(unrest_df, "EVENT_TYPE"))

# Keep only the numeric feature columns; missing values become 0.
merge = merge.drop(['EVENT_TYPE', 'EVENT_DATE', 'EVENT_ID_CNTY', 'REGION', 'iso_code', 'continent', 'location', 'date'], axis=1).fillna(0)

# Scale every feature into [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
mergeNormal = pd.DataFrame(min_max_scaler.fit_transform(merge))

# Fit the classifier and predict on the same rows. These are training-set
# predictions, not a held-out evaluation.
clf = svm.SVC()
clf.fit(mergeNormal, issueType)
clf.predict(mergeNormal)






array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3])

In [14]:
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score

def serialize(dataFrame, column):
    """Return one consecutive integer code per distinct value of a column.

    Args:
        dataFrame: Frame that holds ``column``.
        column: Label of the column whose distinct values are counted.

    Returns:
        list[int]: ``[0, 1, ..., k - 1]`` where ``k`` is the number of
        entries returned by ``dataFrame[column].unique()``.
    """
    distinct_values = dataFrame[column].unique()
    return list(range(len(distinct_values)))

def replaceDict(dataFrame, column):
    """Build a ``value -> integer code`` mapping for a column.

    Codes are assigned in order of first appearance, i.e. the order
    produced by ``dataFrame[column].unique()``. The distinct values are
    computed once and numbered with ``enumerate`` instead of being
    computed twice and zipped against a separately built range.

    Args:
        dataFrame: Frame that holds ``column``.
        column: Label of the column to encode.

    Returns:
        dict: Maps each distinct value to its integer code (0-based).
    """
    return {value: code for code, value in enumerate(dataFrame[column].unique())}



def multiSearch(df, column, searchTerms):
    """Return the rows of ``df`` whose ``column`` equals a search term.

    The filter is a boolean mask rather than a string handed to
    ``DataFrame.query``. Interpolating terms into a query expression broke
    on terms containing a double quote, on column names that are not valid
    identifiers, and on an empty list (``query('')`` raises).

    Args:
        df: Frame to filter.
        column: Label of the column to compare.
        searchTerms: Either a collection (list, tuple, set, frozenset) of
            accepted values, or a single scalar value. Values are compared
            with their own type; they are not converted to strings.

    Returns:
        DataFrame: The matching rows. An empty collection matches nothing.
    """
    values = df[column]
    if isinstance(searchTerms, (list, tuple, set, frozenset)):
        # Membership test == "equals any of the terms".
        return df[values.isin(list(searchTerms))]
    return df[values == searchTerms]

def multiContains(df, column, searchTerms):
    """Return the rows of ``df`` whose ``column`` text matches a search term.

    Terms are regular-expression patterns, as before. Several terms are
    combined with ``|`` so a row matches when any one of them is found.

    Args:
        df: Frame to filter.
        column: Label of a string column.
        searchTerms: A single pattern string, or a collection (list, tuple,
            set, frozenset) of pattern strings.

    Returns:
        DataFrame: The matching rows. Missing values in ``column`` never
        match, and an empty collection matches nothing.
    """
    if isinstance(searchTerms, (list, tuple, set, frozenset)):
        terms = list(searchTerms)
        if not terms:
            # '|'.join([]) is '', which would match every row.
            return df.iloc[:0]
        pattern = '|'.join(terms)
    else:
        pattern = searchTerms
    # na=False keeps the mask strictly boolean when the column holds NaN;
    # otherwise indexing with the mask raises.
    return df[df[column].str.contains(pattern, na=False)]


# Columns kept from the unrest events CSV.
UNREST_COLUMNS = [
    "EVENT_ID_CNTY",
    "EVENT_DATE",
    "EVENT_TYPE",
    "REGION",
    "FATALITIES",
    "TIMESTAMP",
]

# Columns kept from the COVID cases CSV.
CASES_COLUMNS = [
    "iso_code",
    "continent",
    "location",
    "date",
    "total_cases",
    "new_cases",
    "total_deaths",
    "reproduction_rate",
    "hosp_patients",
    "positive_rate",
    "stringency_index",
    "population",
    "median_age",
    "gdp_per_capita",
    "life_expectancy",
]

# Load both raw tables and keep only the columns listed in the constants.
unrest_df = pd.read_csv("./coronavirus_Oct31.csv")
covid_cases_df = pd.read_csv("./owid-covid-data.csv")
min_max_scaler = preprocessing.MinMaxScaler()

unrest_df = unrest_df[unrest_df.columns.intersection(UNREST_COLUMNS)]
covid_cases_df = covid_cases_df[covid_cases_df.columns.intersection(CASES_COLUMNS)]

# One label encoding, built from the full unrest table, shared by both sets
# so that a given event type always maps to the same integer.
classification = serialize(unrest_df, "EVENT_TYPE")
event_type_codes = replaceDict(unrest_df, "EVENT_TYPE")

# Non-feature columns removed before scaling (identical for both sets so the
# feature columns line up position by position).
feature_drop_columns = ['EVENT_TYPE', 'EVENT_DATE', 'REGION', 'iso_code', 'continent', 'location', 'date']


# ---- Training set: AFG + ARG ----
unrest_afg = multiContains(unrest_df,"EVENT_ID_CNTY",['AFG','ARG'])
cases_afg = multiSearch(covid_cases_df, 'iso_code', ['AFG','ARG'])
print(unrest_afg)
unrest_afg = unrest_afg.assign(EVENT_DATE=pd.to_datetime(unrest_afg["EVENT_DATE"]))
cases_afg = cases_afg.assign(date=pd.to_datetime(cases_afg["date"]))

# NOTE(review): the join key is the date only, while both frames hold two
# countries here, so an event of one country is also paired with the other
# country's case row for that date. Confirm whether a country key is needed.
merge = unrest_afg.merge(cases_afg, how="inner", left_on="EVENT_DATE", right_on="date")

merge = merge.drop(['EVENT_ID_CNTY'], axis=1)
merge = merge.drop_duplicates()

# Labels come from the merged frame so they align with the feature rows.
issueType = merge['EVENT_TYPE'].map(event_type_codes)

merge = merge.drop(feature_drop_columns, axis=1).fillna(0)

# Fit the scaler on the training features only.
mergeNormal = pd.DataFrame(min_max_scaler.fit_transform(merge))


# ---- Evaluation set: ARG only ----
# The names say "afg" but the filter is 'ARG'; names are kept so that other
# cells referring to them keep working.
afg_unrest = multiContains(unrest_df, "EVENT_ID_CNTY", 'ARG')
afg = multiSearch(covid_cases_df, 'iso_code', 'ARG')

# Convert each frame's own date column. The previous code read the dates
# from unrest_afg / cases_afg, which are not defined yet on a fresh kernel
# and, when left over from an earlier run, belong to different rows.
afg_unrest = afg_unrest.assign(EVENT_DATE=pd.to_datetime(afg_unrest["EVENT_DATE"]))
afg = afg.assign(date=pd.to_datetime(afg["date"]))

merge2 = afg_unrest.merge(afg, how="inner", left_on="EVENT_DATE", right_on="date")

merge2 = merge2.drop(['EVENT_ID_CNTY'], axis=1)
merge2 = merge2.drop_duplicates()

issueType2 = merge2['EVENT_TYPE'].map(event_type_codes)

merge2 = merge2.drop(feature_drop_columns, axis=1).fillna(0)

# Reuse the scaler fitted on the training features (transform, not
# fit_transform) so both sets share one scale.
mergeNormal2 = pd.DataFrame(min_max_scaler.transform(merge2))


# ---- Model ----
# random_state makes tie-breaking between equally good splits repeatable.
dTree = DecisionTreeClassifier(max_depth=5, random_state=0)
dTree.fit(mergeNormal, issueType)
pred = dTree.predict(mergeNormal)
pred2 = dTree.predict(mergeNormal2)
# First score is on the training rows; the second is on the ARG rows, which
# are also part of the training countries, so neither is a held-out estimate.
print(precision_score(issueType, pred, average="micro"))
print(precision_score(issueType2, pred2, average="micro"))




     EVENT_ID_CNTY       EVENT_DATE              EVENT_TYPE  \
24        AFG44949    22-March-2020  Strategic developments   
25        AFG44932    25-March-2020  Strategic developments   
26        AFG44930    25-March-2020  Strategic developments   
27        AFG44931    25-March-2020  Strategic developments   
28        AFG44951    26-March-2020  Strategic developments   
...            ...              ...                     ...   
1248       ARG3568  30-October-2020                Protests   
1249       ARG3570  31-October-2020                Protests   
1250       ARG3571  31-October-2020                   Riots   
1251       ARG3572  31-October-2020                Protests   
1252       ARG3569  31-October-2020                Protests   

                         REGION  FATALITIES   TIMESTAMP  
24    Caucasus and Central Asia           0  1586190074  
25    Caucasus and Central Asia           0  1586190085  
26    Caucasus and Central Asia           0  1586190085  
27    Cauca