In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so files stored there can be
# read through ordinary paths below the mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [30]:
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D, LeakyReLU
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import shuffle


# Emit a warning (rather than raising or staying silent) on chained assignment.
pd.set_option("mode.chained_assignment", "warn")

# Columns retained from the unrest events CSV; everything else is discarded.
UNREST_COLUMNS = [
    "EVENT_ID_CNTY",
    "EVENT_DATE",
    "EVENT_TYPE",
    "REGION",
    "FATALITIES",
    "TIMESTAMP",
]

# Columns retained from the COVID cases CSV; everything else is discarded.
CASES_COLUMNS = [
    "iso_code",
    "continent",
    "location",
    "date",
    "total_cases",
    "new_cases",
    "total_deaths",
    "reproduction_rate",
    "hosp_patients",
    "positive_rate",
    "stringency_index",
    "population",
    "median_age",
    "gdp_per_capita",
    "life_expectancy",
]

def serialize(dataFrame, column):
    """Return the integer codes ``[0, 1, ..., k-1]`` for a column.

    ``k`` is the number of distinct values found in ``dataFrame[column]``
    (as reported by ``Series.unique()``).
    """
    distinct_values = dataFrame[column].unique()
    return list(range(len(distinct_values)))

def replaceDict(dataFrame, column):
    """Map each distinct value of ``dataFrame[column]`` to an integer code.

    Codes are assigned 0, 1, 2, ... in the order the values are returned by
    ``Series.unique()`` (order of first appearance).
    """
    distinct_values = dataFrame[column].unique()
    return {value: code for code, value in enumerate(distinct_values)}

def multiSearch(df, column, searchTerms):
    """Return the rows of ``df`` whose ``column`` equals one of ``searchTerms``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Label of the column to compare.
    searchTerms : list, tuple, set, or scalar
        A collection selects rows matching any of its members; an empty
        collection selects no rows. Any other value is compared for
        equality directly.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order.

    Notes
    -----
    Matching uses boolean masks instead of a text expression passed to
    ``DataFrame.query``, so terms containing quote characters and column
    labels that are not valid identifiers are handled, and an empty
    collection no longer raises.
    """
    if isinstance(searchTerms, (list, tuple, set)):
        mask = df[column].isin(list(searchTerms))
    else:
        mask = df[column] == searchTerms
    return df[mask]

def multiContains(df, column, searchTerms):
    """Return the rows of ``df`` whose ``column`` text contains a search term.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Label of a string column.
    searchTerms : list, tuple, set, or str
        A collection is joined with ``|`` so that a row matches when it
        contains any of the terms; an empty collection selects no rows.
        A single string is used as the pattern directly. Terms are passed to
        ``Series.str.contains`` unescaped, i.e. they are treated as regular
        expressions.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order.
    """
    if isinstance(searchTerms, (list, tuple, set)):
        if not searchTerms:
            # Joining nothing yields the empty pattern, which matches every
            # row; "no terms" should select nothing instead.
            return df.iloc[0:0]
        pattern = '|'.join(searchTerms)
    else:
        pattern = searchTerms
    # na=False: missing values count as "no match". Without it the mask
    # contains NaN and boolean indexing raises.
    return df[df[column].str.contains(pattern, na=False)]

#Create the training data set (merged unrest + COVID frames) and its labels
def retrieveTrainingData(isoCodes=None,
                         unrestCsvPath="./drive/MyDrive/machine-learning/coronavirus_Oct31.csv",
                         casesCsvPath="./drive/MyDrive/machine-learning/owid-covid-data.csv"):
    """Load both CSVs, join them on date and return ``(features, labels)``.

    Parameters
    ----------
    isoCodes : list of str, str, or None
        When given, unrest rows are kept if ``EVENT_ID_CNTY`` contains one of
        the codes and case rows are kept if ``iso_code`` equals one of them.
        ``None`` (the default) keeps every row.
    unrestCsvPath, casesCsvPath : str
        Locations of the two input CSV files.

    Returns
    -------
    merge : pandas.DataFrame
        The joined rows with the identifier, date, text and label columns
        dropped and remaining missing values replaced by 0.
    issueType : pandas.Series
        ``EVENT_TYPE`` of each joined row, replaced by the integer code from
        ``replaceDict``. Shares its index with ``merge``.

    Side effects
    ------------
    Prints the distinct ``EVENT_TYPE`` values of the unrest data.
    """
    unrest_df = pd.read_csv(unrestCsvPath)
    unrest_df = unrest_df[unrest_df["EVENT_TYPE"] != 'Strategic developments']

    covid_cases_df = pd.read_csv(casesCsvPath)
    covid_cases_df = covid_cases_df[covid_cases_df["iso_code"] != "OWID_WRL"]
    # A bare `covid_cases_df.dropna()` used to sit here; its result was thrown
    # away, so it never removed anything. It is deliberately not "fixed" by
    # assigning the result: missing values are handled by fillna(0) below.

    print(unrest_df.EVENT_TYPE.unique())

    # Keep only the columns of interest; intersection() tolerates CSVs that
    # lack some of the listed columns.
    unrest_df = unrest_df.loc[:, unrest_df.columns.intersection(UNREST_COLUMNS)]
    covid_cases_df = covid_cases_df.loc[:, covid_cases_df.columns.intersection(CASES_COLUMNS)]

    if isoCodes is None:
        unrest = unrest_df
        cases = covid_cases_df
    else:
        unrest = multiContains(unrest_df, "EVENT_ID_CNTY", isoCodes)
        cases = multiSearch(covid_cases_df, 'iso_code', isoCodes)

    # assign() builds new frames, so the date columns get a real datetime
    # dtype and no write is attempted on a filtered slice of another frame.
    unrest = unrest.assign(EVENT_DATE=pd.to_datetime(unrest["EVENT_DATE"]))
    cases = cases.assign(date=pd.to_datetime(cases["date"]))

    # NOTE(review): the join key is the date alone, so each unrest row is
    # paired with every case row sharing that date, regardless of country.
    # If a per-country join is intended, a country key must be added -- confirm.
    merge = unrest.merge(cases, how="inner", left_on="EVENT_DATE", right_on="date")

    merge = merge.drop(['EVENT_ID_CNTY'], axis=1)
    merge = merge.drop_duplicates()

    # Codes are derived from the unfiltered unrest frame, so a given event
    # type receives the same code whatever isoCodes selects.
    issueType = merge['EVENT_TYPE'].replace(replaceDict(unrest_df, "EVENT_TYPE"))

    merge = merge.drop(
        ['EVENT_TYPE', 'EVENT_DATE', 'REGION', 'iso_code', 'continent',
         'location', 'date', 'TIMESTAMP', "FATALITIES"],
        axis=1,
    ).fillna(0)

    return merge, issueType

data, target = retrieveTrainingData()

#convert data to Tensorflow dataset
# Fixed seed so the shuffled order is reproducible between runs.
SHUFFLE_SEED = 42

# Elements are (features, label) pairs: one float32 row of `data` and the
# matching int32 entry of `target`.
tensor_data = tf.data.Dataset.from_tensor_slices(
    (
        tf.cast(data.values, tf.float32),
        tf.cast(target, tf.int32)
    )
).shuffle(
    # A buffer covering the whole dataset is needed for a uniform shuffle; a
    # tiny buffer only perturbs the original row order locally.
    # max(..., 1) because shuffle() rejects a zero-sized buffer.
    buffer_size=max(len(data), 1),
    seed=SHUFFLE_SEED,
    # Keep one fixed order for every iteration: the index-based train/test
    # split below enumerates this dataset twice and needs both passes to agree.
    reshuffle_each_iteration=False,
)

print(tf.data.experimental.cardinality(tensor_data).numpy())

#split data into train and test data
# Every TEST_DIVISOR-th row (indices 0, TEST_DIVISOR, 2*TEST_DIVISOR, ...)
# goes to the test set; all other rows go to the training set.
TEST_DIVISOR = 10

def is_test_data(x, y):
  """Return whether the element at enumerate index ``x`` is a test row.

  ``y`` (the element itself) is ignored. The remainder is compared with 0:
  ``x % TEST_DIVISOR`` only takes values below ``TEST_DIVISOR``, so comparing
  it with ``TEST_DIVISOR`` itself can never succeed.
  """
  return x % TEST_DIVISOR == 0

def is_train_data(x, y):
  """Return whether the element at enumerate index ``x`` is a training row.

  Exact complement of ``is_test_data``; written as a comparison so it works
  both on plain integers and element-wise on tensors.
  """
  return x % TEST_DIVISOR != 0

def extract_data(x, y):
  """Discard the enumerate index ``x`` and return the dataset element ``y``."""
  return y

def _count_elements(dataset):
  """Count the elements of ``dataset`` by iterating over it once."""
  return dataset.reduce(np.int64(0), lambda count, _: count + 1).numpy()

# enumerate() prefixes each element with its position, so both datasets yield
# (index, (features, label)). Append `.map(extract_data).cache()` to strip the
# index once downstream code expects plain (features, label) pairs.
train_data = tensor_data.enumerate().filter(is_train_data)
test_data = tensor_data.enumerate().filter(is_test_data)

print(train_data)
print(test_data)

# The cardinality of a filtered dataset is reported as unknown (-2), so the
# split sizes are obtained by counting. This makes one full pass per dataset.
print(_count_elements(train_data))
print(_count_elements(test_data))





[1;30;43mStreaming output truncated to the last 5000 lines.[0m
features:5359 target:(<tf.Tensor: shape=(11,), dtype=float32, numpy=
array([2.3800000e+02, 6.7000000e+01, 0.0000000e+00, 2.0500000e+00,
       0.0000000e+00, 7.0000002e-03, 6.8519997e+01, 3.4813868e+07,
       3.1900000e+01, 4.9045410e+04, 7.5129997e+01], dtype=float32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)
features:5360 target:(<tf.Tensor: shape=(11,), dtype=float32, numpy=
array([1.260000e+02, 1.700000e+01, 1.400000e+01, 9.700000e-01,
       0.000000e+00, 0.000000e+00, 8.333000e+01, 3.393800e+04,
       0.000000e+00, 5.686147e+04, 8.497000e+01], dtype=float32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)
features:5361 target:(<tf.Tensor: shape=(11,), dtype=float32, numpy=
array([1.9900000e+02, 5.2000000e+01, 0.0000000e+00, 2.2600000e+00,
       0.0000000e+00, 3.0000000e-03, 6.8059998e+01, 1.4593446e+08,
       3.9599998e+01, 2.4765953e+04, 7.2580002e+01], dtype=float32)>, <tf.Tensor: shape=(), dtype=int32, nu

KeyboardInterrupt: ignored