In [56]:
#!/usr/bin/env python
# coding: utf-8

import pandas
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## These bands are helpful for visualization
def cni_bands(CNI):
    """Map a CNI score onto one of five bands (1.0 to 5.0) for visualization.

    Bands are closed on the upper edge: [1, 1.7] -> 1.0, (1.7, 2.5] -> 2.0,
    (2.5, 3.3] -> 3.0, (3.3, 4.1] -> 4.0 and anything above 4.1 -> 5.0.

    Parameters
    ----------
    CNI : number
        The score to band. Presumably on a 1-5 scale - TODO confirm against
        the data source.

    Returns
    -------
    float or None
        The band, or None when the score is below 1 or is NaN (for example a
        missing score that was filled with 0 upstream).
    """
    # The lower bound is inclusive so that a score of exactly 1 lands in the
    # first band instead of matching no band at all. `not (CNI >= 1)` is also
    # true for NaN, which keeps NaN out of every band.
    if not CNI >= 1:
        return None

    for upper_edge, band in ((1.7, 1.0), (2.5, 2.0), (3.3, 3.0), (4.1, 4.0)):
        if CNI <= upper_edge:
            return band
    return 5.0

#we will be using the GINI decision tree model
def train_using_gini(X_train, X_test, y_train):
    """Fit a Gini-criterion decision tree on the training data.

    Parameters
    ----------
    X_train : training feature matrix passed to ``fit``.
    X_test : not used by this function; accepted so existing callers keep working.
    y_train : training labels passed to ``fit``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    # Fixed seed, depth capped at 3, at least 5 samples per leaf.
    tree_settings = {
        "criterion": "gini",
        "random_state": 100,
        "max_depth": 3,
        "min_samples_leaf": 5,
    }
    gini_tree = DecisionTreeClassifier(**tree_settings)
    gini_tree.fit(X_train, y_train)
    return gini_tree

# use this function to do the prediction
def prediction(X_test, clf_object):
    """Return ``clf_object.predict(X_test)``.

    Parameters
    ----------
    X_test : the rows to score.
    clf_object : any object exposing a ``predict`` method.
    """
    return clf_object.predict(X_test)

# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    """Print the confusion matrix, accuracy (as a percentage) and classification report.

    Parameters
    ----------
    y_test : the true labels.
    y_pred : the predicted labels.

    Returns
    -------
    None. The three metrics are only printed.
    """
    # Each metric is computed immediately before it is printed.
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: ", matrix)

    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy : ", accuracy_pct)

    report = classification_report(y_test, y_pred)
    print("Report : ", report)

def covid_risk_bands_v2(covid_change_rate_multiplied):
    """Translate a multiplied COVID change rate into a risk label.

    Cut-offs (upper bound exclusive):
        below 0.013081   -> "Low risk"
        below 0.075136   -> "Medium risk"
        below 0.299020   -> "High risk"
        0.299020 or more -> "Very high risk"

    A value that compares false against every cut-off (such as NaN) yields
    "Unknown".
    """
    rate = covid_change_rate_multiplied
    ascending_bands = (
        (0.013081, "Low risk"),
        (0.075136, "Medium risk"),
        (0.299020, "High risk"),
    )
    for upper_bound, label in ascending_bands:
        if rate < upper_bound:
            return label

    # Only unordered values (e.g. NaN) fail this final comparison.
    return "Very high risk" if rate >= 0.299020 else "Unknown"

## Based on a zip code, return the predicted risk band
def get_risk_for_zip(zip):
    """Predict the COVID risk band for a single zip code.

    Reads the notebook-level frames ``lac_zips`` and ``lacounty_today`` and the
    trained classifier ``clf_gini``. The feature row is built from the first
    source that matches:

    1. the ``lacounty_today`` row whose ``zip_code`` equals ``zip``;
    2. the ``lacounty_today`` row whose ``place`` equals the zip's ``City`` in
       ``lac_zips``;
    3. the zip's own ``Population`` and ``CNI`` from ``lac_zips`` with a change
       rate of 0.0.

    Parameters
    ----------
    zip : int
        Zip code to look up. (The name shadows the ``zip`` builtin inside this
        function; it is kept so that keyword callers continue to work.)

    Returns
    -------
    str
        "Unknown" when the zip code is absent from ``lac_zips``; otherwise the
        first label returned by ``prediction`` for the feature row.
    """
    # Filter each frame once and reuse the result.
    zip_rows = lac_zips[lac_zips['Zip Code'] == zip]
    if len(zip_rows) == 0:
        print(str(zip) + " Not in Los Angeles")
        return "Unknown"

    covid_rows = lacounty_today[lacounty_today['zip_code'] == zip]
    if len(covid_rows) >= 1:
        print("Found an existing city with Covid Data for " + str(zip))
        match = covid_rows.iloc[0]
        features = [[match['population'], match['cni'],
                     match['covid_change_rate_multiplied']]]
    else:
        print("Need to find Zip level data " + str(zip))
        zip_row = zip_rows.iloc[0]
        # Check whether the zip's city matches a place we already have data for.
        place_rows = lacounty_today[lacounty_today['place'] == zip_row['City']]
        if len(place_rows) > 0:
            # Use that city/place's values instead.
            match = place_rows.iloc[0]
            features = [[match['population'], match['cni'],
                         match['covid_change_rate_multiplied']]]
            print("using " + match['place'] + ' data to get the risk')
        else:
            # NOTE(review): no COVID data at all for this zip, so the change
            # rate defaults to 0.0 - presumably this pushes the prediction
            # towards the lowest band; confirm that is intended.
            features = [[zip_row['Population'], zip_row['CNI'], 0.0]]

    data = pandas.DataFrame(
        features,
        columns=['population', 'cni', 'covid_change_rate_multiplied'],
    )
    zip_pred = prediction(data, clf_gini)
    return zip_pred[0]


In [2]:
## Load every available LA County COVID record.
## Much of the preprocessing lives inside the Excel workbook and could be ported to pandas.
## Cleaning step 1: every missing cell in the sheet becomes 0.
lacounty_data = pandas.read_excel(
    "CovidData.xlsx", sheet_name="LACCovidData"
).fillna(0)

## Cleaning step 2: zip codes are stored as integers.
lacounty_data['Zip code'] = lacounty_data['Zip code'].fillna(0).astype(int)



In [50]:
# Zip-level reference sheet; later cells read its 'Zip Code', 'City', 'Population' and 'CNI' columns.
lac_zips = pandas.read_excel(io="CovidData.xlsx", sheet_name="LACZipsPopCNI")

In [11]:
# Collapse the history to one row per place: the maximum of each tracked
# column and the mean of the change rate. Then derive the combined metric
# covid_change_rate * covid_rate * 1000 as an extra column.
lacounty_today = (
    lacounty_data
    .groupby('place')
    .agg(
        place=('place', 'max'),
        zip_code=('Zip code', 'max'),
        covid_rate=('Covid Rate', 'max'),
        population=('Population', 'max'),
        confirmed_cases=('confirmed_cases', 'max'),
        cni=('CNI', 'max'),
        covid_change_rate=('Covid Change Rate', 'mean'),
    )
    .assign(
        covid_change_rate_multiplied=lambda per_place: (
            per_place['covid_change_rate'] * per_place['covid_rate'] * 1000
        )
    )
)

In [12]:
# Author's note: lacounty_today.corr(method='spearman') showed a high
# correlation between CNI and covid_change_rate_multiplied.

# Attach the CNI band and the COVID risk band to every place.
lacounty_today = lacounty_today.assign(
    cni_bands=lacounty_today['cni'].apply(cni_bands),
    covid_risk_bands=lacounty_today['covid_change_rate_multiplied'].apply(covid_risk_bands_v2),
)



In [13]:
## Set up the data for training.
# NOTE(review): 'covid_risk_bands' is computed directly from
# 'covid_change_rate_multiplied', which is also one of the features, so the
# accuracy reported below mostly shows how well the tree recovers the band
# thresholds. Confirm whether that feature should stay in X.
X = lacounty_today[['population', 'cni', 'covid_change_rate_multiplied']]
Y = lacounty_today['covid_risk_bands']

## Train/test split (70/30) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=100)

# Impute missing values with the TRAINING means only. Filling the test split
# with its own means would leak test-set statistics into the evaluation.
# The test split is filled first so both splits use the same training means.
X_test = X_test.fillna(X_train.mean())
X_train = X_train.fillna(X_train.mean())

clf_gini = train_using_gini(X_train, X_test, y_train)
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)


Predicted values:
['High risk' 'Medium risk' 'Low risk' 'Very high risk' 'High risk'
 'Medium risk' 'Very high risk' 'Medium risk' 'Low risk' 'Very high risk'
 'Low risk' 'Low risk' 'Very high risk' 'Very high risk' 'Very high risk'
 'Medium risk' 'Medium risk' 'Very high risk' 'Low risk' 'Medium risk'
 'Low risk' 'Medium risk' 'Very high risk' 'Very high risk' 'Medium risk'
 'Very high risk' 'Very high risk' 'Medium risk' 'High risk' 'High risk'
 'High risk' 'Medium risk' 'High risk' 'Medium risk' 'Low risk'
 'High risk' 'Very high risk' 'Low risk' 'Low risk' 'High risk' 'Low risk'
 'Low risk' 'Very high risk' 'Low risk' 'High risk' 'Low risk' 'Low risk'
 'Low risk' 'Low risk' 'High risk' 'High risk' 'High risk'
 'Very high risk' 'High risk' 'High risk' 'Low risk' 'Medium risk'
 'Very high risk' 'High risk' 'Low risk' 'High risk' 'High risk'
 'High risk' 'Low risk' 'Medium risk' 'Medium risk' 'Medium risk'
 'Very high risk' 'Very high risk' 'Medium risk' 'High risk' 'High risk'
 'Medi

In [47]:
# Load the full zip list and strip the literal "ZIP Code " text wherever it
# appears in the sheet, so the 'ZIP Code' column can be cast to integers.
lac_zips_all = (
    pandas.read_excel("CovidData.xlsx", sheet_name="LAC Zips")
    .replace(to_replace=r"ZIP Code ", value="", regex=True)
)
lac_zips_all['ZIP Code'] = lac_zips_all['ZIP Code'].astype(int)

In [48]:
# Preview the first five rows of the cleaned zip list.
lac_zips_all.head(n=5)

Unnamed: 0,ZIP Code,Classification,City,Population,Timezone,Area Code(s)
0,90001,General,Los Angeles,57110,Pacific,323
1,90002,General,Los Angeles,51223,Pacific,323/562
2,90003,General,Los Angeles,66266,Pacific,323/213
3,90004,General,Los Angeles,62180,Pacific,323/213
4,90005,General,Los Angeles,37681,Pacific,323/213/310/818/626


In [57]:
# Score every zip code; get_risk_for_zip prints one or two log lines per zip.
lac_zips_all = lac_zips_all.assign(
    covid_risk_risk=lac_zips_all['ZIP Code'].apply(get_risk_for_zip)
)

Found an existing city with Covid Data for 90001
Found an existing city with Covid Data for 90002
Found an existing city with Covid Data for 90003
Found an existing city with Covid Data for 90004
Need to find Zip level data90005
Found an existing city with Covid Data for 90006
Found an existing city with Covid Data for 90007
Found an existing city with Covid Data for 90008
90009Not in Los Angeles
Need to find Zip level data90010
Found an existing city with Covid Data for 90011
Found an existing city with Covid Data for 90012
Found an existing city with Covid Data for 90013
Found an existing city with Covid Data for 90014
Need to find Zip level data90015
Found an existing city with Covid Data for 90016
Need to find Zip level data90017
Found an existing city with Covid Data for 90018
Found an existing city with Covid Data for 90019
Found an existing city with Covid Data for 90020
Need to find Zip level data90021
Found an existing city with Covid Data for 90022
Need to find Zip level data

Need to find Zip level data90805
using Long Beach data to get the risk
Found an existing city with Covid Data for 90806
Need to find Zip level data90807
using Long Beach data to get the risk
Need to find Zip level data90808
using Long Beach data to get the risk
90809Not in Los Angeles
Need to find Zip level data90810
using Long Beach data to get the risk
Need to find Zip level data90813
using Long Beach data to get the risk
Need to find Zip level data90814
using Long Beach data to get the risk
Need to find Zip level data90815
using Long Beach data to get the risk
Need to find Zip level data90822
using Long Beach data to get the risk
90831Not in Los Angeles
90832Not in Los Angeles
90833Not in Los Angeles
90840Not in Los Angeles
90842Not in Los Angeles
90844Not in Los Angeles
90846Not in Los Angeles
90847Not in Los Angeles
90848Not in Los Angeles
90853Not in Los Angeles
90895Not in Los Angeles
Found an existing city with Covid Data for 91001
91003Not in Los Angeles
Found an existing city

In [58]:
# Preview the first five rows, now including the predicted risk column.
lac_zips_all.head(n=5)

Unnamed: 0,ZIP Code,Classification,City,Population,Timezone,Area Code(s),covid_risk_risk
0,90001,General,Los Angeles,57110,Pacific,323,Very high risk
1,90002,General,Los Angeles,51223,Pacific,323/562,High risk
2,90003,General,Los Angeles,66266,Pacific,323/213,Medium risk
3,90004,General,Los Angeles,62180,Pacific,323/213,Medium risk
4,90005,General,Los Angeles,37681,Pacific,323/213/310/818/626,Low risk


In [45]:
# Spot-check a single zip code.
get_risk_for_zip(zip=90001)

Found an existing city with Covid Data for 90001


'Very high risk'

In [59]:
# Export the scored zip list. index=False stops pandas from writing the row
# index as a second unnamed leading column; the frame already carries an
# 'Unnamed: 0' column from the source sheet.
lac_zips_all.to_csv("lac_zips_risk.csv", index=False)