## DATA IMPORTING

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Read every input table from the working directory, in this order:
#   df1  - training set
#   df2  - test set
#   codf - state-level US COVID case counts
#   cidf - city details
#   wadf - warehouse id / warehouse name mapping
df1, df2, codf, cidf, wadf = (
    pd.read_csv(csv_path)
    for csv_path in (
        'train.csv',
        'test.csv',
        'US_COVID_data_state_level.csv',
        'city_details.csv',
        'warehouse_mapping.csv',
    )
)

## Predicting Confirmed Cases

In [3]:
# Keep only the rows whose confirmed-case count is reported (not NaN).
corl = codf.dropna(subset=['conf_cases'])

In [4]:
# a: total cases, b: confirmed cases, both taken from rows where confirmed is known.
a = list(corl['tot_cases'])
b = list(corl['conf_cases'])

In [5]:
# Model the relative excess of total over confirmed cases,
#     ratio = (tot_cases - conf_cases) / conf_cases,
# as a function of tot_cases, using the rows where conf_cases is reported.
algo = RandomForestRegressor(random_state=42)  # fixed seed: forest fitting is stochastic

# ravel() keeps this cell re-runnable after `a` has been reshaped to 2-D below.
tot_arr = np.asarray(a, dtype=float).ravel()
conf_arr = np.asarray(b, dtype=float).ravel()

has_conf = conf_arr != 0
ratio = np.zeros_like(tot_arr)  # ratio is left at 0 where conf_cases == 0
ratio[has_conf] = (tot_arr[has_conf] - conf_arr[has_conf]) / conf_arr[has_conf]

a = np.array(a).reshape(-1, 1)  # scikit-learn expects a 2-D feature matrix
algo.fit(a, ratio)

In [6]:
# Fill in missing confirmed counts from the fitted excess ratio:
#     ratio = (tot - conf) / conf   =>   conf = tot / (1 + ratio)
# All missing rows are predicted in a single batched call and written back
# by label, so the cell does not depend on column positions or a RangeIndex.
missing_conf = codf['conf_cases'].isna()
if missing_conf.any():  # predict() rejects an empty feature matrix
    tot_missing = codf.loc[missing_conf, 'tot_cases'].to_numpy().reshape(-1, 1)
    predicted_ratio = algo.predict(tot_missing)
    # Truncate toward zero so the imputed counts are whole numbers.
    codf.loc[missing_conf, 'conf_cases'] = np.trunc(
        tot_missing.ravel() / (1 + predicted_ratio)
    )

In [7]:
# Spot-check a few random rows after imputation.
codf.sample(n=5)

Unnamed: 0,date,state,tot_cases,conf_cases
28412,09/05/21,MT,109924,99730.0
13194,28/08/20,AR,59583,49972.0
354,27/01/20,PA,0,0.0
6692,12/05/20,NE,8692,6530.0
1466,15/02/20,MS,0,0.0


## DATA CLEANING

In [8]:
# The 'country' column is not used; drop it, then remove exact duplicate rows.
cidf1 = cidf.drop(columns='country').drop_duplicates()
cidf1.tail(), cidf1.shape

(            city  zip_code state  state_name
 554  WESTMINSTER     92683    CA  CALIFORNIA
 555      ONTARIO     91761    CA  CALIFORNIA
 556       TACOMA     98409    WA  WASHINGTON
 557      BOZEMAN     59718    MT     MONTANA
 558    CLACKAMAS     97015    OR      OREGON,
 (555, 4))

In [9]:
# Per-state lookup table. Note that "num_cities" is counted as the number of
# distinct zip codes listed for the state, not distinct city names.
state = cidf1['state'].unique()
num_cities = []
for st in state:
    zips_in_state = cidf1.loc[cidf1['state'] == st, 'zip_code']
    num_cities.append(zips_in_state.nunique())
stct = pd.DataFrame({'state': state, 'num_cities': num_cities})

In [10]:
# List the zip codes that occur more than once, together with their counts.
value, counts = np.unique(cidf1['zip_code'], return_counts=True)
repeated = counts > 1
value[repeated], counts[repeated]

(array([94080, 98027], dtype=int64), array([2, 2], dtype=int64))

In [11]:
# Show the rows that share zip code 94080.
cidf1.query('zip_code == 94080')

Unnamed: 0,city,zip_code,state,state_name
370,S,94080,CA,CALIFORNIA
371,EL,94080,CA,CALIFORNIA


In [12]:
# Row 371 repeats zip code 94080 (already present on row 370); keep the first.
# Rebinding with errors='ignore' keeps the cell safe to re-run: an in-place
# drop raises KeyError once the label is already gone.
cidf1 = cidf1.drop(index=371, errors='ignore')

In [13]:
# Show the rows that share zip code 98027.
cidf1.query('zip_code == 98027')

Unnamed: 0,city,zip_code,state,state_name
69,ISSAQUAH,98027,WA,WASHINGTON
70,US,98027,WA,WASHINGTON


In [14]:
# Row 70 repeats zip code 98027 (already present on row 69); keep the first.
# Rebinding with errors='ignore' keeps the cell safe to re-run: an in-place
# drop raises KeyError once the label is already gone.
cidf1 = cidf1.drop(index=70, errors='ignore')

## DATA PREPROCESSING

In [15]:
# Attach the per-state num_cities count to every city row.
cidf_final = pd.merge(cidf1, stct, how='left', on='state')

In [16]:
# Peek at a few random warehouse rows.
wadf.sample(n=5)

Unnamed: 0,wh_id,warehouse_name
345,376,WAREHOUSE 376 AUBURN HILLS
284,312,WAREHOUSE 312 MILFORD
374,424,WAREHOUSE 424 SIGNAL HILL
468,660,WAREHOUSE 660 PUYALLUP
440,631,WAREHOUSE 631 CUMBERLAND MALL


In [17]:
# Join the state-level COVID counts onto the city table via 'state', so each
# (date, state) row is repeated once per city row of that state.
m = pd.merge(codf, cidf_final, how='left', on='state')
print(m.shape)

# Spread each state's counts evenly over its num_cities.
m['avg_tot_cases'] = m['tot_cases'].div(m['num_cities'])
m['avg_conf_cases'] = m['conf_cases'].div(m['num_cities'])
m.sample(n=5)

(276208, 8)


Unnamed: 0,date,state,tot_cases,conf_cases,city,zip_code,state_name,num_cities,avg_tot_cases,avg_conf_cases
26500,08/03/20,AZ,5,4.0,TEMPE,85284.0,ARIZONA,18.0,0.277778,0.222222
9687,08/02/20,FL,0,0.0,PALM,33403.0,FLORIDA,28.0,0.0,0.0
272695,17/05/21,IL,1367214,1367214.0,CHAMPAIGN,61820.0,ILLINOIS,22.0,62146.090909,62146.090909
132740,12/09/20,CA,777850,655917.0,RANCHO,95742.0,CALIFORNIA,126.0,6173.412698,5205.690476
43341,07/04/20,CA,28792,28045.0,DANVILLE,94526.0,CALIFORNIA,126.0,228.507937,222.579365


In [18]:
# The third whitespace-separated token of warehouse_name is used as the city,
# e.g. 'WAREHOUSE 247 NEWPORT NEWS' -> 'NEWPORT' (only the first word is kept).
b = [name.split()[2] for name in wadf['warehouse_name']]

wadf['city'] = np.array(b)
wadf.sample(n=5)

Unnamed: 0,wh_id,warehouse_name,city
91,1125,WAREHOUSE 1125 ROCHESTER MN,ROCHESTER
122,119,WAREHOUSE 119 MAUI,MAUI
327,358,WAREHOUSE 358 BRANDON,BRANDON
267,247,WAREHOUSE 247 NEWPORT NEWS,NEWPORT
64,1088,WAREHOUSE 1088 BOLINGBROOK,BOLINGBROOK


In [19]:
# Attach each warehouse's city to the training rows (warehouse_name itself is not carried over).
warehouse_cities = wadf.drop(columns=['warehouse_name'])
x = pd.merge(df1, warehouse_cities, how='left', on='wh_id')

In [20]:
# Training data merged with the warehouse table: shape plus a random sample.
print(x.shape)
x.sample(n=5)

(453215, 4)


Unnamed: 0,date,wh_id,sales,city
178706,13/03/21,140.0,147,KONA
408992,23/02/19,736.0,20,CHANDLER
55425,17/11/20,110.0,558,ISSAQUAH
48655,23/12/19,1085.0,12,MELROSE
148332,02/08/19,13.0,4,SILVERDALE


In [21]:
# Bring the per-city COVID figures onto the sales rows, matching on date and city.
# NOTE(review): this left merge increases the row count (x has 453215 rows,
# z has 669515), so some sales rows match more than one row of `m` and are
# duplicated. Presumably a city name maps to several zip codes/states - confirm
# and deduplicate `m` on ('date', 'city') if that is not intended.
z = pd.merge(x, m, how='left', on=['date', 'city'])

In [22]:
# Shape and a random sample of the fully merged table.
print(z.shape)
z.sample(n=5)

(669515, 12)


Unnamed: 0,date,wh_id,sales,city,state,tot_cases,conf_cases,zip_code,state_name,num_cities,avg_tot_cases,avg_conf_cases
468717,07/06/20,471.0,107,CAL,CA,158951.0,155962.0,95815.0,CALIFORNIA,126.0,1261.515873,1237.793651
318850,19/12/20,241.0,15,NEW,LA,282434.0,258439.0,70118.0,LOUISIANA,3.0,94144.666667,86146.333333
250058,20/12/20,1360.0,699,CLOVIS,CA,2061725.0,2061725.0,93612.0,CALIFORNIA,126.0,16362.896825,16362.896825
83763,08/08/19,1107.0,140,CHICAGO,,,,,,,,
624821,14/03/19,760.0,80,GILROY,,,,,,,,


In [23]:
#precovid data
precov = z[z.state.isna()][['date','wh_id','sales','tot_cases','conf_cases','avg_tot_cases','avg_conf_cases']].fillna(0)
cov = z[z.state.notna()][['date','wh_id','sales','tot_cases','conf_cases','avg_tot_cases','avg_conf_cases']]

## EDA

In [None]:
# TODO: add the exploratory data analysis for this section.