## Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve,plot_precision_recall_curve
from sklearn.metrics import precision_score, recall_score

# Loading the dataset created after Exploration

In [5]:
# Display options are set before the first preview so wide frames and long
# Series are shown in full instead of being truncated.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# Load the dataset and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call -- TODO confirm).
df = pd.read_csv('covid_dataset_v1.csv').drop(columns=['Unnamed: 0'])

# The preview must be the last expression of the cell to be rendered.
df.head()

## Checking for missing values, removing redundant variables, and further cleaning the dataset for the model

In [6]:
# Percentage of missing values per column of the raw frame.
df.isnull().mean().round(4).mul(100)

country                                         0.00
sex                                             0.00
age                                             0.00
height                                          0.00
weight                                          0.00
bmi                                             0.00
blood_type                                      0.00
insurance                                      78.48
smoking                                         0.00
alcohol                                         0.00
cannabis                                       17.37
amphetamines                                   19.51
cocaine                                        19.75
lsd                                            19.25
mdma                                           18.75
contacts_count                                  0.00
house_count                                     0.00
public_transport_count                         78.48
working                                       

In [7]:
# The dataset is too large, so only the rows where `public_transport_count`
# was recorded are kept (these are the rows with the most data).
#
# A notna() mask selects those rows directly: `df` is left unmodified and the
# column stays numeric, instead of writing a 'None' string sentinel into it
# through a chained in-place fillna (which does not update `df` at all when
# pandas Copy-on-Write is active). reset_index(drop=True) returns a new frame
# with a fresh 0..n-1 index, so no helper 'index' column has to be dropped.
df_v1 = df[df['public_transport_count'].notna()].reset_index(drop=True)
df_v1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,country,sex,age,height,weight,bmi,blood_type,insurance,smoking,alcohol,cannabis,amphetamines,cocaine,lsd,mdma,contacts_count,house_count,public_transport_count,working,rate_reducing_risk_single,rate_reducing_risk_single_social_distancing,rate_reducing_risk_single_washing_hands,rate_reducing_risk_house,rate_reducing_risk_house_social_distancing,rate_reducing_risk_house_washing_hands,rate_reducing_risk_single_sanitizer,rate_reducing_mask,rate_government_action,rate_government_control,rate_government_spend,covid19_positive,covid19_symptoms,covid19_contact,asthma,kidney_disease,liver_disease,compromised_immune,heart_disease,lung_disease,diabetes,hiv_positive,hypertension,other_chronic,nursing_home,health_worker,prescription_medication,opinion_infection,opinion_mortality,risk_infection
0,GB,male,45.0,184,72,21.2,on,no,yesheavy,9,-1.0,-1.0,-1.0,-1.0,-1.0,9,2.0,0,travel critical,0,1,1,1,0,0,0,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,13.0
1,US,male,35.0,178,80,25.2,unknown,yes,quit5,2,0.0,0.0,0.0,0.0,0.0,3,3.0,0,stopped,0,0,1,0,1,1,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,,55.0,25.0,5.0
2,US,female,55.0,160,84,32.8,ap,yes,never,11,-1.0,-1.0,-1.0,-1.0,-1.0,4,3.0,0,stopped,0,1,1,0,1,0,-2,1,-1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,LEVOTHYROXINE SODIUM,35.0,25.0,5.0
3,MX,male,35.0,184,128,37.8,op,yes,yesmedium,1,-1.0,-1.0,-1.0,-1.0,-1.0,1,2.0,0,never,0,1,1,0,1,1,1,4,-1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,55.0,55.0,6.0
4,US,female,65.0,180,74,22.8,on,yes,never,-1,-1.0,-1.0,-1.0,-1.0,-1.0,0,1.0,0,never,0,2,2,0,0,0,-2,5,-2,2,2,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,ADALIMUMAB,,,5.0


In [8]:
# Size of the filtered frame, followed by the percentage of missing values
# in each of its columns.
print(df_v1.shape)
df_v1.isnull().mean().round(4).mul(100)

(188525, 49)


country                                         0.00
sex                                             0.00
age                                             0.00
height                                          0.00
weight                                          0.00
bmi                                             0.00
blood_type                                      0.00
insurance                                       0.00
smoking                                         0.00
alcohol                                         0.00
cannabis                                       73.47
amphetamines                                   74.61
cocaine                                        74.76
lsd                                            74.63
mdma                                           74.47
contacts_count                                  0.00
house_count                                     0.00
public_transport_count                          0.00
working                                       

In [9]:
print(df_v1.risk_infection.dtype)
print(df_v1.shape)

# Drop the rows whose target value (risk_infection) is missing.
# df_v1 is rebound to the returned frame rather than mutated in place, so the
# cell never writes into a slice of `df` (no SettingWithCopyWarning).
df_v1 = df_v1.dropna(subset=['risk_infection'])
print(df_v1.shape)

# Cast the transport count to integer; astype with a column mapping returns a
# new frame, again avoiding assignment into a possible slice.
df_v1 = df_v1.astype({'public_transport_count': int})
print(df_v1.public_transport_count.dtype)
print(df_v1.risk_infection.dtype)

# Pearson correlation between the transport count and the target.
print(df_v1['public_transport_count'].corr(df_v1['risk_infection']))
df_v1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


float64
(188525, 49)
(188511, 49)
int32
float64
0.1960630685726218


Unnamed: 0,country,sex,age,height,weight,bmi,blood_type,insurance,smoking,alcohol,cannabis,amphetamines,cocaine,lsd,mdma,contacts_count,house_count,public_transport_count,working,rate_reducing_risk_single,rate_reducing_risk_single_social_distancing,rate_reducing_risk_single_washing_hands,rate_reducing_risk_house,rate_reducing_risk_house_social_distancing,rate_reducing_risk_house_washing_hands,rate_reducing_risk_single_sanitizer,rate_reducing_mask,rate_government_action,rate_government_control,rate_government_spend,covid19_positive,covid19_symptoms,covid19_contact,asthma,kidney_disease,liver_disease,compromised_immune,heart_disease,lung_disease,diabetes,hiv_positive,hypertension,other_chronic,nursing_home,health_worker,prescription_medication,opinion_infection,opinion_mortality,risk_infection
0,GB,male,45.0,184,72,21.2,on,no,yesheavy,9,-1.0,-1.0,-1.0,-1.0,-1.0,9,2.0,0,travel critical,0,1,1,1,0,0,0,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,13.0
1,US,male,35.0,178,80,25.2,unknown,yes,quit5,2,0.0,0.0,0.0,0.0,0.0,3,3.0,0,stopped,0,0,1,0,1,1,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,,55.0,25.0,5.0
2,US,female,55.0,160,84,32.8,ap,yes,never,11,-1.0,-1.0,-1.0,-1.0,-1.0,4,3.0,0,stopped,0,1,1,0,1,0,-2,1,-1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,LEVOTHYROXINE SODIUM,35.0,25.0,5.0
3,MX,male,35.0,184,128,37.8,op,yes,yesmedium,1,-1.0,-1.0,-1.0,-1.0,-1.0,1,2.0,0,never,0,1,1,0,1,1,1,4,-1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,55.0,55.0,6.0
4,US,female,65.0,180,74,22.8,on,yes,never,-1,-1.0,-1.0,-1.0,-1.0,-1.0,0,1.0,0,never,0,2,2,0,0,0,-2,5,-2,2,2,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,ADALIMUMAB,,,5.0


In [10]:
# Percentage of missing values per column after dropping rows without a target.
df_v1.isnull().mean().round(4).mul(100)

country                                         0.00
sex                                             0.00
age                                             0.00
height                                          0.00
weight                                          0.00
bmi                                             0.00
blood_type                                      0.00
insurance                                       0.00
smoking                                         0.00
alcohol                                         0.00
cannabis                                       73.47
amphetamines                                   74.61
cocaine                                        74.75
lsd                                            74.63
mdma                                           74.47
contacts_count                                  0.00
house_count                                     0.00
public_transport_count                          0.00
working                                       

In [11]:
# Pairwise Pearson correlation table of the numeric (and boolean) columns.
# The string columns (country, sex, blood_type, ...) are excluded explicitly:
# older pandas skipped them silently, but pandas >= 2.0 raises when corr() is
# called on a frame that still contains non-numeric columns.
df_v1.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,age,bmi,alcohol,cannabis,amphetamines,cocaine,lsd,mdma,contacts_count,house_count,public_transport_count,rate_reducing_risk_single,rate_reducing_risk_single_social_distancing,rate_reducing_risk_single_washing_hands,rate_reducing_risk_house,rate_reducing_risk_house_social_distancing,rate_reducing_risk_house_washing_hands,rate_reducing_risk_single_sanitizer,rate_reducing_mask,rate_government_action,rate_government_control,rate_government_spend,covid19_positive,covid19_symptoms,covid19_contact,asthma,kidney_disease,liver_disease,compromised_immune,heart_disease,lung_disease,diabetes,hiv_positive,hypertension,other_chronic,nursing_home,health_worker,opinion_infection,opinion_mortality,risk_infection
age,1.0,0.03791,0.155396,-0.127569,-0.007074,-0.018802,-0.015071,-0.034534,-0.17622,-0.190136,-0.044972,,0.019292,-0.041904,-0.004502,0.035762,-0.019818,-0.068886,-0.029775,0.101257,-0.041152,-0.105481,0.01975,-0.044273,-0.062062,-0.061607,0.042921,0.028564,0.030844,0.146484,0.098433,0.154346,0.008929,0.26731,0.025178,0.047303,-0.048906,-0.139816,0.143613,-0.066166
bmi,0.03791,1.0,-0.085311,0.018637,0.00908,0.028454,0.035449,0.031303,0.015306,0.017885,-0.022233,,0.023202,0.002057,-0.004916,0.009321,-0.005062,0.027013,0.026157,-0.070689,0.04326,0.035203,0.013826,0.013624,0.010446,0.095556,0.017679,0.042648,0.050272,0.037562,0.028367,0.17508,0.001177,0.218974,0.062196,0.00157,0.011686,0.074103,0.200231,0.031273
alcohol,0.155396,-0.085311,1.0,0.103241,0.049249,0.135871,0.106025,0.120614,0.008509,-0.054667,0.007742,,-0.020362,-0.017773,-0.000749,0.01172,-0.001993,-0.025307,-0.045757,0.011092,-0.03415,-0.007344,-0.023562,-0.025311,-0.013566,-0.024181,-0.010963,-0.00547,-0.041884,-0.002432,-0.004678,-0.0576,0.008823,0.022589,-0.056322,-0.013491,-0.018203,-0.026274,-0.067579,-0.038655
cannabis,-0.127569,0.018637,0.103241,1.0,0.094118,0.152867,0.212318,0.187045,0.049281,-0.022329,0.044233,,-0.031909,-0.038603,-0.061522,-0.024542,-0.022153,-0.010595,0.045123,-0.184336,0.049167,0.071549,0.002688,0.021343,0.003057,0.056259,0.019618,0.0304,0.055385,0.010439,0.037658,-0.009359,0.03474,-0.024948,0.054607,-0.007951,-0.041875,0.039683,0.050872,0.011138
amphetamines,-0.007074,0.00908,0.049249,0.094118,1.0,0.266008,0.281352,0.278647,0.04979,0.02637,0.120318,,-0.063207,-0.072366,-0.009363,-0.05858,-0.064591,-0.042295,-0.040216,-0.027548,-0.026889,-0.022294,0.026322,0.014489,0.027193,0.03147,0.048428,0.029673,0.037722,0.034202,0.032493,0.022256,0.074864,0.019513,0.021824,0.015346,0.013161,0.019335,0.016903,0.059412
cocaine,-0.018802,0.028454,0.135871,0.152867,0.266008,1.0,0.514669,0.536452,0.073262,0.052116,0.206995,,-0.111597,-0.108529,-0.017237,-0.086966,-0.088366,-0.060114,-0.066873,-0.040164,-0.039461,-0.027863,0.022124,0.03966,0.03293,0.039996,0.078075,0.077509,0.029276,0.06392,0.070209,0.035382,0.102813,0.002621,0.006435,0.049792,-0.000777,0.005463,-0.007019,0.082738
lsd,-0.015071,0.035449,0.106025,0.212318,0.281352,0.514669,1.0,0.73499,0.057254,0.057116,0.228013,,-0.090416,-0.105361,-0.022199,-0.077633,-0.082913,-0.060996,-0.041358,-0.072833,-0.027012,-0.012439,0.016179,0.040343,0.032586,0.042631,0.095135,0.07985,0.045203,0.085563,0.072479,0.037512,0.116433,0.005128,0.020789,0.07089,0.002469,0.007206,-0.012692,0.082911
mdma,-0.034534,0.031303,0.120614,0.187045,0.278647,0.536452,0.73499,1.0,0.067841,0.062266,0.250478,,-0.106911,-0.115916,-0.022495,-0.086632,-0.091114,-0.066683,-0.066471,-0.059517,-0.041343,-0.0226,0.021514,0.05166,0.042332,0.038572,0.084979,0.075745,0.039489,0.075356,0.055943,0.034494,0.133445,-0.008842,0.015486,0.094104,0.004266,0.009181,-0.026872,0.092645
contacts_count,-0.17622,0.015306,0.008509,0.049281,0.04979,0.073262,0.057254,0.067841,1.0,0.23696,0.134719,,-0.225724,-0.062199,0.005851,-0.146856,-0.082041,0.042219,-0.050566,0.027576,-0.129178,-0.065314,0.022239,0.03446,0.160281,-0.002115,-0.001472,0.004681,-0.027886,-0.021023,-0.017247,-0.029298,-0.001886,-0.037328,-0.026723,0.012318,0.276224,0.14001,-0.082064,0.321613
house_count,-0.190136,0.017885,-0.054667,-0.022329,0.02637,0.052116,0.057116,0.062266,0.23696,1.0,0.024399,,-0.059415,-0.028817,0.00298,-0.032472,-0.005295,0.026006,-0.014863,0.019408,-0.051615,-0.032586,0.024508,0.02601,0.037909,0.016724,0.009453,0.006804,-0.009823,-0.015596,-0.018258,-0.01293,-0.013872,-0.0455,-0.013254,0.05536,0.025074,0.051958,-0.0549,0.093423


# Creating dummy variables for the string-valued columns

In [12]:
# One-hot encode each categorical column. The prefixes end in '_' and
# get_dummies adds its own '_' separator, so the resulting columns are named
# e.g. 'sex__female'; these names are kept unchanged.
dummy_prefixes = {
    'sex': 'sex_',
    'blood_type': 'blood_type_',
    'smoking': 'smoking_',
    'working': 'working_',
    'insurance': 'insurance_',
}
dummy_frames = [
    pd.get_dummies(df_v1[column], prefix=prefix)
    for column, prefix in dummy_prefixes.items()
]
df_sex, df_blood_type, df_smoking, df_working, df_insurance = dummy_frames

# Columns removed from the combined frame: the encoded originals plus
# country, prescription_medication, height and weight.
columns_to_remove = [
    'sex',
    'country',
    'blood_type',
    'prescription_medication',
    'smoking',
    'working',
    'insurance',
    'height',
    'weight',
]

# Dummy columns come first, followed by the remaining columns of df_v1.
df_cleaned = pd.concat(dummy_frames + [df_v1], axis=1)
df_cleaned.drop(columns=columns_to_remove, inplace=True)
print(df_cleaned.shape)
df_cleaned.head()

(188511, 68)


Unnamed: 0,sex__female,sex__male,sex__other,blood_type__abn,blood_type__abp,blood_type__an,blood_type__ap,blood_type__bn,blood_type__bp,blood_type__on,blood_type__op,blood_type__unknown,smoking__never,smoking__quit0,smoking__quit10,smoking__quit5,smoking__vape,smoking__yesheavy,smoking__yeslight,smoking__yesmedium,working__home,working__never,working__stopped,working__travel critical,working__travel non critical,insurance__blank,insurance__no,insurance__yes,age,bmi,alcohol,cannabis,amphetamines,cocaine,lsd,mdma,contacts_count,house_count,public_transport_count,rate_reducing_risk_single,rate_reducing_risk_single_social_distancing,rate_reducing_risk_single_washing_hands,rate_reducing_risk_house,rate_reducing_risk_house_social_distancing,rate_reducing_risk_house_washing_hands,rate_reducing_risk_single_sanitizer,rate_reducing_mask,rate_government_action,rate_government_control,rate_government_spend,covid19_positive,covid19_symptoms,covid19_contact,asthma,kidney_disease,liver_disease,compromised_immune,heart_disease,lung_disease,diabetes,hiv_positive,hypertension,other_chronic,nursing_home,health_worker,opinion_infection,opinion_mortality,risk_infection
0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,45.0,21.2,9,-1.0,-1.0,-1.0,-1.0,-1.0,9,2.0,0,0,1,1,1,0,0,0,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,13.0
1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,35.0,25.2,2,0.0,0.0,0.0,0.0,0.0,3,3.0,0,0,0,1,0,1,1,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,55.0,25.0,5.0
2,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,55.0,32.8,11,-1.0,-1.0,-1.0,-1.0,-1.0,4,3.0,0,0,1,1,0,1,0,-2,1,-1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35.0,25.0,5.0
3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,35.0,37.8,1,-1.0,-1.0,-1.0,-1.0,-1.0,1,2.0,0,0,1,1,0,1,1,1,4,-1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,55.0,55.0,6.0
4,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,65.0,22.8,-1,-1.0,-1.0,-1.0,-1.0,-1.0,0,1.0,0,0,2,2,0,0,0,-2,5,-2,2,2,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,,,5.0


In [13]:
# Pearson correlation table of every column in the encoded frame.
df_cleaned.corr(method='pearson')

Unnamed: 0,sex__female,sex__male,sex__other,blood_type__abn,blood_type__abp,blood_type__an,blood_type__ap,blood_type__bn,blood_type__bp,blood_type__on,blood_type__op,blood_type__unknown,smoking__never,smoking__quit0,smoking__quit10,smoking__quit5,smoking__vape,smoking__yesheavy,smoking__yeslight,smoking__yesmedium,working__home,working__never,working__stopped,working__travel critical,working__travel non critical,insurance__blank,insurance__no,insurance__yes,age,bmi,alcohol,cannabis,amphetamines,cocaine,lsd,mdma,contacts_count,house_count,public_transport_count,rate_reducing_risk_single,rate_reducing_risk_single_social_distancing,rate_reducing_risk_single_washing_hands,rate_reducing_risk_house,rate_reducing_risk_house_social_distancing,rate_reducing_risk_house_washing_hands,rate_reducing_risk_single_sanitizer,rate_reducing_mask,rate_government_action,rate_government_control,rate_government_spend,covid19_positive,covid19_symptoms,covid19_contact,asthma,kidney_disease,liver_disease,compromised_immune,heart_disease,lung_disease,diabetes,hiv_positive,hypertension,other_chronic,nursing_home,health_worker,opinion_infection,opinion_mortality,risk_infection
sex__female,1.0,-0.992548,-0.064328,0.004599,0.000274,0.030169,0.032327,0.009321,0.012234,0.016895,0.022425,-0.081373,0.037033,-0.025741,-0.005637,0.002069,-0.017907,-0.030107,-0.017462,-0.003858,-0.033109,0.013594,0.063092,-0.003913,-0.074787,0.005263,-0.015609,0.01098,-0.07564,0.053199,-0.082647,-0.010945,-0.022868,-0.054711,-0.050493,-0.046011,-0.002288,0.021892,-0.059478,,0.079525,0.099808,-0.002029,0.022243,0.043737,0.122334,0.098929,-0.126637,0.077879,0.098705,-0.013981,0.000134,0.000747,0.087026,-0.00562,-0.010219,0.094931,-0.048595,-0.004278,-0.038792,-0.050804,-0.087607,0.092765,-0.002145,0.073297,0.15039,0.138316,-0.013692
sex__male,-0.992548,1.0,-0.057756,-0.004681,-0.000254,-0.03064,-0.031819,-0.009627,-0.011909,-0.016926,-0.02157,0.080307,-0.034902,0.024687,0.006284,-0.002919,0.015929,0.030412,0.015803,0.003458,0.03352,-0.012617,-0.06468,0.003993,0.075229,-0.006562,0.014141,-0.008909,0.079503,-0.05505,0.084232,0.003474,0.020769,0.052118,0.044794,0.042519,0.003293,-0.020853,0.058982,,-0.081883,-0.100711,0.0026,-0.022732,-0.044303,-0.121923,-0.101277,0.131349,-0.081068,-0.10225,0.014239,-0.001123,-0.000618,-0.089531,0.006135,0.01,-0.097482,0.048802,0.003768,0.038525,0.050209,0.08842,-0.097184,0.002274,-0.073086,-0.152878,-0.141103,0.013599
sex__other,-0.064328,-0.057756,1.0,0.000657,-0.000162,0.003756,-0.004269,0.002474,-0.002707,0.000199,-0.00707,0.008994,-0.01757,0.008717,-0.005279,0.006954,0.016261,-0.002403,0.013641,0.003284,-0.00325,-0.008047,0.012796,-0.000638,-0.003373,0.01062,0.012069,-0.01699,-0.03139,0.014989,-0.012711,0.049786,0.014077,0.017596,0.038064,0.023409,-0.008222,-0.008583,0.004261,,0.019054,0.007062,-0.004664,0.003934,0.004486,-0.003768,0.018903,-0.038167,0.025861,0.028707,-0.002069,0.008099,-0.001054,0.020225,-0.0042,0.001826,0.020582,-0.001534,0.004192,0.002311,0.005038,-0.006374,0.035882,-0.001045,-0.001969,0.018676,0.021072,0.000811
blood_type__abn,0.004599,-0.004681,0.000657,1.0,-0.015851,-0.019114,-0.04397,-0.011241,-0.02336,-0.025594,-0.048144,-0.060008,-0.006784,-0.001592,0.010657,0.000155,0.001371,0.001834,0.000685,-0.000892,-0.000332,-0.000994,-0.001576,-9e-05,0.004097,-0.002703,0.000472,0.00114,0.002948,0.003216,0.000285,0.006353,0.011,0.004075,0.009179,0.009959,-0.002257,-0.003018,0.004129,,0.002265,-0.000475,0.003647,0.001571,0.003654,-0.000111,0.000922,-0.003138,0.001729,0.000834,-0.00377,-0.00396,-0.000576,0.005716,0.002038,0.003132,0.006968,0.000502,0.003422,-0.000638,-0.003223,-0.002015,0.000468,-0.00284,0.001019,0.001254,0.003124,-0.001654
blood_type__abp,0.000274,-0.000254,-0.000162,-0.015851,1.0,-0.039554,-0.090989,-0.02326,-0.048339,-0.052963,-0.099626,-0.124177,-0.009428,0.00243,0.005382,0.004285,-0.002649,0.003016,0.001837,0.005249,-0.001709,4.9e-05,-0.003235,0.001758,0.004256,-0.004297,-0.001676,0.003993,-0.00545,0.00441,0.001287,0.001141,-0.002113,-0.000425,0.002784,0.00063,0.001673,0.006726,0.008432,,0.000548,0.001538,-0.00197,0.002786,0.002835,0.008297,0.014,-0.001861,0.006068,0.003259,-0.000201,0.002834,0.005883,0.001324,-0.002247,-0.000107,0.007556,0.002707,0.004156,0.001923,-0.001295,-0.000856,0.0011,-0.00535,0.004902,0.005441,0.003077,0.007772
blood_type__an,0.030169,-0.03064,0.003756,-0.019114,-0.039554,1.0,-0.10972,-0.028049,-0.05829,-0.063866,-0.120136,-0.14974,0.000768,-0.001786,0.005414,0.001552,-0.001511,0.003957,-0.004945,-0.005701,0.001057,-0.004172,0.00292,0.00542,-0.005415,-0.010219,-0.008209,0.01329,0.012262,0.006755,0.008264,-0.009581,-0.001801,-0.002158,-0.00337,0.00113,0.004469,0.005331,-0.002911,,0.006852,0.00558,-0.001034,0.002524,0.00055,0.003592,0.007449,-0.002247,0.002955,0.001941,-0.007574,-0.00178,-0.005235,-0.00051,0.004751,0.008163,0.007766,-0.00024,0.002659,-0.004255,0.002586,0.004707,0.002264,-0.003304,0.007512,0.002458,0.002548,-0.003942
blood_type__ap,0.032327,-0.031819,-0.004269,-0.04397,-0.090989,-0.10972,1.0,-0.064523,-0.13409,-0.146917,-0.276358,-0.34446,0.019166,-0.01131,0.002081,-0.004063,-0.019398,-0.005976,-0.003647,-0.004674,0.000832,0.013024,-0.015306,0.012127,-0.010614,-0.023771,-0.030557,0.041214,0.0347,-1.3e-05,-0.002094,-0.048609,-0.010395,-0.017891,-0.009459,-0.01303,0.014058,0.020386,-0.00196,,0.015757,0.01584,0.016148,0.011462,0.014011,0.025813,0.044116,0.000674,0.005421,0.006151,0.004557,-0.003631,0.011027,-0.002036,0.005003,0.005996,-0.00108,0.004956,0.006505,0.002564,-0.003717,0.009632,0.006342,-0.001863,0.034067,0.005724,-0.005289,0.017045
blood_type__bn,0.009321,-0.009627,0.002474,-0.011241,-0.02326,-0.028049,-0.064523,1.0,-0.034279,-0.037558,-0.070648,-0.088058,-0.001888,0.003848,-0.000362,0.001156,-0.002317,0.000479,0.001654,-0.000112,0.003229,-0.002577,0.002081,-0.000368,-0.002014,-0.003986,0.001165,0.00126,0.002821,0.003556,0.004177,0.000215,0.010627,-0.000382,0.009216,0.008848,-0.003241,-0.001157,0.002124,,-0.001112,-0.000503,-0.001577,0.00155,0.001056,-0.002225,0.002168,0.000974,-0.001161,0.003435,-0.003371,0.000631,-0.000108,-0.000977,0.003475,0.003117,0.005971,-0.004522,-0.002871,-0.001679,0.006995,0.001119,0.003878,0.000164,0.00012,-0.000921,0.000357,-0.001633
blood_type__bp,0.012234,-0.011909,-0.002707,-0.02336,-0.048339,-0.05829,-0.13409,-0.034279,1.0,-0.078052,-0.146819,-0.182999,0.005471,-0.000888,-0.001597,-0.000543,-0.003635,-0.004651,0.001784,-0.004149,-0.001688,0.001121,-0.004799,0.009007,-0.003141,-0.006692,-0.018189,0.020217,0.002207,-0.004872,-0.014956,-0.017832,-0.002374,-0.011397,-0.011328,-0.006769,0.004381,0.017234,0.004937,,0.006996,0.005836,0.006917,0.005982,0.004184,0.015892,0.028868,0.002368,0.007606,0.006888,0.00352,0.000439,0.004096,-0.003699,-0.000557,-0.002923,-0.003304,-0.007243,-0.003289,0.000469,-0.003964,0.005392,0.000581,-0.001775,0.017932,0.000291,0.001106,0.010814
blood_type__on,0.016895,-0.016926,0.000199,-0.025594,-0.052963,-0.063866,-0.146917,-0.037558,-0.078052,1.0,-0.160863,-0.200504,-0.010714,9e-06,0.00501,0.005027,0.000208,-0.001005,0.004068,0.00768,0.00135,0.000102,-0.00277,0.00599,-0.004128,-0.002865,-0.006635,0.00762,0.00973,0.0126,0.011505,0.003646,0.009204,-0.002936,0.005775,0.004855,-0.000615,0.00328,-0.003133,,0.00632,0.005155,0.002747,0.00247,-0.001361,0.009118,-0.000546,0.003845,-0.006619,-0.00168,-0.001773,-0.001633,-0.005462,-0.003475,0.004138,0.001861,0.006906,-0.00581,-0.00105,-0.006927,0.002468,0.002265,0.001825,-0.000454,0.007222,-0.001656,-0.00172,-0.002358


In [14]:
# Remove the opinion_* and rate_* survey columns from df_cleaned in place.
opinion_and_rating_columns = [
    'opinion_mortality',
    'opinion_infection',
    'rate_government_action',
    'rate_government_control',
    'rate_government_spend',
    'rate_reducing_risk_single',
    'rate_reducing_risk_single_social_distancing',
    'rate_reducing_risk_single_washing_hands',
    'rate_reducing_risk_house',
    'rate_reducing_risk_house_social_distancing',
    'rate_reducing_risk_house_washing_hands',
    'rate_reducing_risk_single_sanitizer',
    'rate_reducing_mask',
]
df_cleaned.drop(columns=opinion_and_rating_columns, inplace=True)

In [15]:
# Pearson correlation table after the opinion_* and rate_* columns are removed.
df_cleaned.corr(method='pearson')

Unnamed: 0,sex__female,sex__male,sex__other,blood_type__abn,blood_type__abp,blood_type__an,blood_type__ap,blood_type__bn,blood_type__bp,blood_type__on,blood_type__op,blood_type__unknown,smoking__never,smoking__quit0,smoking__quit10,smoking__quit5,smoking__vape,smoking__yesheavy,smoking__yeslight,smoking__yesmedium,working__home,working__never,working__stopped,working__travel critical,working__travel non critical,insurance__blank,insurance__no,insurance__yes,age,bmi,alcohol,cannabis,amphetamines,cocaine,lsd,mdma,contacts_count,house_count,public_transport_count,covid19_positive,covid19_symptoms,covid19_contact,asthma,kidney_disease,liver_disease,compromised_immune,heart_disease,lung_disease,diabetes,hiv_positive,hypertension,other_chronic,nursing_home,health_worker,risk_infection
sex__female,1.0,-0.992548,-0.064328,0.004599,0.000274,0.030169,0.032327,0.009321,0.012234,0.016895,0.022425,-0.081373,0.037033,-0.025741,-0.005637,0.002069,-0.017907,-0.030107,-0.017462,-0.003858,-0.033109,0.013594,0.063092,-0.003913,-0.074787,0.005263,-0.015609,0.01098,-0.07564,0.053199,-0.082647,-0.010945,-0.022868,-0.054711,-0.050493,-0.046011,-0.002288,0.021892,-0.059478,-0.013981,0.000134,0.000747,0.087026,-0.00562,-0.010219,0.094931,-0.048595,-0.004278,-0.038792,-0.050804,-0.087607,0.092765,-0.002145,0.073297,-0.013692
sex__male,-0.992548,1.0,-0.057756,-0.004681,-0.000254,-0.03064,-0.031819,-0.009627,-0.011909,-0.016926,-0.02157,0.080307,-0.034902,0.024687,0.006284,-0.002919,0.015929,0.030412,0.015803,0.003458,0.03352,-0.012617,-0.06468,0.003993,0.075229,-0.006562,0.014141,-0.008909,0.079503,-0.05505,0.084232,0.003474,0.020769,0.052118,0.044794,0.042519,0.003293,-0.020853,0.058982,0.014239,-0.001123,-0.000618,-0.089531,0.006135,0.01,-0.097482,0.048802,0.003768,0.038525,0.050209,0.08842,-0.097184,0.002274,-0.073086,0.013599
sex__other,-0.064328,-0.057756,1.0,0.000657,-0.000162,0.003756,-0.004269,0.002474,-0.002707,0.000199,-0.00707,0.008994,-0.01757,0.008717,-0.005279,0.006954,0.016261,-0.002403,0.013641,0.003284,-0.00325,-0.008047,0.012796,-0.000638,-0.003373,0.01062,0.012069,-0.01699,-0.03139,0.014989,-0.012711,0.049786,0.014077,0.017596,0.038064,0.023409,-0.008222,-0.008583,0.004261,-0.002069,0.008099,-0.001054,0.020225,-0.0042,0.001826,0.020582,-0.001534,0.004192,0.002311,0.005038,-0.006374,0.035882,-0.001045,-0.001969,0.000811
blood_type__abn,0.004599,-0.004681,0.000657,1.0,-0.015851,-0.019114,-0.04397,-0.011241,-0.02336,-0.025594,-0.048144,-0.060008,-0.006784,-0.001592,0.010657,0.000155,0.001371,0.001834,0.000685,-0.000892,-0.000332,-0.000994,-0.001576,-9e-05,0.004097,-0.002703,0.000472,0.00114,0.002948,0.003216,0.000285,0.006353,0.011,0.004075,0.009179,0.009959,-0.002257,-0.003018,0.004129,-0.00377,-0.00396,-0.000576,0.005716,0.002038,0.003132,0.006968,0.000502,0.003422,-0.000638,-0.003223,-0.002015,0.000468,-0.00284,0.001019,-0.001654
blood_type__abp,0.000274,-0.000254,-0.000162,-0.015851,1.0,-0.039554,-0.090989,-0.02326,-0.048339,-0.052963,-0.099626,-0.124177,-0.009428,0.00243,0.005382,0.004285,-0.002649,0.003016,0.001837,0.005249,-0.001709,4.9e-05,-0.003235,0.001758,0.004256,-0.004297,-0.001676,0.003993,-0.00545,0.00441,0.001287,0.001141,-0.002113,-0.000425,0.002784,0.00063,0.001673,0.006726,0.008432,-0.000201,0.002834,0.005883,0.001324,-0.002247,-0.000107,0.007556,0.002707,0.004156,0.001923,-0.001295,-0.000856,0.0011,-0.00535,0.004902,0.007772
blood_type__an,0.030169,-0.03064,0.003756,-0.019114,-0.039554,1.0,-0.10972,-0.028049,-0.05829,-0.063866,-0.120136,-0.14974,0.000768,-0.001786,0.005414,0.001552,-0.001511,0.003957,-0.004945,-0.005701,0.001057,-0.004172,0.00292,0.00542,-0.005415,-0.010219,-0.008209,0.01329,0.012262,0.006755,0.008264,-0.009581,-0.001801,-0.002158,-0.00337,0.00113,0.004469,0.005331,-0.002911,-0.007574,-0.00178,-0.005235,-0.00051,0.004751,0.008163,0.007766,-0.00024,0.002659,-0.004255,0.002586,0.004707,0.002264,-0.003304,0.007512,-0.003942
blood_type__ap,0.032327,-0.031819,-0.004269,-0.04397,-0.090989,-0.10972,1.0,-0.064523,-0.13409,-0.146917,-0.276358,-0.34446,0.019166,-0.01131,0.002081,-0.004063,-0.019398,-0.005976,-0.003647,-0.004674,0.000832,0.013024,-0.015306,0.012127,-0.010614,-0.023771,-0.030557,0.041214,0.0347,-1.3e-05,-0.002094,-0.048609,-0.010395,-0.017891,-0.009459,-0.01303,0.014058,0.020386,-0.00196,0.004557,-0.003631,0.011027,-0.002036,0.005003,0.005996,-0.00108,0.004956,0.006505,0.002564,-0.003717,0.009632,0.006342,-0.001863,0.034067,0.017045
blood_type__bn,0.009321,-0.009627,0.002474,-0.011241,-0.02326,-0.028049,-0.064523,1.0,-0.034279,-0.037558,-0.070648,-0.088058,-0.001888,0.003848,-0.000362,0.001156,-0.002317,0.000479,0.001654,-0.000112,0.003229,-0.002577,0.002081,-0.000368,-0.002014,-0.003986,0.001165,0.00126,0.002821,0.003556,0.004177,0.000215,0.010627,-0.000382,0.009216,0.008848,-0.003241,-0.001157,0.002124,-0.003371,0.000631,-0.000108,-0.000977,0.003475,0.003117,0.005971,-0.004522,-0.002871,-0.001679,0.006995,0.001119,0.003878,0.000164,0.00012,-0.001633
blood_type__bp,0.012234,-0.011909,-0.002707,-0.02336,-0.048339,-0.05829,-0.13409,-0.034279,1.0,-0.078052,-0.146819,-0.182999,0.005471,-0.000888,-0.001597,-0.000543,-0.003635,-0.004651,0.001784,-0.004149,-0.001688,0.001121,-0.004799,0.009007,-0.003141,-0.006692,-0.018189,0.020217,0.002207,-0.004872,-0.014956,-0.017832,-0.002374,-0.011397,-0.011328,-0.006769,0.004381,0.017234,0.004937,0.00352,0.000439,0.004096,-0.003699,-0.000557,-0.002923,-0.003304,-0.007243,-0.003289,0.000469,-0.003964,0.005392,0.000581,-0.001775,0.017932,0.010814
blood_type__on,0.016895,-0.016926,0.000199,-0.025594,-0.052963,-0.063866,-0.146917,-0.037558,-0.078052,1.0,-0.160863,-0.200504,-0.010714,9e-06,0.00501,0.005027,0.000208,-0.001005,0.004068,0.00768,0.00135,0.000102,-0.00277,0.00599,-0.004128,-0.002865,-0.006635,0.00762,0.00973,0.0126,0.011505,0.003646,0.009204,-0.002936,0.005775,0.004855,-0.000615,0.00328,-0.003133,-0.001773,-0.001633,-0.005462,-0.003475,0.004138,0.001861,0.006906,-0.00581,-0.00105,-0.006927,0.002468,0.002265,0.001825,-0.000454,0.007222,-0.002358


In [16]:
# Remove the alcohol and drug-use columns from df_cleaned in place
# (presumably because the drug columns are ~74% missing in df_v1 -- confirm).
substance_columns = [
    'alcohol',
    'cannabis',
    'amphetamines',
    'cocaine',
    'lsd',
    'mdma',
]
df_cleaned.drop(columns=substance_columns, inplace=True)

In [17]:
# Pearson correlation table of the remaining feature columns and the target.
df_cleaned.corr(method='pearson')

Unnamed: 0,sex__female,sex__male,sex__other,blood_type__abn,blood_type__abp,blood_type__an,blood_type__ap,blood_type__bn,blood_type__bp,blood_type__on,blood_type__op,blood_type__unknown,smoking__never,smoking__quit0,smoking__quit10,smoking__quit5,smoking__vape,smoking__yesheavy,smoking__yeslight,smoking__yesmedium,working__home,working__never,working__stopped,working__travel critical,working__travel non critical,insurance__blank,insurance__no,insurance__yes,age,bmi,contacts_count,house_count,public_transport_count,covid19_positive,covid19_symptoms,covid19_contact,asthma,kidney_disease,liver_disease,compromised_immune,heart_disease,lung_disease,diabetes,hiv_positive,hypertension,other_chronic,nursing_home,health_worker,risk_infection
sex__female,1.0,-0.992548,-0.064328,0.004599,0.000274,0.030169,0.032327,0.009321,0.012234,0.016895,0.022425,-0.081373,0.037033,-0.025741,-0.005637,0.002069,-0.017907,-0.030107,-0.017462,-0.003858,-0.033109,0.013594,0.063092,-0.003913,-0.074787,0.005263,-0.015609,0.01098,-0.07564,0.053199,-0.002288,0.021892,-0.059478,-0.013981,0.000134,0.000747,0.087026,-0.00562,-0.010219,0.094931,-0.048595,-0.004278,-0.038792,-0.050804,-0.087607,0.092765,-0.002145,0.073297,-0.013692
sex__male,-0.992548,1.0,-0.057756,-0.004681,-0.000254,-0.03064,-0.031819,-0.009627,-0.011909,-0.016926,-0.02157,0.080307,-0.034902,0.024687,0.006284,-0.002919,0.015929,0.030412,0.015803,0.003458,0.03352,-0.012617,-0.06468,0.003993,0.075229,-0.006562,0.014141,-0.008909,0.079503,-0.05505,0.003293,-0.020853,0.058982,0.014239,-0.001123,-0.000618,-0.089531,0.006135,0.01,-0.097482,0.048802,0.003768,0.038525,0.050209,0.08842,-0.097184,0.002274,-0.073086,0.013599
sex__other,-0.064328,-0.057756,1.0,0.000657,-0.000162,0.003756,-0.004269,0.002474,-0.002707,0.000199,-0.00707,0.008994,-0.01757,0.008717,-0.005279,0.006954,0.016261,-0.002403,0.013641,0.003284,-0.00325,-0.008047,0.012796,-0.000638,-0.003373,0.01062,0.012069,-0.01699,-0.03139,0.014989,-0.008222,-0.008583,0.004261,-0.002069,0.008099,-0.001054,0.020225,-0.0042,0.001826,0.020582,-0.001534,0.004192,0.002311,0.005038,-0.006374,0.035882,-0.001045,-0.001969,0.000811
blood_type__abn,0.004599,-0.004681,0.000657,1.0,-0.015851,-0.019114,-0.04397,-0.011241,-0.02336,-0.025594,-0.048144,-0.060008,-0.006784,-0.001592,0.010657,0.000155,0.001371,0.001834,0.000685,-0.000892,-0.000332,-0.000994,-0.001576,-9e-05,0.004097,-0.002703,0.000472,0.00114,0.002948,0.003216,-0.002257,-0.003018,0.004129,-0.00377,-0.00396,-0.000576,0.005716,0.002038,0.003132,0.006968,0.000502,0.003422,-0.000638,-0.003223,-0.002015,0.000468,-0.00284,0.001019,-0.001654
blood_type__abp,0.000274,-0.000254,-0.000162,-0.015851,1.0,-0.039554,-0.090989,-0.02326,-0.048339,-0.052963,-0.099626,-0.124177,-0.009428,0.00243,0.005382,0.004285,-0.002649,0.003016,0.001837,0.005249,-0.001709,4.9e-05,-0.003235,0.001758,0.004256,-0.004297,-0.001676,0.003993,-0.00545,0.00441,0.001673,0.006726,0.008432,-0.000201,0.002834,0.005883,0.001324,-0.002247,-0.000107,0.007556,0.002707,0.004156,0.001923,-0.001295,-0.000856,0.0011,-0.00535,0.004902,0.007772
blood_type__an,0.030169,-0.03064,0.003756,-0.019114,-0.039554,1.0,-0.10972,-0.028049,-0.05829,-0.063866,-0.120136,-0.14974,0.000768,-0.001786,0.005414,0.001552,-0.001511,0.003957,-0.004945,-0.005701,0.001057,-0.004172,0.00292,0.00542,-0.005415,-0.010219,-0.008209,0.01329,0.012262,0.006755,0.004469,0.005331,-0.002911,-0.007574,-0.00178,-0.005235,-0.00051,0.004751,0.008163,0.007766,-0.00024,0.002659,-0.004255,0.002586,0.004707,0.002264,-0.003304,0.007512,-0.003942
blood_type__ap,0.032327,-0.031819,-0.004269,-0.04397,-0.090989,-0.10972,1.0,-0.064523,-0.13409,-0.146917,-0.276358,-0.34446,0.019166,-0.01131,0.002081,-0.004063,-0.019398,-0.005976,-0.003647,-0.004674,0.000832,0.013024,-0.015306,0.012127,-0.010614,-0.023771,-0.030557,0.041214,0.0347,-1.3e-05,0.014058,0.020386,-0.00196,0.004557,-0.003631,0.011027,-0.002036,0.005003,0.005996,-0.00108,0.004956,0.006505,0.002564,-0.003717,0.009632,0.006342,-0.001863,0.034067,0.017045
blood_type__bn,0.009321,-0.009627,0.002474,-0.011241,-0.02326,-0.028049,-0.064523,1.0,-0.034279,-0.037558,-0.070648,-0.088058,-0.001888,0.003848,-0.000362,0.001156,-0.002317,0.000479,0.001654,-0.000112,0.003229,-0.002577,0.002081,-0.000368,-0.002014,-0.003986,0.001165,0.00126,0.002821,0.003556,-0.003241,-0.001157,0.002124,-0.003371,0.000631,-0.000108,-0.000977,0.003475,0.003117,0.005971,-0.004522,-0.002871,-0.001679,0.006995,0.001119,0.003878,0.000164,0.00012,-0.001633
blood_type__bp,0.012234,-0.011909,-0.002707,-0.02336,-0.048339,-0.05829,-0.13409,-0.034279,1.0,-0.078052,-0.146819,-0.182999,0.005471,-0.000888,-0.001597,-0.000543,-0.003635,-0.004651,0.001784,-0.004149,-0.001688,0.001121,-0.004799,0.009007,-0.003141,-0.006692,-0.018189,0.020217,0.002207,-0.004872,0.004381,0.017234,0.004937,0.00352,0.000439,0.004096,-0.003699,-0.000557,-0.002923,-0.003304,-0.007243,-0.003289,0.000469,-0.003964,0.005392,0.000581,-0.001775,0.017932,0.010814
blood_type__on,0.016895,-0.016926,0.000199,-0.025594,-0.052963,-0.063866,-0.146917,-0.037558,-0.078052,1.0,-0.160863,-0.200504,-0.010714,9e-06,0.00501,0.005027,0.000208,-0.001005,0.004068,0.00768,0.00135,0.000102,-0.00277,0.00599,-0.004128,-0.002865,-0.006635,0.00762,0.00973,0.0126,-0.000615,0.00328,-0.003133,-0.001773,-0.001633,-0.005462,-0.003475,0.004138,0.001861,0.006906,-0.00581,-0.00105,-0.006927,0.002468,0.002265,0.001825,-0.000454,0.007222,-0.002358


# Loading the further-cleaned dataset

In [24]:
# Read the cleaned dataset from disk and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call — confirm).
data = pd.read_csv('cleaned_covid_dataset.csv').drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,sex__female,sex__male,sex__other,blood_type__abn,blood_type__abp,blood_type__an,blood_type__ap,blood_type__bn,blood_type__bp,blood_type__on,blood_type__op,blood_type__unknown,smoking__never,smoking__quit0,smoking__quit10,smoking__quit5,smoking__vape,smoking__yesheavy,smoking__yeslight,smoking__yesmedium,working__home,working__never,working__stopped,working__travel critical,working__travel non critical,insurance__blank,insurance__no,insurance__yes,age,bmi,contacts_count,house_count,public_transport_count,covid19_positive,covid19_symptoms,covid19_contact,asthma,kidney_disease,liver_disease,compromised_immune,heart_disease,lung_disease,diabetes,hiv_positive,hypertension,other_chronic,nursing_home,health_worker,risk_infection
0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,45.0,21.2,9,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13.0
1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,35.0,25.2,3,3.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,5.0
2,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,55.0,32.8,4,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.0
3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,35.0,37.8,1,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,6.0
4,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,65.0,22.8,0,1.0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,5.0


In [57]:
# Filling the missing age values with the column mean.
# The result is assigned back rather than calling fillna(inplace=True) on
# `data.age`: the attribute access yields an intermediate Series, so the
# in-place form is chained assignment, which pandas warns about and which
# leaves `data` unchanged once Copy-on-Write is enabled.
# NOTE(review): the mean is computed over the whole dataset, before the
# train/test split — confirm that is acceptable for the evaluation.
data['age'] = data['age'].fillna(data['age'].mean())
data['age'].unique()

array([ 45.        ,  35.        ,  55.        ,  65.        ,
        25.        ,  85.        ,  15.        ,   5.        ,
        75.        ,  95.        , 105.        ,  44.25330221])

In [58]:
# Separate the regression target (risk_infection) from the predictor columns.
X = data.drop(columns=['risk_infection'])
Y = data.loc[:, 'risk_infection']
print(Y.shape, X.shape)

(188511,) (188511, 48)


## Creating the train and test split

In [60]:
# Hold out 1% of the rows for evaluation; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.01, random_state=0)

# Work with plain NumPy arrays from here on.
X_train = np.array(X_train)
X_test = np.array(X_test)

# Targets are kept as (n_samples, 1) column vectors. reshape(-1, 1) infers the
# row count, so this cell no longer depends on hard-coded sizes
# (186625 / 1886) that break as soon as the dataset or test_size changes.
Y_train = np.array(Y_train).reshape(-1, 1)
Y_test = np.array(Y_test).reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(186625, 48)
(1886, 48)
(186625, 1)
(1886, 1)


In [62]:
# Fit a 2000-tree random forest regressor on the training split.
# n_jobs=-1 uses every available core; verbose=1 makes joblib log progress.
model = RandomForestRegressor(n_estimators=2000, random_state=0, n_jobs=-1, verbose=1)
# Y_train is an (n_samples, 1) column vector; flatten it to the 1-D shape that
# fit() expects for a single target, so scikit-learn does not emit a
# DataConversionWarning. The target values themselves are unchanged.
model.fit(X_train, np.ravel(Y_train))

  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 16.8min finished


RandomForestRegressor(n_estimators=2000, n_jobs=-1, random_state=0, verbose=1)

In [63]:
# Coefficient of determination (R^2) of the fitted forest on the held-out split.
model.score(X=X_test, y=Y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 2000 out of 2000 | elapsed:    0.1s finished


0.993476481873749

In [65]:
# Predict the target for every row of the held-out split.
pred = model.predict(X=X_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 2000 out of 2000 | elapsed:    0.1s finished


In [67]:
# Checking the predicted vs True Values
# Slicing caps the preview at 100 rows without assuming that many exist, so a
# smaller test split prints what is available instead of raising IndexError.
for predicted, actual in zip(pred[:100], Y_test[:100]):
    print("predicted " , predicted, "actual = ", actual)

predicted  5.0 actual =  [5.]
predicted  13.0 actual =  [13.]
predicted  5.0 actual =  [5.]
predicted  13.0 actual =  [13.]
predicted  13.0 actual =  [13.]
predicted  13.0 actual =  [13.]
predicted  19.0 actual =  [19.]
predicted  13.0 actual =  [13.]
predicted  37.0 actual =  [37.]
predicted  13.0 actual =  [13.]
predicted  5.0 actual =  [5.]
predicted  5.0 actual =  [5.]
predicted  50.0 actual =  [50.]
predicted  5.0 actual =  [5.]
predicted  37.0 actual =  [37.]
predicted  5.0 actual =  [5.]
predicted  5.0 actual =  [5.]
predicted  13.0 actual =  [13.]
predicted  13.0 actual =  [13.]
predicted  13.0 actual =  [13.]
predicted  5.0 actual =  [5.]
predicted  26.0 actual =  [26.]
predicted  5.0 actual =  [5.]
predicted  5.0 actual =  [5.]
predicted  5.0 actual =  [5.]
predicted  75.0 actual =  [75.]
predicted  43.0 actual =  [43.]
predicted  5.0 actual =  [5.]
predicted  5.0 actual =  [5.]
predicted  5.0 actual =  [5.]
predicted  5.0 actual =  [5.]
predicted  5.0 actual =  [5.]
predicte