# Template notebook

It's good to start with an introduction, to set the scene and introduce your audience to the data, and the problem you're solving as a team.

<br>

## Libraries
As always, we'll start by importing the necessary libraries.

In [2]:
# It's good practice to add comments to explain your code 
import numpy as np
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


**Import Data**

In [3]:
# Import data
full_df = pd.read_csv("data/corona_tested_individuals_ver_006.english.csv")

  full_df = pd.read_csv("data/corona_tested_individuals_ver_006.english.csv")


In [4]:
full_df

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,2020-04-30,0.0,0.0,0.0,0.0,0.0,negative,,female,Other
1,2020-04-30,1.0,0.0,0.0,0.0,0.0,negative,,female,Other
2,2020-04-30,0.0,1.0,0.0,0.0,0.0,negative,,male,Other
3,2020-04-30,1.0,0.0,0.0,0.0,0.0,negative,,female,Other
4,2020-04-30,1.0,0.0,0.0,0.0,0.0,negative,,male,Other
...,...,...,...,...,...,...,...,...,...,...
278843,2020-03-11,0.0,0.0,0.0,0.0,0.0,negative,,,Other
278844,2020-03-11,0.0,0.0,0.0,0.0,0.0,negative,,,Other
278845,2020-03-11,0.0,0.0,0.0,0.0,0.0,positive,,,Contact with confirmed
278846,2020-03-11,0.0,0.0,0.0,0.0,0.0,other,,,Other


In [5]:
df_dropped = full_df.dropna(subset=['cough','fever','sore_throat','shortness_of_breath','head_ache'])

In [6]:
## Add "Unknown Category"
df_dropped['age_60_and_above'] = df_dropped['age_60_and_above'].fillna("Unknown")
df_dropped['gender'] = df_dropped['gender'].fillna("Unknown")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dropped['age_60_and_above'] = df_dropped['age_60_and_above'].fillna("Unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dropped['gender'] = df_dropped['gender'].fillna("Unknown")


In [7]:
## Remove test results that aren't positive or negative
mask = df_dropped['corona_result'] == "other"
df_clean = df_dropped[~mask]

In [8]:
print(df_clean.shape)

(274702, 10)


## Feature Engineering

In [9]:
df_clean['test_date'] = pd.to_datetime(df_clean['test_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['test_date'] = pd.to_datetime(df_clean['test_date'])


In [10]:
## Use Get Dummies from Pandas to onehotencode
df_clean = pd.get_dummies(df_clean, dtype=float)

In [11]:
df_clean

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result_negative,corona_result_positive,age_60_and_above_No,age_60_and_above_Unknown,age_60_and_above_Yes,gender_Unknown,gender_female,gender_male,test_indication_Abroad,test_indication_Contact with confirmed,test_indication_Other
0,2020-04-30,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2020-04-30,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2020-04-30,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,2020-04-30,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,2020-04-30,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278842,2020-03-11,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
278843,2020-03-11,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
278844,2020-03-11,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
278845,2020-03-11,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [12]:
## Combine the "high bias" features into a single sum
df_clean['High_bias_features_sum'] = df_clean['sore_throat'] + df_clean['shortness_of_breath'] + df_clean['head_ache']

For now we'll keep the individual features in there too, but we might want to drop them later. 

In [13]:
# This is our clean and processed full data set 
df_clean.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result_negative,corona_result_positive,age_60_and_above_No,age_60_and_above_Unknown,age_60_and_above_Yes,gender_Unknown,gender_female,gender_male,test_indication_Abroad,test_indication_Contact with confirmed,test_indication_Other,High_bias_features_sum
0,2020-04-30,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2020-04-30,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2020-04-30,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,2020-04-30,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,2020-04-30,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [14]:
## Next define our X and y before moving to train_test_split stage

# X is everything except the results of the test (we'll use date for stratification then drop)
X = df_clean.drop(['corona_result_negative','corona_result_positive'], axis=1)

# y is a boolean of whether the test is positive
y = df_clean['corona_result_positive']

In [15]:
X.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above_No,age_60_and_above_Unknown,age_60_and_above_Yes,gender_Unknown,gender_female,gender_male,test_indication_Abroad,test_indication_Contact with confirmed,test_indication_Other,High_bias_features_sum
0,2020-04-30,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2020-04-30,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2020-04-30,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,2020-04-30,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,2020-04-30,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Train Test Split

In [1]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify = X['test_date'])

In [18]:
## Now we've stratified by date we can drop it
X_train = X_train.drop(['test_date'], axis=1)
X_test = X_test.drop(['test_date'], axis=1)