In [7]:
import pandas as pd
import numpy as np

In [8]:
# Load the raw dataset from the CSV next to the notebook and preview it.
csv_path = 'apollo_data.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,age,sex,smoker,region,viral load,severity level,hospitalization charges
0,0,19,female,yes,southwest,9.3,0,42212
1,1,18,male,no,southeast,11.26,1,4314
2,2,28,male,no,southeast,11.0,3,11124
3,3,33,male,no,northwest,7.57,0,54961
4,4,32,male,no,northwest,9.63,0,9667


In [9]:
# Drop the redundant 'Unnamed: 0' column (presumably a row index that was
# written into the CSV when it was saved -- its values mirror the index).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time is a no-op rather
# than a KeyError on the already-removed column.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

Unnamed: 0,age,sex,smoker,region,viral load,severity level,hospitalization charges
0,19,female,yes,southwest,9.3,0,42212
1,18,male,no,southeast,11.26,1,4314
2,28,male,no,southeast,11.0,3,11124
3,33,male,no,northwest,7.57,0,54961
4,32,male,no,northwest,9.63,0,9667


In [10]:
# Data cleaning starts here: inspect each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      1338 non-null   int64  
 1   sex                      1338 non-null   object 
 2   smoker                   1338 non-null   object 
 3   region                   1338 non-null   object 
 4   viral load               1338 non-null   float64
 5   severity level           1338 non-null   int64  
 6   hospitalization charges  1338 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 73.3+ KB


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,viral load,severity level,hospitalization charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,10.221233,1.094918,33176.058296
std,14.04996,2.032796,1.205493,30275.029296
min,18.0,5.32,0.0,2805.0
25%,27.0,8.7625,0.0,11851.0
50%,39.0,10.13,1.0,23455.0
75%,51.0,11.5675,2.0,41599.5
max,64.0,17.71,5.0,159426.0


In [12]:
# Count missing values per column; a column of zeros means nothing needs imputing.
df.isna().sum()

age                        0
sex                        0
smoker                     0
region                     0
viral load                 0
severity level             0
hospitalization charges    0
dtype: int64

In [14]:
# Business problem no. 1
# The categorical columns have to be turned into dummy variables first,
# so list the distinct values of each one, starting with 'sex'.

df['sex'].unique()

array(['female', 'male'], dtype=object)

In [15]:
# Distinct values present in the 'smoker' column.
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [16]:
# Distinct values present in the 'region' column.
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [17]:
# One-hot encode the three categorical columns; every category gets its own
# indicator column and the numeric columns are carried over unchanged.
categorical_columns = ['sex', 'smoker', 'region']
df_dummies = pd.get_dummies(data=df, columns=categorical_columns)
df_dummies.head(n=5)

Unnamed: 0,age,viral load,severity level,hospitalization charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,9.3,0,42212,True,False,False,True,False,False,False,True
1,18,11.26,1,4314,False,True,True,False,False,False,True,False
2,28,11.0,3,11124,False,True,True,False,False,False,True,False
3,33,7.57,0,54961,False,True,True,False,False,True,False,False
4,32,9.63,0,9667,False,True,True,False,False,True,False,False


In [19]:
# Next step: convert the True/False dummy columns into 0/1 integers.
# List the column names first so the dummy columns can be picked out.

df_dummies.columns

Index(['age', 'viral load', 'severity level', 'hospitalization charges',
       'sex_female', 'sex_male', 'smoker_no', 'smoker_yes', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [20]:
# Indicator columns created by the one-hot encoding step.
dummy_columns = [
    'sex_female', 'sex_male',
    'smoker_no', 'smoker_yes',
    'region_northeast', 'region_northwest',
    'region_southeast', 'region_southwest',
]

# Cast all indicator columns from bool to integer 0/1 in a single assignment.
df_dummies[dummy_columns] = df_dummies[dummy_columns].astype(int)

df_dummies.head(n=5)

Unnamed: 0,age,viral load,severity level,hospitalization charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,9.3,0,42212,1,0,0,1,0,0,0,1
1,18,11.26,1,4314,0,1,1,0,0,0,1,0
2,28,11.0,3,11124,0,1,1,0,0,0,1,0
3,33,7.57,0,54961,0,1,1,0,0,1,0,0
4,32,9.63,0,9667,0,1,1,0,0,1,0,0
