In [54]:
# Import Modules

In [55]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime


In [56]:
# Load the raw CSV into a frame, then preview its first five rows transposed
# so that every dataset column is shown as a row.
data_2 = pd.read_csv('c1data.csv')
data_2.head().transpose()

Unnamed: 0,0,1,2,3,4
Age,23,47,47,28,61
Sex,F,M,M,F,F
BP,HIGH,LOW,LOW,NORMAL,LOW
Cholesterol,HIGH,HIGH,HIGH,HIGH,HIGH
Na_to_K,25.355,13.093,10.114,7.798,18.043
Drug,drugY,drugC,drugC,drugX,drugY


In [57]:
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and memory usage.
data_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [58]:
# Build data_3 as a copy of the raw frame without the columns named in `drop`;
# data_2 itself is not modified.
drop = ['Age']
data_3 = data_2.drop(columns=drop)

In [59]:
# Preview the first five rows after the column drop.
data_3.head()

Unnamed: 0,Sex,BP,Cholesterol,Na_to_K,Drug
0,F,HIGH,HIGH,25.355,drugY
1,M,LOW,HIGH,13.093,drugC
2,M,LOW,HIGH,10.114,drugC
3,F,NORMAL,HIGH,7.798,drugX
4,F,LOW,HIGH,18.043,drugY


In [60]:
# One-hot encode categorical features

In [61]:
# One-hot encode every categorical column (including the `Drug` label); the
# numeric `Na_to_K` column is not listed and therefore passes through unchanged.
# The dtype is pinned to uint8: that was the get_dummies default before
# pandas 2.0, where it became bool. Pinning it keeps the 0/1 indicator columns
# identical regardless of the installed pandas version.
data_4 = pd.get_dummies(
    data_3,
    columns=['Sex', 'BP', 'Cholesterol', 'Drug'],
    dtype=np.uint8,
)
print(data_4)

     Na_to_K  Sex_F  Sex_M  BP_HIGH  BP_LOW  BP_NORMAL  Cholesterol_HIGH  \
0     25.355      1      0        1       0          0                 1   
1     13.093      0      1        0       1          0                 1   
2     10.114      0      1        0       1          0                 1   
3      7.798      1      0        0       0          1                 1   
4     18.043      1      0        0       1          0                 1   
..       ...    ...    ...      ...     ...        ...               ...   
195   11.567      1      0        0       1          0                 1   
196   12.006      0      1        0       1          0                 1   
197    9.894      0      1        0       0          1                 1   
198   14.020      0      1        0       0          1                 0   
199   11.349      1      0        0       1          0                 0   

     Cholesterol_NORMAL  Drug_drugA  Drug_drugB  Drug_drugC  Drug_drugX  \
0           

In [63]:
# Split Data into training and testing

In [64]:
# Row counts that a 70% / 30% partition of data_4 corresponds to.
len(data_4) * .7, len(data_4) * .3

(140.0, 60.0)

In [65]:
# Hold out 30% of the rows for testing. The target is the `Na_to_K` column and
# the features are all remaining columns of data_4; the fixed random_state
# makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_4.drop(columns=['Na_to_K']),
    data_4['Na_to_K'],
    test_size=0.3,
    random_state=47,
)

In [66]:
# Shapes of the feature matrices produced by the split.
X_train.shape, X_test.shape

((140, 12), (60, 12))

In [67]:
# Shapes of the target vectors produced by the split.
y_train.shape, y_test.shape

((140,), (60,))

In [68]:
# Columns to set aside from the feature matrices. The list is currently empty,
# so this cell leaves both shapes unchanged.
List = []
# Keep the set-aside columns in their own frames before removing them.
Train = X_train[List]
Test = X_test[List]
# Reassign rather than drop(..., inplace=True): X_train / X_test are derived
# from data_4 by the split, and mutating such frames in place can raise
# SettingWithCopyWarning and silently changes objects that earlier cells
# already displayed.
X_train = X_train.drop(columns=List)
X_test = X_test.drop(columns=List)
X_train.shape, X_test.shape

((140, 12), (60, 12))

In [69]:
# Column dtypes of the training features.
X_train.dtypes

Sex_F                 uint8
Sex_M                 uint8
BP_HIGH               uint8
BP_LOW                uint8
BP_NORMAL             uint8
Cholesterol_HIGH      uint8
Cholesterol_NORMAL    uint8
Drug_drugA            uint8
Drug_drugB            uint8
Drug_drugC            uint8
Drug_drugX            uint8
Drug_drugY            uint8
dtype: object

In [70]:
# Column dtypes of the test features.
X_test.dtypes

Sex_F                 uint8
Sex_M                 uint8
BP_HIGH               uint8
BP_LOW                uint8
BP_NORMAL             uint8
Cholesterol_HIGH      uint8
Cholesterol_NORMAL    uint8
Drug_drugA            uint8
Drug_drugB            uint8
Drug_drugC            uint8
Drug_drugX            uint8
Drug_drugY            uint8
dtype: object

In [71]:
# Mean of the training target values.
t_mean = y_train.mean()
t_mean

16.05714285714285

In [72]:
# Baseline model: a DummyRegressor using the 'mean' strategy, fitted on the
# training split. `constant_` shows the value it stored after fitting.
dreg = DummyRegressor(strategy='mean').fit(X_train, y_train)
dreg.constant_

array([[16.05714286]])

In [73]:
# Use StandardScaler to transform numerical features

In [74]:
# Fit a StandardScaler on every column of the encoded frame.
# NOTE(review): data_ is the full data_4 frame, so it still contains the
# Na_to_K target and the rows that were split off as the test set. If this
# scaler is meant to feed a model, it should be fit on X_train only —
# confirm the intent before relying on it.
data_ = data_4
scaler = StandardScaler()
scaler.fit(data_)
# scaler.mean_ holds one mean per column, so label each value with its column
# name instead of printing the whole vector under a single "Na_to_K" label.
print(pd.Series(scaler.mean_, index=data_.columns, name="mean"))

Na_to_K [16.084485  0.48      0.52      0.385     0.32      0.295     0.515
  0.485     0.115     0.08      0.08      0.27      0.455   ]


In [75]:
# Standardize data_ with the scaler fitted above and print the resulting
# array for inspection.
tdata = scaler.transform(data_)
print("Transformed data:\n", tdata)

Transformed data:
 [[ 1.28652212  1.040833   -1.040833   ... -0.29488391 -0.60816364
   1.0944415 ]
 [-0.4151454  -0.96076892  0.96076892 ...  3.39116499 -0.60816364
  -0.91370804]
 [-0.82855818 -0.96076892  0.96076892 ...  3.39116499 -0.60816364
  -0.91370804]
 ...
 [-0.85908883 -0.96076892  0.96076892 ... -0.29488391  1.64429429
  -0.91370804]
 [-0.28650033 -0.96076892  0.96076892 ... -0.29488391  1.64429429
  -0.91370804]
 [-0.6571702   1.040833   -1.040833   ... -0.29488391  1.64429429
  -0.91370804]]
