In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
train_data = pd.read_csv('/Users/devirughani/Desktop/Ironhack/Week_7/Day_5/Ironhack_Modelling/Data/train.csv')
test_data=pd.read_csv('/Users/devirughani/Desktop/Ironhack/Week_7/Day_5/Ironhack_Modelling/Data/test_no_class.csv')

In [6]:
train_data.dtypes

AGE                 int64
SEX                object
STEROID            object
ANTIVIRALS          int64
FATIGUE             int64
MALAISE             int64
ANOREXIA            int64
LIVER BIG          object
LIVER FIRM         object
SPLEEN PALPABLE    object
SPIDERS            object
ASCITES            object
VARICES            object
BILIRUBIN          object
ALK PHOSPHATE      object
SGOT               object
ALBUMIN            object
PROTIME            object
HISTOLOGY           int64
Class              object
dtype: object

In [7]:
train_data.head(20)

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,Class
0,48,meale,1,2,1,1,2,2,1,2,1,1,1,4.8,123,157,2.7,31,2,DIE
1,51,Male,2,2,1,2,2,2,1,1,1,2,1,1.0,?,20,3.0,63,2,LIVE
2,40,m,1,2,1,2,2,2,1,2,2,2,2,0.6,62,166,4.0,63,1,LIVE
3,25,m,2,2,1,2,2,1,1,1,1,1,1,1.3,181,181,4.5,57,2,LIVE
4,34,M,1,2,1,2,2,1,1,2,1,2,2,1.0,72,46,4.4,57,1,LIVE
5,40,m,1,1,1,1,1,1,1,2,2,2,2,0.6,40,69,4.2,67,2,LIVE
6,52,M,1,1,2,2,2,2,2,2,2,2,2,0.7,75,55,4.0,21,1,LIVE
7,31,maled,2,2,2,2,2,2,2,2,2,2,2,1.0,85,20,4.0,100,1,LIVE
8,51,male,1,1,1,1,2,2,2,2,2,2,2,1.0,78,58,4.6,52,1,LIVE
9,62,F,2,2,1,1,2,2,1,2,1,2,2,1.3,141,156,3.9,58,1,LIVE


In [17]:
# Get rid of rows with lots of NAs
ml = list(train_data[train_data['SPIDERS'].notna()].index)
train_data = train_data.loc[ml,:]

In [18]:
y = train_data['Class']
X = train_data.drop(['Class'], axis = 1)

In [19]:
def preprocessing (data):
    # NUll values
    data = data.replace('?', np.NaN)
    
    
    
    # Cleaning Gender Column
    def clean_gender(x):
        if x in ['m','M', 'meale', 'Male', 'male', 'maled']:
            return '0'
        else:
            return '1'
    
    data['SEX'] = data['SEX'].apply(clean_gender)
    
    data = data.apply(pd.to_numeric)
    
    # Manually Replacing NAs for certain columns
    
    # Steroid
    data['STEROID'] = data['STEROID'].fillna(np.random.choice([1,2]))
    
    # Liver Big
    liver_new = []

    for val in data['LIVER BIG']:
        if pd.isna(val)==True:
            val_new = np.random.choice([1,2])
            liver_new.append(val_new)
        else:
            liver_new.append(val)

    data['LIVER BIG']=liver_new
    
    #Liver Firm
    liver_new2 = []

    for val in data['LIVER FIRM']:
        if pd.isna(val)==True:
            val_new = np.random.choice([1,2])
            liver_new2.append(val_new)
        else:
            liver_new2.append(val)


    data['LIVER FIRM']=liver_new2
    
    # K-NN Imputation
    from sklearn.impute import KNNImputer

    imputer = KNNImputer(missing_values=np.nan)
    imputed_data = imputer.fit_transform(data)  
    df_new = pd.DataFrame(imputed_data)
    df_new.columns = data.columns
    
   
    return df_new

In [23]:
X=preprocessing(X)
X

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,48.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,4.8,123.0,157.0,2.7,31.0,2.0
1,51.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,79.6,20.0,3.0,63.0,2.0
2,40.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.6,62.0,166.0,4.0,63.0,1.0
3,25.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.3,181.0,181.0,4.5,57.0,2.0
4,34.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,72.0,46.0,4.4,57.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,23.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.3,194.0,150.0,4.1,90.0,1.0
98,38.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.4,243.0,49.0,3.8,90.0,2.0
99,36.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,0.8,85.0,44.0,4.2,85.0,1.0
100,32.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,55.0,45.0,4.1,56.0,1.0


## Train -Test Split

In [24]:
X_train_prescaled, X_test_prescaled, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## MinMax Scaler

In [64]:
from sklearn.preprocessing import MinMaxScaler
import pickle
scaler = MinMaxScaler()
scaler.fit(X_train_prescaled)
filename = "/Users/devirughani/Desktop/IronHack/Week_7/Day_5/Ironhack_Modelling/Model_2/scaler2.pickle"
with open(filename, "wb") as f:
    pickle.dump(scaler,f)
X_train_normalized_np = scaler.transform(X_train_prescaled)
X_test_normalized_np  = scaler.transform(X_test_prescaled)

X_train_normalized_df=pd.DataFrame(X_train_normalized_np , columns=X_train_prescaled.columns, index=X_train_prescaled.index)
X_test_normalized_df=pd.DataFrame(X_test_normalized_np, columns=X_test_prescaled.columns, index=X_test_prescaled.index)

## Upsampling Data

In [27]:
trainset = pd.concat([X_train_normalized_df, y_train], axis=1)
trainset['Class'].value_counts()

LIVE    64
DIE     17
Name: Class, dtype: int64

In [65]:
#Could use SMOTE instead - this generates new observations

category_die = trainset[trainset['Class']== 'DIE'].sample(len(trainset[trainset['Class']== 'LIVE']), replace=True)
print(category_die.shape)

category_live = trainset[trainset['Class']== 'LIVE' ]
trainset_new = pd.concat([category_die, category_live], axis = 0)  #concat normal df with upsampled df
trainset_new = trainset_new.sample(frac =1) #randomize the rows
X_train_normalized = trainset_new.drop(['Class'], axis=1)
y_train = trainset_new['Class']
#data = data.reset_index(drop=True)
print(X_train_normalized.shape)


trainset_new['Class'].value_counts()

(64, 20)
(128, 19)


LIVE    64
DIE     64
Name: Class, dtype: int64

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score



model1 = DecisionTreeClassifier()
model2 = LogisticRegression()
model3 = KNeighborsClassifier()
model4= RandomForestClassifier()


model_pipeline = [model1, model2, model3, model4]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN', 'Random Forest']
scores = {}
for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_normalized, y_train, cv=5))
    scores[model_name] = mean_score
print(scores)


{'Decision Tree Regressor': 0.9301538461538461, 'Linear Regression': 0.8898461538461537, 'KNN': 0.8904615384615384, 'Random Forest': 0.960923076923077}


In [31]:
test_data_clean=preprocessing(test_data)
test_data_clean.head()

final_test_normalized = scaler.transform(test_data_clean)
final_test_normalized_df=pd.DataFrame(final_test_normalized, columns=test_data_clean.columns, index=test_data_clean.index)

## Random Forest 

In [32]:
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.8,
                             random_state = 28)
clf.fit(X_train_normalized, y_train)


print("The accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train_normalized, y_train)))
y_pred_train = clf.predict(X_train_normalized)
display(y_train.value_counts())
display(confusion_matrix(y_train, y_pred_train))

#print("The accuracy for the Random Forest in the TEST  set is {:.2f}".format(clf.score(final_test_normalized_df, y_test)))

y_pred_test = clf.predict(final_test_normalized_df)

y_pred_test

The accuracy for the Random Forest in the TRAIN set is 0.89


DIE     64
LIVE    64
Name: Class, dtype: int64

array([[58,  6],
       [ 8, 56]])

array(['LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'DIE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'LIVE', 'DIE', 'DIE', 'LIVE', 'DIE', 'LIVE', 'DIE', 'LIVE', 'DIE',
       'LIVE', 'DIE', 'LIVE', 'DIE', 'DIE', 'DIE', 'DIE', 'LIVE', 'LIVE',
       'LIVE', 'DIE', 'LIVE', 'DIE', 'LIVE', 'DIE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'LIVE'], dtype=object)

In [39]:
filename = "/Users/devirughani/Desktop/IronHack/Week_7/Day_5/Ironhack_Modelling/Model_2/random_forest_model2.pickle"
with open(filename, "wb") as f:
    pickle.dump(clf,f)

In [66]:
y_pred=pd.DataFrame(y_pred_test)
y_pred=y_pred.rename({0:'Class'}, axis=1)
y_pred.to_csv('/Users/devirughani/Desktop/IronHack/Week_7/Day_5/Ironhack_Modelling/Predictions/Robert_Devi_predictions2.csv')

## Gradient Boosting

In [50]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()

gb.fit(X_train_normalized,y_train)

print("The R2 of the model in the TRAIN set is: {:.2f}".format(gb.score(X_train_normalized,y_train)))
y_train_pred = gb.predict(X_train_normalized)


y_test_pred3  = gb.predict(final_test_normalized_df)

y_test_pred3=pd.DataFrame(y_test_pred3)

y_test_pred3.to_csv('/Users/devirughani/Desktop/IronHack/Week_7/Day_5/Ironhack_Modelling/Predictions/Robert_Devi_predictions3.csv')

The R2 of the model in the TRAIN set is: 1.00


In [45]:
X_train_normalized.shape

(128, 19)

In [44]:
y_train.shape

(128,)

In [52]:
filename = "/Users/devirughani/Desktop/IronHack/Week_7/Day_5/Ironhack_Modelling/Model_3/gradient_boost_model3.pickle"
with open(filename, "wb") as f:
    pickle.dump(gb,f)

## Grid Search

In [54]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 150], #3*2*2*2*1=3*8=24 
    'min_samples_split': [2, 4], 
    'min_samples_leaf' : [1, 2],
    'max_depth':[3,5],
    'max_features': ['sqrt']
    }

gb2 = GradientBoostingClassifier(random_state=100)

grid_search = GridSearchCV(gb2, param_grid, cv=5,return_train_score=True,n_jobs=2, verbose = 2) 
#-1 means use all processorrs in computer but dangerous if you're not using Google collab
grid_search.fit(X_train_normalized,y_train)
grid_search.best_params_ #To check the best set of parameters returned

Fitting 5 folds for each of 24 candidates, totalling 120 fits


{'max_depth': 3,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 150}

In [60]:
from sklearn.model_selection import cross_val_score

gb2 = GradientBoostingClassifier(random_state=100, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=150)
gb2.fit(X_train_normalized,y_train)
cross_val_scores = cross_val_score(gb2, X_train_normalized, y_train, cv=5)
print("The mean R2 of over the folds was {:.2f}".format(np.mean(cross_val_scores)))


The mean R2 of over the folds was 0.96


In [61]:
y_test_pred4= gb2.predict(final_test_normalized_df)
y_test_pred4=pd.DataFrame(y_test_pred4)
y_test_pred4.to_csv('/Users/devirughani/Desktop/IronHack/Week_7/Day_5/Ironhack_Modelling/Predictions/Robert_Devi_predictions4.csv')

In [68]:
filename = "/Users/devirughani/Desktop/IronHack/Week_7/Day_5/Ironhack_Modelling/Model_4/gradient_boost_model4.pickle"
with open(filename, "wb") as f:
    pickle.dump(gb2,f)