## Problem statement:
Build a model that determines whether or not the patient's symptoms indicate that the patient has hypothyroid.

## Metric for success

In [21]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score

In [22]:
#loading our datasets
df = pd.read_csv(r'C:\Users\Gideon Kipkorir\Desktop\colab\Datasets\hypothyroid.csv')
df.head()

Unnamed: 0,status,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,...,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG
0,hypothyroid,72,M,f,f,f,f,f,f,f,...,y,0.6,y,15,y,1.48,y,10,n,?
1,hypothyroid,15,F,t,f,f,f,f,f,f,...,y,1.7,y,19,y,1.13,y,17,n,?
2,hypothyroid,24,M,f,f,f,f,f,f,f,...,y,0.2,y,4,y,1.0,y,0,n,?
3,hypothyroid,24,F,f,f,f,f,f,f,f,...,y,0.4,y,6,y,1.04,y,6,n,?
4,hypothyroid,77,M,f,f,f,f,f,f,f,...,y,1.2,y,57,y,1.28,y,44,n,?


## Eploratory Data Analysis

In [23]:
#columns and rows
df.shape

(3163, 26)

In [24]:
#columns in our dataset
df.columns

Index(['status', 'age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'thyroid_surgery', 'query_hypothyroid',
       'query_hyperthyroid', 'pregnant', 'sick', 'tumor', 'lithium', 'goitre',
       'TSH_measured', 'TSH', 'T3_measured', 'T3', 'TT4_measured', 'TT4',
       'T4U_measured', 'T4U', 'FTI_measured', 'FTI', 'TBG_measured', 'TBG'],
      dtype='object')

In [25]:
#checking for null/ missing values
df.isnull().sum()

status                       0
age                          0
sex                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
thyroid_surgery              0
query_hypothyroid            0
query_hyperthyroid           0
pregnant                     0
sick                         0
tumor                        0
lithium                      0
goitre                       0
TSH_measured                 0
TSH                          0
T3_measured                  0
T3                           0
TT4_measured                 0
TT4                          0
T4U_measured                 0
T4U                          0
FTI_measured                 0
FTI                          0
TBG_measured                 0
TBG                          0
dtype: int64

In [26]:
#data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3163 entries, 0 to 3162
Data columns (total 26 columns):
status                       3163 non-null object
age                          3163 non-null object
sex                          3163 non-null object
on_thyroxine                 3163 non-null object
query_on_thyroxine           3163 non-null object
on_antithyroid_medication    3163 non-null object
thyroid_surgery              3163 non-null object
query_hypothyroid            3163 non-null object
query_hyperthyroid           3163 non-null object
pregnant                     3163 non-null object
sick                         3163 non-null object
tumor                        3163 non-null object
lithium                      3163 non-null object
goitre                       3163 non-null object
TSH_measured                 3163 non-null object
TSH                          3163 non-null object
T3_measured                  3163 non-null object
T3                           3163 non-null 

## Feature Engineering

In [27]:
#replacing '?' with 0. "?" was interpreted to stand for unavailability of the feature being tested
df.replace(to_replace = '?', value = np.nan, inplace = True)
df.sample(8)

Unnamed: 0,status,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,...,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG
109,hypothyroid,78,F,f,f,f,f,t,f,f,...,y,0.9,y,50.0,y,0.84,y,60.0,n,
2333,negative,19,F,f,f,f,f,f,f,f,...,n,,n,,n,,n,,y,65.0
246,negative,27,F,f,f,f,f,f,f,f,...,y,1.8,y,102.0,y,1.09,y,94.0,n,
343,negative,87,F,f,f,f,f,f,f,f,...,n,,y,135.0,y,0.8,y,170.0,n,
106,hypothyroid,50,F,f,f,f,f,f,f,f,...,y,0.2,y,10.0,y,1.13,y,9.0,n,
2752,negative,15,F,f,f,f,f,f,f,f,...,n,,n,,n,,n,,y,23.0
2503,negative,20,F,f,f,f,f,f,t,f,...,n,,n,,n,,n,,y,108.0
1763,negative,87,F,f,f,f,f,f,f,f,...,y,1.1,y,105.0,y,1.02,y,103.0,n,


In [31]:
df.isnull().sum()

status                          0
age                           446
sex                            73
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
thyroid_surgery                 0
query_hypothyroid               0
query_hyperthyroid              0
pregnant                        0
sick                            0
tumor                           0
lithium                         0
goitre                          0
TSH_measured                    0
TSH                           468
T3_measured                     0
T3                            695
TT4_measured                    0
TT4                           249
T4U_measured                    0
T4U                           248
FTI_measured                    0
FTI                           247
TBG_measured                    0
TBG                          2903
dtype: int64

In [32]:
#Replacing missing values

df['age'] = df.age.fillna(df.age.mean())
df[['TSH','T3','TT4','T4U','FTI', 'TBG','TBG' ]] = df[['TSH','T3','TT4','T4U','FTI', 'TBG','TBG' ]].fillna(0)
# imputed that way(above) with an understanding  that Random trees works fine despite missing values
df.sex = df.sex.fillna(method = 'bfill')
df.isnull().sum()

status                       0
age                          0
sex                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
thyroid_surgery              0
query_hypothyroid            0
query_hyperthyroid           0
pregnant                     0
sick                         0
tumor                        0
lithium                      0
goitre                       0
TSH_measured                 0
TSH                          0
T3_measured                  0
T3                           0
TT4_measured                 0
TT4                          0
T4U_measured                 0
T4U                          0
FTI_measured                 0
FTI                          0
TBG_measured                 0
TBG                          0
dtype: int64

In [33]:
#converting numerical data into inergers and float datatypes
df[['age', 'T3','TT4','T4U', 'FTI', 'TBG','TSH']] = df[['age', 'T3','TT4','T4U', 'FTI', 'TBG','TSH']].astype(float)
df['age'] = df['age'].astype(int)


In [34]:
#Getting categorical features
df.dtypes[df.dtypes == 'object'].index

Index(['status', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'thyroid_surgery', 'query_hypothyroid',
       'query_hyperthyroid', 'pregnant', 'sick', 'tumor', 'lithium', 'goitre',
       'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
       'FTI_measured', 'TBG_measured'],
      dtype='object')

In [35]:
#label enconding categorical data into numerical
le = preprocessing.LabelEncoder()


df[['status', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'thyroid_surgery', 'query_hypothyroid',
       'query_hyperthyroid', 'pregnant', 'sick', 'tumor', 'lithium', 'goitre',
       'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
       'FTI_measured', 'TBG_measured']] = df[['status', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'thyroid_surgery', 'query_hypothyroid',
       'query_hyperthyroid', 'pregnant', 'sick', 'tumor', 'lithium', 'goitre',
       'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
       'FTI_measured', 'TBG_measured']].apply(le.fit_transform)



In [36]:
df.head()

Unnamed: 0,status,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,...,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG
0,0,72,1,0,0,0,0,0,0,0,...,1,0.6,1,15.0,1,1.48,1,10.0,0,0.0
1,0,15,0,1,0,0,0,0,0,0,...,1,1.7,1,19.0,1,1.13,1,17.0,0,0.0
2,0,24,1,0,0,0,0,0,0,0,...,1,0.2,1,4.0,1,1.0,1,0.0,0,0.0
3,0,24,0,0,0,0,0,0,0,0,...,1,0.4,1,6.0,1,1.04,1,6.0,0,0.0
4,0,77,1,0,0,0,0,0,0,0,...,1,1.2,1,57.0,1,1.28,1,44.0,0,0.0


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3163 entries, 0 to 3162
Data columns (total 26 columns):
status                       3163 non-null int32
age                          3163 non-null int32
sex                          3163 non-null int32
on_thyroxine                 3163 non-null int32
query_on_thyroxine           3163 non-null int32
on_antithyroid_medication    3163 non-null int32
thyroid_surgery              3163 non-null int32
query_hypothyroid            3163 non-null int32
query_hyperthyroid           3163 non-null int32
pregnant                     3163 non-null int32
sick                         3163 non-null int32
tumor                        3163 non-null int32
lithium                      3163 non-null int32
goitre                       3163 non-null int32
TSH_measured                 3163 non-null int32
TSH                          3163 non-null float64
T3_measured                  3163 non-null int32
T3                           3163 non-null float64
TT4_mea

In [38]:
#dividing into dependent and independent variables
y = df.status.values
test_features = df.columns.to_list()
test_features.remove('status')
x = df[test_features].values


In [49]:
#spliting our dataset into train and slit
x_train,x_test,y_train,y_test = train_test_split(x,y , test_size = 0.2, random_state = 40)
# train_data, test_data = train_test_split(df, test_size=0.2,stratify=df['status'])
# train_data['status'].value_counts()

# test_data['status'].value_counts()

## Modelling using Random Forests Approach

In [50]:
# using grid search tom set our parameters

param_grid = {'min_samples_split' : [3,4,6,10,20,24], 'n_estimators' : [100,200,300,400], 'max_depth': [4,5,7,10,11,12] }
search = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5,n_jobs = 1,scoring='accuracy', refit =True)
search.fit(x_train,y_train)
search.best_params_

{'max_depth': 11, 'min_samples_split': 6, 'n_estimators': 100}

In [51]:
ranfc = RandomForestClassifier(n_estimators =100, max_depth = 11, min_samples_split = 6)
ranfc = ranfc.fit(x_train, y_train)

In [52]:
y_pred = ranfc.predict(x_test)
print( accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

0.9889415481832543
[[ 23   5]
 [  2 603]]


**Observation:** in general, our model had 98.9% accuracy which encouraging considering it is the baseline model.However, in 5 out of 28 were incorrect predictions(approx- 80% accuracy) which is a big concern for this category, while 2 out of 605 wrong predictions is fairly good prediction (approx - 99.7% accuracy).

Visualization of the random forest


In [55]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image 
import pydotplus
from sklearn import tree
# Create DOT df
dot_df = StringIO()
tree = ranfc.estimators_[50]

export_graphviz(tree, out_file=dot_df,  
                filled=True, rounded=True,
                special_characters=True,feature_names = test_features)
# Draw graph
graph = pydotplus.graph_from_dot_df(dot_df.getvalue())  
graph.write_png('hypothyroid.png')
Image(graph.create_png())


AttributeError: module 'pydotplus' has no attribute 'graph_from_dot_df'

## Modelling using Ada Boost Classifier Approach

In [53]:
from sklearn.ensemble import AdaBoostClassifier
# using grid search tom set our parameters
param_grid = {'n_estimators': [70,100,200,300,400]}
search = GridSearchCV(AdaBoostClassifier(), param_grid, refit =True, cv = 5, n_jobs = 1,scoring='accuracy')
search.fit(x_train,y_train)
search.best_params_

        

{'n_estimators': 70}

In [54]:
adb = AdaBoostClassifier( n_estimators = 70)
adb = adb.fit(x_train, y_train)

y_pred = adb.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

0.990521327014218
[[ 24   4]
 [  2 603]]


## Modelling using Gradient Boost Classifier Approach

In [55]:
#importing Gradient boosting Classifier

from sklearn.ensemble import GradientBoostingClassifier
param_grid = {'n_estimators': [70, 100, 150, 200, 250],'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5], 'max_depth':[3, 4, 5, 6, 10]}
search =GridSearchCV(GradientBoostingClassifier(), param_grid, n_jobs =1, cv=5, refit =True,scoring='accuracy')
search.fit(x_train,y_train)
print(search.best_params_)


{'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 150}


In [56]:
xgb = GradientBoostingClassifier(n_estimators = 150, learning_rate = 0.2, max_depth = 6)
xgb = xgb.fit(x_train, y_train)

In [57]:
y_pred = xgb.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test,y_pred))

[[ 25   3]
 [  3 602]]
0.990521327014218


in general the model performed fairly good, with 98% accuracy. In the first category 3 0ut of 28 were wrong predictions(appro-89% accuracy) which is not so good, while in the second category, 3 0ut of 602 were wrong which is fairly good(99.6% accuracy). 

## Modelling using Support Vector Machine
### Approach is to approach the each of the kernel first then, test the best setting in each of the kernel
**Finding the best parameter settings in the linear kernel**

In [58]:
# setting svc parameters  using GridSearchCV

param_grid = { 'kernel': ['linear'],'C': (np.arange(0.1,1,0.1)) , 'gamma': [0.01,0.02,0.03,0.04,0.05]}
search = GridSearchCV(SVC(),param_grid, cv = 5 , n_jobs =1,scoring='accuracy', refit = True)
search.fit(x_train, y_train)
print(search.best_params_)

              

{'C': 0.9, 'gamma': 0.01, 'kernel': 'linear'}


**Finding best parameter setting using Radial Basis Function kernel**

In [62]:
# setting svc parameters  using GridSearchCV
param_grid = {'C':(np.arange(0.1,1,0.1)),'kernel': ['rbf'],'degree': [2,3,4] ,'gamma':[0.01,0.02,0.03,0.04,0.05] }
search = GridSearchCV(SVC(),param_grid, cv = 5 , n_jobs =1,scoring='accuracy', refit = True)
search.fit(x_train, y_train)
print(search.best_params_)

{'C': 0.9, 'degree': 2, 'gamma': 0.01, 'kernel': 'rbf'}


**Finding the best parameter settings using polynomial Kernel**

In [63]:
# setting svc parameters  using GridSearchCV
param_grid = {'kernel': ['poly'],'degree': [2,3,4] ,'gamma':[0.01,0.02,0.03,0.04,0.05], 'C':(np.arange(0.1,0.5,0.1))}
search = GridSearchCV(SVC(),param_grid, cv = 5 , n_jobs =1,scoring='accuracy', refit = True)
search.fit(x_train, y_train)
print(search.best_params_)

{'C': 0.1, 'degree': 2, 'gamma': 0.02, 'kernel': 'poly'}


**Comparing the best of the best parameters from each kernel**

In [64]:
# setting svc parameters  using GridSearchCV
param_grid = {'kernel': ['linear'] ,'gamma':[0.01], 'C':[0.9],'kernel': ['rbf'],'degree': [2] ,'gamma':[0.01], 'C':[0.09],
             'kernel': ['poly'],'degree': [2] ,'gamma':[0.02], 'C':[0.1]}
search = GridSearchCV(SVC(),param_grid, cv = 5 , n_jobs =1,scoring='accuracy', refit = True)
search.fit(x_train, y_train)
print(search.best_params_)

{'C': 0.1, 'degree': 2, 'gamma': 0.02, 'kernel': 'poly'}


**The poly kernel parameters turn out to be the best , so we will apply use it to build our model.**

In [67]:
model_svm = SVC(kernel='poly', C = 0.1, gamma =0.02, degree = 2 )
model_svm = model_svm.fit(x_train, y_train)

In [68]:
#predicting 
y_pred = model_svm.predict(x_test)
print('Accuracy: ',accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy:  0.9747235387045814
[[ 18  10]
 [  6 599]]


**Observations:** The pollynomial Kernel as a poor accuracy of approxiamtely 97%, however we noted that it was able to predict one category of the feature well(appro - 99% accuracy) and performed dismally in predicting(appro - 64% accuracy) the other category.

***suggested improvements:*** the dataset provided is insuffucient  to address our problem.Most of the models  sufferred from imbalanced data.Hence, it was able to predict one category failrly well and performed poorly  in predicting the other. To improve the accuracy we will have to balance our data first before training the models which will need additional data.
