### Importing Dependencies

In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Loading Data

In [97]:
data = pd.read_csv('/content/Thyroid_Diff.csv')

### Data Analysis

In [98]:
data.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [99]:
data.isnull().sum()

Age                     0
Gender                  0
Smoking                 0
Hx Smoking              0
Hx Radiothreapy         0
Thyroid Function        0
Physical Examination    0
Adenopathy              0
Pathology               0
Focality                0
Risk                    0
T                       0
N                       0
M                       0
Stage                   0
Response                0
Recurred                0
dtype: int64

In [100]:
data.shape

(383, 17)

### Data Preprocessing

In [101]:
data.columns

Index(['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
      dtype='object')

In [102]:
data["Gender"].unique()

array(['F', 'M'], dtype=object)

In [103]:
data["Gender"] = (data["Gender"] == "M").astype(int)

In [104]:
data["Smoking"] = (data["Smoking"] == "Yes").astype(int)
data["Hx Smoking"] = (data["Hx Smoking"] == "Yes").astype(int)
data["Hx Radiothreapy"] = (data["Hx Radiothreapy"] == "Yes").astype(int)
data["Adenopathy"] = (data["Adenopathy"] == "Yes").astype(int)
data["Recurred"] = (data["Recurred"] == "Yes").astype(int)

In [105]:
data

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,0,0,0,0,Euthyroid,Single nodular goiter-left,0,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,0
1,34,0,0,1,0,Euthyroid,Multinodular goiter,0,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,0
2,30,0,0,0,0,Euthyroid,Single nodular goiter-right,0,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,0
3,62,0,0,0,0,Euthyroid,Single nodular goiter-right,0,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,0
4,62,0,0,0,0,Euthyroid,Multinodular goiter,0,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,72,1,1,1,1,Euthyroid,Single nodular goiter-right,0,Papillary,Uni-Focal,High,T4b,N1b,M1,IVB,Biochemical Incomplete,1
379,81,1,1,0,1,Euthyroid,Multinodular goiter,0,Papillary,Multi-Focal,High,T4b,N1b,M1,IVB,Structural Incomplete,1
380,72,1,1,1,0,Euthyroid,Multinodular goiter,0,Papillary,Multi-Focal,High,T4b,N1b,M1,IVB,Structural Incomplete,1
381,61,1,1,1,1,Clinical Hyperthyroidism,Multinodular goiter,0,Hurthel cell,Multi-Focal,High,T4b,N1b,M0,IVA,Structural Incomplete,1


In [106]:
data["Thyroid Function"].unique()

array(['Euthyroid', 'Clinical Hyperthyroidism', 'Clinical Hypothyroidism',
       'Subclinical Hyperthyroidism', 'Subclinical Hypothyroidism'],
      dtype=object)

In [107]:
from sklearn.preprocessing import LabelEncoder

In [108]:
le = LabelEncoder()

In [109]:
data["Thyroid Function"] = le.fit_transform(data["Thyroid Function"])

In [110]:
data["Physical Examination"].unique()

array(['Single nodular goiter-left', 'Multinodular goiter',
       'Single nodular goiter-right', 'Normal', 'Diffuse goiter'],
      dtype=object)

In [111]:
data["Physical Examination"] = le.fit_transform(data["Physical Examination"])

In [112]:
data["Pathology"].unique()

array(['Micropapillary', 'Papillary', 'Follicular', 'Hurthel cell'],
      dtype=object)

In [113]:
data["Pathology"] = le.fit_transform(data["Pathology"])

In [114]:
data["Focality"] = le.fit_transform(data["Focality"])

In [115]:
cols = ["Risk", "T", "N", "M", "Stage",	"Response"]
for col in cols:
  data[col] = le.fit_transform(data[col])

In [116]:
data

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,0,0,0,0,2,3,0,2,1,2,0,0,0,0,2,0
1,34,0,0,1,0,2,1,0,2,1,2,0,0,0,0,1,0
2,30,0,0,0,0,2,4,0,2,1,2,0,0,0,0,1,0
3,62,0,0,0,0,2,4,0,2,1,2,0,0,0,0,1,0
4,62,0,0,0,0,2,1,0,2,0,2,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,72,1,1,1,1,2,4,0,3,1,0,6,2,1,4,0,1
379,81,1,1,0,1,2,1,0,3,0,0,6,2,1,4,3,1
380,72,1,1,1,0,2,1,0,3,0,0,6,2,1,4,3,1
381,61,1,1,1,1,0,1,0,1,0,0,6,2,0,3,3,1


In [117]:
data["Pathology"].unique()

array([2, 3, 0, 1])

In [118]:
X = data.drop(columns = ["Recurred"])
y = data["Recurred"]

In [119]:
from sklearn.preprocessing import StandardScaler

In [120]:
ss = StandardScaler()

In [121]:
X = ss.fit_transform(X)
X

array([[-0.91743929, -0.47703679, -0.38302296, ..., -0.22206996,
        -0.31442613,  0.46441951],
       [-0.45431469, -0.47703679, -0.38302296, ..., -0.22206996,
        -0.31442613, -0.62682387],
       [-0.71895732, -0.47703679, -0.38302296, ..., -0.22206996,
        -0.31442613, -0.62682387],
       ...,
       [ 2.05979025,  2.09627436,  2.61080955, ...,  4.50308536,
         4.86515272,  1.55566289],
       [ 1.33202303,  2.09627436,  2.61080955, ..., -0.22206996,
         3.57025801,  1.55566289],
       [ 1.72898697,  2.09627436,  2.61080955, ..., -0.22206996,
         3.57025801,  1.55566289]])

### Data Splitting

In [122]:
from sklearn.model_selection import train_test_split

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

In [124]:
from sklearn.metrics import accuracy_score

In [125]:
results = {}

### Logistic Regression

In [126]:
from sklearn.linear_model import LogisticRegression

In [127]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [128]:
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)


In [129]:
results["lr"] = {
    "y_test": y_test,
    "y_pred": y_pred,
    "acc": accuracy
}

### KNN

In [130]:
from sklearn.neighbors import KNeighborsClassifier

In [131]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [132]:
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [133]:
results["knn"] = {
    "y_test": y_test,
    "y_pred": y_pred,
    "acc": accuracy
}

### SVM

In [134]:
from sklearn.svm import SVC

In [135]:
svm = SVC()
svm.fit(X_train, y_train)

In [136]:
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [137]:
results["svm"] = {
    "y_test": y_test,
    "y_pred": y_pred,
    "acc": accuracy
}

### Decision Tree

In [138]:
from sklearn.tree import DecisionTreeClassifier

In [139]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [140]:
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [141]:
results["dt"] = {
    "y_test": y_test,
    "y_pred": y_pred,
    "acc": accuracy
}

### Random Forest

In [142]:
from sklearn.ensemble import RandomForestClassifier

In [143]:
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train, y_train)

In [144]:
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [145]:
results["rf"] = {
    "y_test": y_test,
    "y_pred": y_pred,
    "acc": accuracy
}

### Gradient Boosting

In [146]:
from sklearn.ensemble import GradientBoostingClassifier

In [147]:
gb = GradientBoostingClassifier(n_estimators = 100)
gb.fit(X_train, y_train)

In [148]:
y_pred = gb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [149]:
results["gb"] = {
    "y_test": y_test,
    "y_pred": y_pred,
    "acc": accuracy
}

### Saving the Data

In [150]:
import pickle

In [151]:
model_names = ["lr", "knn", "svm", "dt", "rf", "gb"]
models = [lr, knn, svm, dt, rf, gb]
for name, model in zip(model_names, models):
  with open(f"{name}.pkl", "wb") as file:
    pickle.dump(model, file)
  file.close()

In [152]:
with open("results.pkl", "wb") as file:
  pickle.dump(results, file)
file.close()