In [182]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

In [183]:
df = pd.read_csv("hypothyroid.csv")

In [184]:
#show first five rows 
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,P
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,P
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,P
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,P
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,P


In [185]:
df.shape

(3772, 30)

In [186]:
df.columns

Index(['age', 'sex', 'on thyroxine', 'query on thyroxine',
       'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery',
       'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'TSH',
       'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
       'FTI measured', 'FTI', 'TBG measured', 'TBG', 'referral source',
       'binaryClass'],
      dtype='object')

In [187]:
df.describe()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
count,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,...,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772
unique,94,3,2,2,2,2,2,2,2,2,...,2,242,2,147,2,235,1,1,5,2
top,59,F,f,f,f,f,f,f,f,f,...,t,?,t,?,t,?,f,?,other,P
freq,95,2480,3308,3722,3729,3625,3719,3719,3713,3538,...,3541,231,3385,387,3387,385,3772,3772,2201,3481


In [188]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        3772 non-null   object
 1   sex                        3772 non-null   object
 2   on thyroxine               3772 non-null   object
 3   query on thyroxine         3772 non-null   object
 4   on antithyroid medication  3772 non-null   object
 5   sick                       3772 non-null   object
 6   pregnant                   3772 non-null   object
 7   thyroid surgery            3772 non-null   object
 8   I131 treatment             3772 non-null   object
 9   query hypothyroid          3772 non-null   object
 10  query hyperthyroid         3772 non-null   object
 11  lithium                    3772 non-null   object
 12  goitre                     3772 non-null   object
 13  tumor                      3772 non-null   object
 14  hypopitu

In [189]:
df.isnull().sum()

age                          0
sex                          0
on thyroxine                 0
query on thyroxine           0
on antithyroid medication    0
sick                         0
pregnant                     0
thyroid surgery              0
I131 treatment               0
query hypothyroid            0
query hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH measured                 0
TSH                          0
T3 measured                  0
T3                           0
TT4 measured                 0
TT4                          0
T4U measured                 0
T4U                          0
FTI measured                 0
FTI                          0
TBG measured                 0
TBG                          0
referral source              0
binaryClass                  0
dtype: int64

In [190]:
df= df.replace("?", np.nan)

In [191]:
df.isna().sum()

age                             1
sex                           150
on thyroxine                    0
query on thyroxine              0
on antithyroid medication       0
sick                            0
pregnant                        0
thyroid surgery                 0
I131 treatment                  0
query hypothyroid               0
query hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                   0
psych                           0
TSH measured                    0
TSH                           369
T3 measured                     0
T3                            769
TT4 measured                    0
TT4                           231
T4U measured                    0
T4U                           387
FTI measured                    0
FTI                           385
TBG measured                    0
TBG                          3772
referral source                 0
binaryClass   

In [192]:
#drop the TBG column since all values are NULL
df.drop(columns="TBG", axis=1, inplace=True)

In [193]:
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,referral source,binaryClass
0,41,F,f,f,f,f,f,f,f,f,...,2.5,t,125,t,1.14,t,109.0,f,SVHC,P
1,23,F,f,f,f,f,f,f,f,f,...,2.0,t,102,f,,f,,f,other,P
2,46,M,f,f,f,f,f,f,f,f,...,,t,109,t,0.91,t,120.0,f,other,P
3,70,F,t,f,f,f,f,f,f,f,...,1.9,t,175,f,,f,,f,other,P
4,70,F,f,f,f,f,f,f,f,f,...,1.2,t,61,t,0.87,t,70.0,f,SVI,P


In [194]:
df.isna().sum()

age                            1
sex                          150
on thyroxine                   0
query on thyroxine             0
on antithyroid medication      0
sick                           0
pregnant                       0
thyroid surgery                0
I131 treatment                 0
query hypothyroid              0
query hyperthyroid             0
lithium                        0
goitre                         0
tumor                          0
hypopituitary                  0
psych                          0
TSH measured                   0
TSH                          369
T3 measured                    0
T3                           769
TT4 measured                   0
TT4                          231
T4U measured                   0
T4U                          387
FTI measured                   0
FTI                          385
TBG measured                   0
referral source                0
binaryClass                    0
dtype: int64

change column data type to numeric and replace Nan with mean

In [195]:
df['age']=  pd.to_numeric(df['age'], errors='coerce')
df['TSH'] = pd.to_numeric(df['TSH'], errors='coerce')
df['T3'] = pd.to_numeric(df['T3'], errors='coerce')
df['TT4'] = pd.to_numeric(df['TT4'], errors='coerce')
df['T4U'] = pd.to_numeric(df['T4U'], errors='coerce')
df['FTI'] = pd.to_numeric(df['FTI'], errors='coerce')

In [196]:
age_mean = df['age'].mean()
tsh_mean = df['TSH'].mean()
t3_mean = df['T3'].mean()
tt4_mean = df['TT4'].mean()
t4u_mean = df['T4U'].mean()
fti_mean = df['FTI'].mean()

In [197]:
df['age'].fillna(value=age_mean, inplace=True)
df['TSH'].fillna(value=tsh_mean, inplace=True)
df['T3'].fillna(value=t3_mean, inplace=True)
df['TT4'].fillna(value=tt4_mean, inplace=True)
df['T4U'].fillna(value=t4u_mean, inplace=True)
df['FTI'].fillna(value=fti_mean, inplace=True)

In [198]:
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,referral source,binaryClass
0,41.0,F,f,f,f,f,f,f,f,f,...,2.5,t,125.0,t,1.14,t,109.0,f,SVHC,P
1,23.0,F,f,f,f,f,f,f,f,f,...,2.0,t,102.0,f,0.995,f,110.469649,f,other,P
2,46.0,M,f,f,f,f,f,f,f,f,...,2.0135,t,109.0,t,0.91,t,120.0,f,other,P
3,70.0,F,t,f,f,f,f,f,f,f,...,1.9,t,175.0,f,0.995,f,110.469649,f,other,P
4,70.0,F,f,f,f,f,f,f,f,f,...,1.2,t,61.0,t,0.87,t,70.0,f,SVI,P


replacing categorical NULL values

In [199]:
# filling with most common class
smode = df.sex.mode()
smode[0]
df['sex'].fillna(value= smode[0], inplace=True)


In [200]:
#check to see if changes occured
df.iloc[16]

age                           42.0
sex                              F
on thyroxine                     f
query on thyroxine               f
on antithyroid medication        f
sick                             f
pregnant                         f
thyroid surgery                  f
I131 treatment                   f
query hypothyroid                f
query hyperthyroid               f
lithium                          f
goitre                           f
tumor                            f
hypopituitary                    f
psych                            f
TSH measured                     t
TSH                            1.2
T3 measured                      t
T3                             1.8
TT4 measured                     t
TT4                           70.0
T4U measured                     t
T4U                           0.86
FTI measured                     t
FTI                           81.0
TBG measured                     f
referral source              other
binaryClass         

In [201]:
df['binaryClass'].value_counts()

P    3481
N     291
Name: binaryClass, dtype: int64

We can see from the cell above that the dataset contains more positive records than negative

In [202]:
df.groupby('binaryClass').mean()

  df.groupby('binaryClass').mean()


Unnamed: 0_level_0,age,TSH,T3,TT4,T4U,FTI
binaryClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
N,51.515464,39.234072,1.559866,73.5237,1.013144,76.444815
P,51.754305,2.232165,2.051422,111.228145,0.993483,113.314011


Divide into features and target dataframes

In [203]:
X = df.drop(columns='binaryClass', axis=1)
Y = df['binaryClass']

In [204]:
X.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,referral source
0,41.0,F,f,f,f,f,f,f,f,f,...,t,2.5,t,125.0,t,1.14,t,109.0,f,SVHC
1,23.0,F,f,f,f,f,f,f,f,f,...,t,2.0,t,102.0,f,0.995,f,110.469649,f,other
2,46.0,M,f,f,f,f,f,f,f,f,...,f,2.0135,t,109.0,t,0.91,t,120.0,f,other
3,70.0,F,t,f,f,f,f,f,f,f,...,t,1.9,t,175.0,f,0.995,f,110.469649,f,other
4,70.0,F,f,f,f,f,f,f,f,f,...,t,1.2,t,61.0,t,0.87,t,70.0,f,SVI


In [205]:
Y.head()

0    P
1    P
2    P
3    P
4    P
Name: binaryClass, dtype: object

encoding the target column

In [206]:
# converting target to binary data
df_one = pd.get_dummies(Y)
print(df_one)

      N  P
0     0  1
1     0  1
2     0  1
3     0  1
4     0  1
...  .. ..
3767  0  1
3768  0  1
3769  0  1
3770  0  1
3771  0  1

[3772 rows x 2 columns]


In [207]:
#concatenate binary data 
df_two = pd.concat((df_one, Y), axis=1)
df_two.head()

Unnamed: 0,N,P,binaryClass
0,0,1,P
1,0,1,P
2,0,1,P
3,0,1,P
4,0,1,P


In [208]:
# class is dropped
df_two = df_two.drop(["binaryClass"], axis=1)
 
# We want P=1 and N =0 So we drop N column here
df_two = df_two.drop(["N"], axis=1)
 
# Rename the Column
Y = df_two.rename(columns={"P": "binaryClass"})
 
# Print the Result
print(Y)

      binaryClass
0               1
1               1
2               1
3               1
4               1
...           ...
3767            1
3768            1
3769            1
3770            1
3771            1

[3772 rows x 1 columns]


Standardize Features DataFrame

In [209]:
# Categorical boolean mask
categorical_feature_mask = X.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_cols = X.columns[categorical_feature_mask].tolist()
categorical_cols

['sex',
 'on thyroxine',
 'query on thyroxine',
 'on antithyroid medication',
 'sick',
 'pregnant',
 'thyroid surgery',
 'I131 treatment',
 'query hypothyroid',
 'query hyperthyroid',
 'lithium',
 'goitre',
 'tumor',
 'hypopituitary',
 'psych',
 'TSH measured',
 'T3 measured',
 'TT4 measured',
 'T4U measured',
 'FTI measured',
 'TBG measured',
 'referral source']

In [210]:
# import labelencoder to use on binary columns
from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object
le = LabelEncoder()

In [211]:
# apply le on categorical feature columns
binary_categorical_columns= categorical_cols[:-1]
X[binary_categorical_columns] = X[binary_categorical_columns].apply(lambda col: le.fit_transform(col))
X[binary_categorical_columns].head(10)

Unnamed: 0,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,...,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,TBG measured
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,1,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,0


In [212]:
#see how many different labels in referral source column
counts = X['referral source'].value_counts().sum()
counts

3772

In [213]:
top_10_labels = [y for y in X['referral source'].value_counts().sort_values(ascending=False).head(10).index]
top_10_labels

['other', 'SVI', 'SVHC', 'STMW', 'SVHD']

There are 5 unique labels

In [214]:
# import OneHotEncoder to process multicategorical column
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.compose import ColumnTransformer
### integer mapping using LabelEncoder
#label_encoder = LabelEncoder()
#integer_encoded = label_encoder.fit_transform(X['referral source'])
#print(integer_encoded)
#integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)

### One hot encoding
#onehot_encoder = OneHotEncoder(sparse=False)
#X['referral source'] = onehot_encoder.fit_transform(integer_encoded)

#print(X['referral source'])

In [216]:
X['referral source'] = X['referral source'].map({'other':0, 'SVI':1, 'SVHC':2, 'STMW':3, 'SVHD':4})

In [219]:
scaler = StandardScaler()
scaler.fit(X)
standard_data = scaler.transform(X)
standard_data

array([[-0.53466511, -0.65895412, -0.37452106, ..., -0.04687735,
         0.        ,  1.61134808],
       [-1.43109584, -0.65895412, -0.37452106, ...,  0.        ,
         0.        , -0.70584786],
       [-0.28565657,  1.51755634, -0.37452106, ...,  0.30398943,
         0.        , -0.70584786],
       ...,
       [ 1.10879123, -0.65895412, -0.37452106, ..., -0.17446527,
         0.        , -0.70584786],
       [ 1.00918782,  1.51755634, -0.37452106, ..., -0.74861091,
         0.        ,  0.45275011],
       [ 0.61077416, -0.65895412, -0.37452106, ..., -0.58912601,
         0.        , -0.70584786]])

In [220]:
X = standard_data
print(X)

[[-0.53466511 -0.65895412 -0.37452106 ... -0.04687735  0.
   1.61134808]
 [-1.43109584 -0.65895412 -0.37452106 ...  0.          0.
  -0.70584786]
 [-0.28565657  1.51755634 -0.37452106 ...  0.30398943  0.
  -0.70584786]
 ...
 [ 1.10879123 -0.65895412 -0.37452106 ... -0.17446527  0.
  -0.70584786]
 [ 1.00918782  1.51755634 -0.37452106 ... -0.74861091  0.
   0.45275011]
 [ 0.61077416 -0.65895412 -0.37452106 ... -0.58912601  0.
  -0.70584786]]


**split the dataframe into training and testing for training the model**

In [221]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [222]:
print(X.shape, X_train.shape, X_test.shape)

(3772, 28) (3017, 28) (755, 28)


**Train the model**

In [223]:
classifier = svm.SVC(kernel= 'linear')

In [224]:
#training SVM classifier
classifier.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


**Evaluate the model**

In [226]:
#accuracy score on the training data
from sklearn.metrics import accuracy_score
X_train_prediction = classifier.predict(X_train)
training_accuracy = accuracy_score(X_train_prediction, Y_train)
training_accuracy

0.974477958236659

In [227]:
#accuracy score on the testing data
X_test_prediction = classifier.predict(X_test)
testing_accuracy = accuracy_score(X_test_prediction, Y_test)
testing_accuracy

0.976158940397351