In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
%matplotlib inline

# Helper Functions

In [None]:
def parse_row(row):
    """Return the part of ``row`` that precedes its first ``"."``.

    When ``row`` contains no ``"."`` the whole string is returned unchanged.
    """
    prefix, _, _ = row.partition(".")
    return prefix

def to_hyperthyroid(row):
    """Map every label other than ``"negative"`` to ``"hyperthyroid"``.

    ``"negative"`` itself is returned unchanged; the comparison is
    case-sensitive.
    """
    return "hyperthyroid" if row != "negative" else row

def to_hypothyroid(row):
    """Map every label other than ``"negative"`` to ``"hypothyroid"``.

    ``"negative"`` itself is returned unchanged; the comparison is
    case-sensitive.
    """
    return "hypothyroid" if row != "negative" else row

def convert_category(dataframe, column):
    """Recode one flag column of ``dataframe`` to 0/1, in place.

    ``'f'`` and ``'F'`` become 0; ``'t'`` and ``'M'`` become 1 (the capital
    letters cover the sex column). Any other value is left as it is.
    Nothing is returned.
    """
    # Both masks are taken from the untouched column before any write.
    zero_mask = dataframe[column].isin(['f', 'F'])
    one_mask = dataframe[column].isin(['t', 'M'])

    dataframe.loc[zero_mask, column] = 0
    dataframe.loc[one_mask, column] = 1

# Data Retrieval  

In [None]:
# Column names, in file order, handed to read_csv via `names=` below.
columns = [
    "Age",
    "Sex",
    "On Thyroxine",
    "Query on Thyroxine",
    "On Antithyroid Medication",
    "Sick",
    "Pregnant",
    "Thyroid Surgery",
    "I131 Treatment",
    "Query Hypothyroid",
    "Query Hyperthyroid",
    "Lithium",
    "Goitre",
    "Tumor",
    "Hypopituitary",
    "Psych",
    "TSH Measured",
    "TSH",
    "T3 Measured",
    "T3",
    "TT4 Measured",
    "TT4",
    "T4U Measured",
    "T4U",
    "FTI Measured",
    "FTI",
    "TBG Measured",
    "TBG",
    "Referral Source",
    "Category",
]

In [None]:
# Read the three source files in the same order as before, applying the
# shared `columns` list as their header.
hyper_data, hypo_data, sick_data = (
    pd.read_csv(csv_path, names=columns)
    for csv_path in (
        "../Datasets/allhyper.csv",
        "../Datasets/allhypo.csv",
        "../Datasets/sick.csv",
    )
)

In [None]:
# Normalise the Category labels of each frame: parse_row keeps the text
# before the first '.', then the hyper/hypo frames collapse every label
# other than "negative" into a single class name. The sick frame keeps its
# parsed labels as they are.
hyper_data['Category'] = (
    hyper_data['Category']
    .apply(parse_row)
    .apply(to_hyperthyroid)
)

hypo_data['Category'] = (
    hypo_data['Category']
    .apply(parse_row)
    .apply(to_hypothyroid)
)

sick_data['Category'] = (
    sick_data['Category']
    .apply(parse_row)
)

In [None]:
# Combined Thyroid Data
#
# Stack the three frames, remove exact duplicate rows and discard three
# columns. The index is not reset here, so each file's original row labels
# repeat in the result.
# NOTE(review): drop_duplicates compares every column including Category,
# so identical feature rows that carry different labels are all kept.
thyroid_frames = [hyper_data, hypo_data, sick_data]
thyroid_data = (
    pd.concat(thyroid_frames)
    .drop_duplicates()
    .drop(columns=['Referral Source', 'TBG', 'TBG Measured'])
)

In [None]:
# Preview the first rows of the combined frame.
thyroid_data.head()

# Data Cleaning

In [None]:
# A quick fix needed: rows whose Age is the text '455' are rewritten to '45'
# (presumably a data-entry typo — confirm against the source data).
# Age is compared as a string here; the numeric conversion happens in a
# later cell.
thyroid_data.loc[thyroid_data['Age'] == '455', 'Age'] = '45'

In [None]:
# Binarize Category Columns
#
# Each column named here is recoded in place by convert_category.
binary_cols = [
    'On Thyroxine',
    'Query on Thyroxine',
    'Sex',
    'On Antithyroid Medication',
    'Sick',
    'Pregnant',
    'Thyroid Surgery',
    'I131 Treatment',
    'Query Hypothyroid',
    'Query Hyperthyroid',
    'Lithium',
    'Goitre',
    'Tumor',
    'Hypopituitary',
    'Psych',
    'TSH Measured',
    'T3 Measured',
    'TT4 Measured',
    'T4U Measured',
    'FTI Measured',
]

for col in binary_cols:
    convert_category(thyroid_data, col)

In [None]:
# Convert '?' to np.nan and convert numeric data to numeric dtype
for col in thyroid_data.columns:
    if col == 'Category':
        continue  # the class labels stay as text
    thyroid_data.loc[thyroid_data[col].eq('?'), col] = np.nan
    thyroid_data[col] = pd.to_numeric(thyroid_data[col])

In [None]:
# Median-impute every feature column; the 'Category' labels are left as is.
#
# The feature names are taken with Index.drop so they keep the frame's own
# column order. Index.difference returns its result sorted, which would put
# alphabetically ordered names on the imputer's positionally ordered output
# and silently mislabel the features.
curr_columns = thyroid_data.columns.drop('Category')

imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputed_data = imputer.fit_transform(thyroid_data.drop('Category', axis=1))
imputed_data = pd.DataFrame(imputed_data, columns=curr_columns)

# imputed_data is built from the imputer's output and therefore has a fresh
# 0..n-1 index; the labels are given the same index so the two pieces line
# up row by row when concatenated side by side.
thyroid_data = pd.concat(
    [imputed_data, thyroid_data['Category'].reset_index(drop=True)],
    axis=1,
)

In [None]:
# Preview the first rows after imputation.
thyroid_data.head()

In [None]:
# Histogram of the Age column.
thyroid_data['Age'].hist()

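# Load Data
----

The modelling cells below read a pre-built CSV export (`thyroid_balanced.csv` or `thyroid_unbalanced.csv`) instead of reusing the `thyroid_data` frame cleaned above. Those files are not written by any cell shown in this notebook, so they must already exist in the working directory.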

In [None]:
def get_data(balanced=False):
    """Read one of the two thyroid CSV exports from the working directory.

    Parameters
    ----------
    balanced : bool, default False
        When truthy, read "thyroid_balanced.csv"; otherwise read
        "thyroid_unbalanced.csv".

    Returns
    -------
    pandas.DataFrame
        The file contents, read with ``index_col=False``.
    """
    csv_name = "thyroid_balanced.csv" if balanced else "thyroid_unbalanced.csv"
    return pd.read_csv(csv_name, index_col=False)

# Prepare Data For Model
------

In [None]:
# Load the balanced CSV export. This rebinds thyroid_data, so the frame
# produced by the cleaning cells above is no longer reachable by this name.
thyroid_data = get_data(balanced=True)

In [None]:
# Preview the first rows of the loaded CSV.
thyroid_data.head()

In [None]:
# Distinct class labels found in the Category column.
# NOTE(review): `targets` is not referenced again in the cells shown here.
targets = thyroid_data['Category'].unique()

In [None]:
# Separate the class labels (y) from the remaining feature columns (X).
y = thyroid_data["Category"]
X = thyroid_data.drop(columns="Category")

In [None]:
# Hold out 33% of the rows for evaluation.
# random_state pins the shuffle so a restart-and-run-all reproduces the same
# split; stratify=y keeps the class proportions of y the same in the train
# and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [None]:
# kf = StratifiedKFold(n_splits=3)

# for train_index, test_index in kf.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Multiclass Classification with Support Vector Machines
-------

In [None]:
# Linear support vector classifier with default hyper-parameters.
# random_state fixes the estimator's internal shuffling so repeated runs
# on the same split fit the same model.
svm_model = LinearSVC(random_state=42)

In [None]:
# Fit the classifier on the training split.
svm_model.fit(X_train,y_train)

In [None]:
# Predict a class label for every row of the held-out split.
y_pred = svm_model.predict(X_test)

In [None]:
# Report overall accuracy and the confusion matrix on the held-out split.
# No `labels=` argument is passed to confusion_matrix, so the row/column
# order is whatever scikit-learn derives from the labels it sees.
print("Accuracy:\n", metrics.accuracy_score(y_test,y_pred))
print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, y_pred))

In [None]:
# Class labels present in the test split and the number of rows for each.
unique, counts = np.unique(y_test, return_counts=True)

In [None]:
# Display the labels alongside their test-split counts.
unique, counts

In [None]:
# `score` is precision_recall_fscore_support; it is called without an
# `average` argument and its four results are printed one per line.
precision, recall, fscore, support = score(y_test, y_pred)

print(
    'precision: {}'.format(precision),
    'recall: {}'.format(recall),
    'fscore: {}'.format(fscore),
    'support: {}'.format(support),
    sep='\n',
)