# Engenharia do Conhecimento 2023/2024

## Project: *Thyroid disease Data Set*

#### Group 6:

- Eduardo Proença 57551
- Tiago Oliveira 54979
- Bernardo Lopes 54386

### Summary

1. Data Processing
    1. Creating a Data Frame
    2. Data investigation
    3. Encoding Data
    4. Splitting into training and testing set
    5. Imputing missing values
    6. Scaling Data
    
2. Classification Models
    1. Feature Selection
    2. Model evaluation
    3. Decision Tree
    4. Logistic Regression
    5. Naive Bayes
    6. KNN
    7. SVM
3. Hyperparameter tuning

## 1. Data Processing

### 1.1 Creating a Data Frame

First, we create a DataFrame. Using the [pandas](https://pandas.pydata.org) Python library, we read our data from the file `proj-data.csv`, which contains the data set used throughout this project.

In [None]:
import pandas as pd

# Read the raw project data from disk, then display the frame's dimensions
df_thyroid = pd.read_csv(filepath_or_buffer='proj-data.csv')
df_thyroid.shape

In [None]:
# Preview the first five rows of the raw data
df_thyroid.head(n=5)

### 1.2 Data investigation

In [None]:
# List how often each distinct value occurs, one column at a time
for col in df_thyroid:
    counts = df_thyroid[col].value_counts()
    # A single print call emits the same text as the label followed by the counts
    print("Values of ", counts, sep='', end="\n\n")

In [None]:
# Work on a copy of the data without the record-identifier column
df = df_thyroid.drop(columns="[record identification]")

In [None]:
import numpy as np

# Turn every '?' cell into NaN so pandas counts it as missing,
# then show the number of missing cells per column
df.replace(to_replace='?', value=np.nan, inplace=True)
df.isnull().sum()

In [None]:
# Remove the "TBG:" column entirely, then discard rows that have no "T3:" value
df_cleaned = (
    df
    .drop(columns="TBG:")
    .dropna(subset=["T3:"])
)
df_cleaned.info()

### 1.3 Encoding Data

In [None]:
import numpy as np

# Single-letter codes and the digit each one is rewritten to.
# NOTE: the replacements are the strings '0'/'1', not integers.
encoded_values = dict(M='0', F='1', f='0', t='1')

# Name of the label column; it is split off so the replacement below
# only touches feature columns
target = "diagnoses"
df_target = pd.DataFrame(df_cleaned[target], columns=[target])

# Recode the letter-valued features, then one-hot encode the referral source
encoded = df_cleaned.drop(columns=target).replace(to_replace=encoded_values)
df_encoded = pd.get_dummies(data=encoded, columns=["referral source:"], dtype='int')

In [None]:
# Collapse the diagnosis labels into nine numeric classes with a single
# vectorised lookup (replaces a row-by-row if/elif chain that wrote each
# cell individually and converted the column to numbers afterwards).
# Key = class id, value = the single-character labels folded into that class.
letters_by_class = {
    0: '-',
    1: 'ABCD',
    2: 'EFGH',
    3: 'IJ',
    4: 'K',
    5: 'LMN',
    6: 'OPQ',
    7: 'RST',
}
class_by_letter = {
    letter: class_id
    for class_id, letters in letters_by_class.items()
    for letter in letters
}

# Any label that is not exactly one of the characters above (including a
# missing label) is absent from the lookup, comes back as NaN from map(),
# and is assigned to the catch-all class 8.
OTHER_CLASS = 8
df_target[target] = (
    df_target[target]
    .map(class_by_letter)
    .fillna(OTHER_CLASS)
    .astype('int64')
)
df_target[target].unique()

In [None]:
# Put the encoded features and the numeric target side by side in one frame
df = pd.concat(objs=[df_encoded, df_target], axis='columns')
df.head(n=5)

### 1.4 Splitting into training and testing set

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors
y = df["diagnoses"]
X = df.drop(columns="diagnoses")

# Hold out 20% of the rows (the *_IVS variables); the seed keeps the split repeatable
X_TRAIN, X_IVS, y_TRAIN, y_IVS = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Report the size of each partition
print("Training set shape:", X_TRAIN.shape, y_TRAIN.shape)
print("Testing set shape:", X_IVS.shape, y_IVS.shape)

### 1.5 Imputing missing values

In [None]:
# Total number of missing cells in each split before imputation
train_nan = X_TRAIN.isna().sum().sum()
ivs_nan = X_IVS.isna().sum().sum()

print(f"Missing values in training set: {train_nan}")
print(f"Missing values in testing set: {ivs_nan}")

In [None]:
from sklearn.impute import KNNImputer

# Impute each missing value from its 3 nearest neighbours
# (n_neighbors is the knob to adjust if a different k is wanted)
imputer = KNNImputer(n_neighbors=3)

# Fit on the training split only, then reuse the fitted imputer on the
# held-out split so that split does not influence the imputation
X_train_imp = imputer.fit_transform(X_TRAIN)
X_ivs_imp = imputer.transform(X_IVS)

# Sanity check: count the NaNs that remain, directly on the imputed output
train_nan = np.count_nonzero(np.isnan(X_train_imp))
ivs_nan = np.count_nonzero(np.isnan(X_ivs_imp))

print(f"Missing values in training set: {train_nan}")
print(f"Missing values in testing set: {ivs_nan}")

### 1.6 Scaling Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the imputed training split only, then apply that same
# fitted scaler to both splits
scaler = StandardScaler().fit(X_train_imp)

X_train_scl = scaler.transform(X_train_imp)
X_ivs_scl = scaler.transform(X_ivs_imp)

# Show the first scaled training rows under their original column names
pd.DataFrame(data=X_train_scl, columns=X_TRAIN.columns).head(n=5)

## 2. Classification Models

In [None]:
# From here on the split names refer to the scaled feature arrays and to
# NumPy versions of the labels.
# NOTE: y_TRAIN / y_IVS are overwritten, so this cell cannot be re-run on its own.
X_TRAIN, X_IVS = X_train_scl, X_ivs_scl
y_TRAIN, y_IVS = y_TRAIN.to_numpy(), y_IVS.to_numpy()

### 2.1 Feature Selection

In [None]:
# TODO feature selection needs tuning
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

# 40% of the available columns. Currently NOT passed to the selector, which
# uses a fixed budget of 5 features below.
n_features = int(X_TRAIN.shape[1] * .4)

# Forward sequential selection: starting from no features, repeatedly add
# the one that improves the cross-validated score the most.
# NOTE(review): LinearRegression is a regressor, so the class ids in y_TRAIN
# are fitted as if they were numeric quantities; a classifier may be a more
# appropriate selector here - confirm before relying on the chosen columns.
sfs = SequentialFeatureSelector(LinearRegression(),
                                n_features_to_select=5,
                                direction='forward',
                                n_jobs=-1)
sfs.fit(X_TRAIN, y_TRAIN)

N, M = X_TRAIN.shape
features = sfs.get_support()                  # boolean mask over the columns
features_selected = np.flatnonzero(features)  # positions of the kept columns
print("The features selected are columns: ", features_selected)

# Reduce BOTH splits with the fitted selector. Reducing only the training
# split would leave the held-out split with the full column set, so no model
# trained after this point could be applied to it.
X_TRAIN = sfs.transform(X_TRAIN)
X_IVS = sfs.transform(X_IVS)

### 2.2 Model evaluation

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

def evaluate(model, X=None, y=None, n_splits=5, random_state=123):
    """Collect out-of-fold predictions for *model* with shuffled K-fold CV.

    The same estimator object is re-fitted on every fold, so after the call
    it is left fitted on the training part of the last fold.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X, y : arrays indexable by integer index arrays, optional.
        Default to the notebook-level ``X_TRAIN`` / ``y_TRAIN``.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 123
        Seed for the fold shuffling, so repeated calls use identical folds.

    Returns
    -------
    (truth, preds) : tuple of arrays
        True labels and predictions of every held-out fold, stacked in fold
        order (which is not the original row order).
    """
    X = X_TRAIN if X is None else X
    y = y_TRAIN if y is None else y

    truth_parts, pred_parts = [], []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_index, test_index in kf.split(X):
        model.fit(X[train_index], y[train_index])
        pred_parts.append(model.predict(X[test_index]))
        truth_parts.append(y[test_index])

    # Stack once at the end instead of growing the arrays fold by fold
    return np.hstack(truth_parts), np.hstack(pred_parts)
            
def print_statistics(truth, preds):
    """Print accuracy, weighted precision/recall/F1 and MCC for the predictions.

    Parameters
    ----------
    truth : array-like of true labels.
    preds : array-like of predicted labels, aligned with *truth*.

    Precision, recall and F1 are support-weighted averages over the classes.
    """
    # Use the same zero_division for all three metrics: a class whose score is
    # undefined (for example one that is never predicted) contributes 0, so
    # the reported precision is consistent with the one inside F1.
    weighted = dict(average='weighted', zero_division=0)

    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, **weighted))
    print("The Recall is: %7.4f" % recall_score(truth, preds, **weighted))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, **weighted))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))

### 2.3 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the reported scores are repeatable between runs; the value
# matches the seed already used for the train/test split and the CV folds.
TRUTH, PREDS = evaluate(DecisionTreeClassifier(random_state=123))
print_statistics(TRUTH, PREDS)

### 2.4 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Cross-validate a logistic-regression classifier with its default settings
log_reg = LogisticRegression()
TRUTH, PREDS = evaluate(log_reg)
print_statistics(truth=TRUTH, preds=PREDS)

### 2.5 Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Cross-validate a Gaussian naive Bayes classifier with its default settings
naive_bayes = GaussianNB()
TRUTH, PREDS = evaluate(naive_bayes)
print_statistics(truth=TRUTH, preds=PREDS)

### 2.6 KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Cross-validate a k-nearest-neighbours classifier with its default settings
knn_default = KNeighborsClassifier()
TRUTH, PREDS = evaluate(knn_default)
print_statistics(truth=TRUTH, preds=PREDS)

### 2.7 SVM

In [None]:
# from sklearn.svm import SVC
# 
# TRUTH, PREDS = evaluate(SVC(kernel = "rbf", C = 1, gamma = 0.1))
# print_statistics(TRUTH, PREDS)

## 3. Hyperparameter tuning

In [None]:
# Scorer names for model selection. The target has more than two classes, so
# the support-weighted variants are required: the plain "precision", "recall"
# and "f1" scorers are defined for binary targets only. This also matches the
# weighted averaging used by print_statistics.
scoring = ["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"]

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each decision-tree setting being tuned
depths = [3, 5, 10, 15]
m_sampl_split = [2, 5, 9]
prune_a = [0.0, 0.0001, 0.001, 0.01]

param_grid = dict(
    max_depth=depths,
    min_samples_split=m_sampl_split,
    ccp_alpha=prune_a,
)

# Try every combination and rank by accuracy; the fold count is left at
# GridSearchCV's default and all CPU cores are used
tree = DecisionTreeClassifier(criterion='log_loss', random_state=23)
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_TRAIN, y_TRAIN)

print("Best hyperparameters: ", grid_search.best_params_)

In [None]:
# from sklearn.model_selection import GridSearchCV
# 
# param_grid = {
#     "n_neighbors": [3, 4, 5, 6, 7, 9],
#     "weights": ["uniform", "distance"]
# }
# 
# knn = KNeighborsClassifier()
# grid_search = GridSearchCV(knn, param_grid = param_grid, cv = 5, scoring = "f1")
# grid_search.fit(X_TRAIN, y_TRAIN)
# 
# print("Best hyperparameters: ", grid_search.best_params_)