In [None]:
import csv
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import plot_model
from IPython.display import Image

In [None]:
# Merge the raw thyroid data files into a single CSV with a header row.
input_files = ['allhypo.data', 'allhypo.test']
output_file = 'hypothyroid.csv'

headers = [
    'age', 'sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick',
    'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid',
    'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'TSH', 'T3 measured',
    'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U', 'FTI measured', 'FTI', 'TBG measured',
    'TBG', 'referral source', 'binaryClass'
]


def _parse_record(line):
    """Split one comma-separated line into fields.

    When the last field contains a '|', everything from the '|' onwards is
    discarded and a single trailing '.' is removed from what remains
    (e.g. 'negative.|3733' -> 'negative'). A last field without '|' is
    returned unchanged.
    """
    parts = line.split(',')
    # partition() never raises; unpacking split('.|', 1) failed with a
    # ValueError whenever a '|' was present without the preceding dot.
    label, separator, _suffix = parts[-1].partition('|')
    if separator:
        parts[-1] = label[:-1] if label.endswith('.') else label
    return parts


# Read every non-blank line of every input file.
data_rows = []
for filename in input_files:
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                data_rows.append(_parse_record(line))

# Keep only rows with the expected number of fields and binarise the label.
# NOTE(review): 'negative' is written as 'P' and every other label as 'N';
# the later mapping cell turns P into 0 and N into 1 — confirm the naming is intended.
processed_rows = []
for row in data_rows:
    if len(row) != len(headers):
        print(f"Warning: Row length {len(row)} does not match header length {len(headers)}.")
        continue

    label = 'P' if row[-1] == 'negative' else 'N'
    # Build a new list so the parsed rows in data_rows keep their original label.
    processed_rows.append(row[:-1] + [label])


with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(headers)
    writer.writerows(processed_rows)

print(f"Data merged and saved to '{output_file}'.")

In [None]:
# Load the merged CSV produced by the previous cell and preview the first rows.
df = pd.read_csv('./hypothyroid.csv')
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Encode the target: 'P' -> 0, 'N' -> 1.
df["binaryClass"] = df["binaryClass"].map({"P": 0, "N": 1})
# Encode the 't'/'f' flags and the 'F'/'M' sex codes as 1/0 across the whole frame.
df = df.replace({"t": 1, "f": 0})
df = df.replace({"F": 1, "M": 0})
# '?' becomes NaN. np.nan is used because the np.NAN alias was removed in NumPy 2.0.
df = df.replace({"?": np.nan})

In [None]:
df.head()

In [None]:
df["referral source"].value_counts()
del df["referral source"]

In [None]:
# Convert every remaining object-dtype column to numeric; values that cannot
# be parsed become NaN (errors='coerce').
cols = df.columns[df.dtypes.eq('object')]
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
df.dtypes

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Remove the TBG flag and value columns.
# NOTE(review): presumably dropped because TBG is mostly missing — confirm against the counts above.
df = df.drop(['TBG measured', 'TBG'], axis=1)

In [None]:
# Fill missing values. Continuous measurements use the column mean; 0/1 coded
# columns use the most frequent value, because a mean would insert a fractional
# value that is not a valid category (and shows up as a third group in plots).
imputer = SimpleImputer(strategy='mean')
binary_imputer = SimpleImputer(strategy='most_frequent')

binary_columns = ['T4U measured', 'sex']
continuous_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

# SimpleImputer computes its statistic per column, so fitting several columns
# at once is equivalent to fitting them one by one.
df[binary_columns] = binary_imputer.fit_transform(df[binary_columns])
df[continuous_columns] = imputer.fit_transform(df[continuous_columns])

In [None]:
df.head()

In [None]:
# Re-check the per-column missing-value counts after imputation.
df.isnull().sum()

In [None]:
# Render figures inline and set the seaborn default figure size and font scale.
%matplotlib inline
sns.set(rc={'figure.figsize': [5, 5]}, font_scale=0.7)

In [None]:
sns.distplot(df['age'])

In [None]:
sns.distplot(df['sex'])

In [None]:
sns.distplot(df['T3'])

In [None]:
sns.distplot(df['TT4'])

In [None]:
sns.distplot(df['T4U'])

In [None]:
sns.distplot(df['FTI'])

In [None]:
# Number of rows in each target class.
sns.countplot(x='binaryClass', data=df, palette='rocket')

In [None]:
# Target class counts split by the encoded `sex` column.
sns.countplot(x='binaryClass', data=df, hue='sex', palette='BuPu')

In [None]:
# Age distribution within each target class.
sns.boxplot(x='binaryClass', y='age', data=df)

In [None]:
df_corr = df.corr()

In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(df_corr, cmap='viridis', linecolor='k', linewidths=2, annot=True)
plt.tight_layout()
plt.show()

In [None]:
print(df.head())

In [None]:
# anova
# One-way ANOVA of TSH across age bands.

bins = [0, 20, 40, 60, 80, np.inf]
labels = ['0-19', '20-39', '40-59', '60-79', '80+']

# Build the age bands on a separate frame so `df` is never modified; the
# earlier add-then-drop on `df` left a stray categorical column behind if
# the model fit raised in between.
anova_df = df[['TSH']].assign(
    age_group=pd.cut(df['age'], bins=bins, labels=labels, right=False)
)

model = ols('TSH ~ C(age_group)', data=anova_df).fit()

anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

In [None]:
# pca
# Standardise the features and project them onto three principal components.

y = df['binaryClass']
X = df.drop(columns=['binaryClass']).fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

component_names = ['PC' + str(number) for number in range(1, pca.n_components_ + 1)]

# Attach the target with a fresh 0..n-1 index so it lines up with the PCA rows.
df_pca = pd.DataFrame(X_pca, columns=component_names).assign(
    binaryClass=y.reset_index(drop=True)
)

df_pca.head()

In [None]:
# k-means clustering
# Cluster the three principal components and compare the clusters to the target.

sns.set(style="whitegrid")

X = df_pca[['PC1', 'PC2', 'PC3']]

optimal_k = 2
# n_init is set explicitly: its default differs between scikit-learn
# releases, which would otherwise make the clustering version-dependent.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)

df_pca['Cluster'] = kmeans.fit_predict(X)
print(df_pca.head())

y_true = df_pca['binaryClass']
y_pred = df_pca['Cluster']

# Cluster ids are arbitrary, so use a label-permutation-invariant score
# against the target, plus the silhouette for cluster separation.
print(f"Adjusted Rand index vs binaryClass: {adjusted_rand_score(y_true, y_pred):.4f}")
print(f"Silhouette score: {silhouette_score(X, y_pred):.4f}")

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
colors = df_pca['Cluster'].map({0: 'red', 1: 'blue'})
ax.scatter(df_pca['PC1'], df_pca['PC2'], df_pca['PC3'], c=colors, alpha=0.6)
ax.set_title(f'K-Means Clustering with k={optimal_k}')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
plt.show()

In [None]:
x = df.drop(['binaryClass'], axis=1)
y = df['binaryClass']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [None]:
x_ols_train = sm.add_constant(x_train)
x_ols_test = sm.add_constant(x_test)

In [None]:
model = sm.OLS(y_train, x_ols_train).fit()

In [None]:
# Threshold the OLS predictions at 0.5 and score them on the test split.
predicted_probs = model.predict(x_ols_test)
predicted_classes = (predicted_probs > 0.5).astype(int)

accuracy, precision, recall, f1 = (
    scorer(y_test, predicted_classes)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value}")

conf_matrix = confusion_matrix(y_test, predicted_classes)
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
x.shape[1]

In [None]:
model = Sequential()
model.add(Dense(256, input_shape=[x.shape[1]], activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.summary()

In [None]:
plot_model(model, to_file='convnet.png', show_shapes=True,show_layer_names=True)
Image(filename='convnet.png')

In [None]:
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
lrd = ReduceLROnPlateau(monitor = 'val_loss',
                         patience = 20,
                         verbose = 1,
                         factor = 0.75,
                         min_lr = 1e-10)
mcp = ModelCheckpoint('model.keras')
es = EarlyStopping(verbose=1, patience=20)

In [None]:
history = model.fit(x=x_train, y=y_train, epochs=100, callbacks=[lrd, mcp, es], batch_size=64, validation_split=0.1)

In [None]:
model.evaluate(x_test, y_test)

In [None]:
y_pred = model.predict(x_test)
y_pred

In [None]:
y_pred[1]

In [None]:
y_test

In [None]:
y_pred[50]

In [None]:
y_test.iloc[50]

In [None]:
model.predict(x_test)[70]

In [None]:
y_test.iloc[70]

In [None]:
# Training vs. validation loss per epoch. The second curve comes from
# `validation_split`, not the test set, so it is labelled "validation".
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')

plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# Save the network to 'model.keras', the same path the ModelCheckpoint callback writes to.
model.save('model.keras')

In [None]:
# Threshold the network outputs at 0.5 and score them on the test split.
predicted_classes = (y_pred > 0.5).astype(int)

scores = {
    "Accuracy": accuracy_score(y_test, predicted_classes),
    "Precision": precision_score(y_test, predicted_classes),
    "Recall": recall_score(y_test, predicted_classes),
    "F1 Score": f1_score(y_test, predicted_classes),
}
accuracy, precision, recall, f1 = scores.values()

for score_label, score_value in scores.items():
    print(f"{score_label}: {score_value:.4f}")

conf_matrix = confusion_matrix(y_test, predicted_classes)
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
# Bar chart comparing model accuracies.
# NOTE(review): the names and accuracies below are hard-coded and are not read
# from any metric computed earlier in the notebook — confirm they are current.
models = ['Model 1', 'Model 2', 'Model 3']

accuracies = [0.85, 0.98, 0.78]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(models, accuracies, width=0.6)

ax.set_xlabel('Models', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Accuracy of Different Models', fontsize=14)
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Print each value just above its bar.
for i, acc in enumerate(accuracies):
    ax.text(i, acc + 0.02, f"{acc:.2f}", ha='center', fontsize=10)

fig.tight_layout()
plt.show()