In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from imblearn.metrics import classification_report_imbalanced

In [None]:
# Load the feature table and the label table from the working directory.
data, labels = (pd.read_csv(csv_name) for csv_name in ("data.csv", "labels.csv"))

In [None]:
# Preview the first rows of the feature table.
data.head()

In [None]:
# Preview the first rows of the label table.
labels.head()

In [None]:
# Share of each disease type as a pie chart.
# value_counts() orders classes by frequency, so the wedge labels are taken from
# the same Series index; pd.unique() returns order of first appearance and would
# attach the wrong name to a wedge whenever the two orders differ.
# The raw counts are passed directly: pie() normalises them, and scaling by 1/100
# could leave the sum below 1 and draw a partial pie on older matplotlib.
disease_counts = labels.disease_type.value_counts()

plt.figure(figsize=(5, 5))
plt.pie(
    disease_counts,
    labels=disease_counts.index,
    startangle=180,
    autopct='%1.1f%%',
    textprops={'fontsize': 10, 'rotation': 0},
    shadow=True,
    radius=1.25,
)
plt.show()

In [None]:
# Horizontal bar chart of the number of samples per disease type.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    y=labels.disease_type,
    hue=labels.disease_type,
    palette='magma',
    dodge=False,
    legend=False,
    ax=ax,
)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Disease Type', fontsize=12)
ax.set_title('Distribution of Disease Types', fontsize=14)
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project every column except the first onto two principal components
# (the first column is presumably the exported row index -- confirm against data.csv).
feature_matrix = data.iloc[:, 1:]
pca = PCA(n_components=2)
principal_components = pca.fit_transform(feature_matrix)
principal_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# Attach the disease type of each sample so the scatter can be coloured by class.
final_df = pd.concat([principal_df, labels['disease_type']], axis=1)

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=final_df,
    x='PC1',
    y='PC2',
    hue='disease_type',
    palette='viridis',
    ax=ax,
)
ax.set_title('PCA Visualization')
plt.show()

# RANDOM FOREST

In [None]:
# Features: every column except 'Unnamed: 0'; target: the disease type column.
X = data.drop(columns=['Unnamed: 0'])
y = labels['disease_type']

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the target
# so every disease type keeps the same share in the train and test sets; an
# unstratified split can under-represent rare classes in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Base estimator for the grid search. The seed is fixed so that differences
# between parameter combinations reflect the parameters rather than run-to-run
# randomness of the forest, and so the search is reproducible.
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter values explored by the grid search.
param_grid = {
    'n_estimators': [50, 100],
    'max_features': [7, 19, 21],
    'min_samples_split': [3, 7, 11],
    'max_depth': [3, 7, 11],
}

In [None]:
# Define the cross-validation strategy using StratifiedKFold.
cv = StratifiedKFold(n_splits=3)

# Exhaustive search over param_grid, scored by accuracy on each fold.
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=cv)
grid_search.fit(X_train, y_train)

print("En iyi parametreler:", grid_search.best_params_)
print("En iyi doğruluk:", grid_search.best_score_)

In [None]:
# Keep the best random-forest parameter combination found by the grid search.
best_params_rf =  grid_search.best_params_

In [None]:
# Final random forest trained on the training split with the tuned parameters.
# The seed is fixed so the reported test metrics are reproducible; without it
# every run of this cell produces a slightly different model.
rf = RandomForestClassifier(**best_params_rf, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Predict the disease type of the held-out test samples with the fitted random forest.
y_pred = rf.predict(X_test)

In [None]:
# Fraction of test samples whose predicted disease type matches the true one.
accuracy_score(y_test, y_pred)

In [None]:
# Mean 3-fold cross-validated accuracy of the tuned random forest.
# cross_val_score refits clones of the estimator on the data it is given, so it
# is run on the training split; passing the test split would train on the
# held-out data and score on only a third of an already small sample.
cross_val_score(rf, X_train, y_train, cv=3).mean()

In [None]:
from imblearn.metrics import sensitivity_specificity_support

# Micro-averaged sensitivity and specificity over all disease types.
sensitivity_specificity_support(
    y_test,
    y_pred,
    labels=pd.unique(labels.disease_type),
    average='micro',
)

In [None]:
# Per-class report for the random forest.
# The report lists classes in sorted label order, so the display names must be
# sorted too; pd.unique() returns order of first appearance and would put the
# wrong name on a row. Passing `labels` pins the row order explicitly.
print(classification_report_imbalanced(
    y_test,
    y_pred,
    labels=sorted(labels.disease_type.unique()),
    target_names=sorted(labels.disease_type.unique()),
))

In [None]:
# Confusion matrix of the current predictions (rows: true class, columns: predicted).
# The same explicit, sorted class list drives both the matrix and the tick
# labels; pd.unique() order (first appearance) does not match the sorted order
# confusion_matrix uses by default and would mislabel rows and columns.
class_names = sorted(labels.disease_type.unique())
cm = confusion_matrix(y_test, y_pred, labels=class_names)

plt.figure(figsize=(7, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    linewidths=0.7,
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# XGBoost

# fix 

In [None]:
# Debug output: true test labels next to the current predictions.
# The original printed `y_pred_single_class`, a name that is never defined in
# this notebook (NameError on a fresh kernel); `y_pred` is the prediction array
# that exists at this point.
print("Gerçek Etiketler:", y_test)
print("Tahminler:", y_pred)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Base XGBoost classifier with default settings; used as the grid-search estimator below.
xgboost = XGBClassifier()

In [None]:
# Hyper-parameter values explored by the XGBoost grid search.
xgboost_params = dict(
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.6, 0.8, 1],
    n_estimators=[100, 500],
    max_depth=[3, 5, 7],
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the full label column so every disease type gets a code,
# then encode ONLY the training targets. The original encoded the whole label
# column, whose length does not match X_train, so fit() raised a ValueError.
label_encoder = LabelEncoder()
label_encoder.fit(labels['disease_type'])
y_train_encoded = label_encoder.transform(y_train)

# Train the model with GridSearchCV (10-fold CV, all CPU cores).
xgboost_cv_model = GridSearchCV(xgboost, xgboost_params, cv=10, n_jobs=-1, verbose=2)
xgboost_cv_model.fit(X_train, y_train_encoded)

# Predict the test set; these are integer codes -- use
# label_encoder.inverse_transform() to map them back to disease names.
y_pred_encoded = xgboost_cv_model.predict(X_test)

In [None]:
# Display the grid-search object.
xgboost_cv_model

In [None]:
# Keep the best XGBoost parameter combination found by the grid search.
best_params_xgb = xgboost_cv_model.best_params_

In [None]:
xgboost = XGBClassifier(
    learning_rate=best_params_xgb['learning_rate'], 
    max_depth=best_params_xgb['max_depth'], 
    n_estimators=best_params_xgb['n_estimators'], 
    subsample=best_params_xgb['subsample']).fit(X_train, y_train)

In [None]:
y_pred = xgboost.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
cross_val_score(xgboost, X_test, y_test, cv=21).mean()

In [None]:
from imblearn.metrics import sensitivity_specificity_support

# Micro-averaged sensitivity and specificity over all disease types.
sensitivity_specificity_support(
    y_test,
    y_pred,
    labels=pd.unique(labels.disease_type),
    average='micro',
)

In [None]:
# Per-class report for the current predictions.
# The report lists classes in sorted label order, so the display names must be
# sorted too; pd.unique() returns order of first appearance and would put the
# wrong name on a row. Passing `labels` pins the row order explicitly.
print(classification_report_imbalanced(
    y_test,
    y_pred,
    labels=sorted(labels.disease_type.unique()),
    target_names=sorted(labels.disease_type.unique()),
))

In [None]:
# Confusion matrix of the current predictions (rows: true class, columns: predicted).
# The same explicit, sorted class list drives both the matrix and the tick
# labels; pd.unique() order (first appearance) does not match the sorted order
# confusion_matrix uses by default and would mislabel rows and columns.
class_names = sorted(labels.disease_type.unique())
cm = confusion_matrix(y_test, y_pred, labels=class_names)

plt.figure(figsize=(7, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    linewidths=0.7,
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Conclusion

In [None]:
# Collect the test accuracy (in percent) of every trained model into `results`.
# - Rows are gathered in a list and the frame is built once: DataFrame.append
#   was removed in pandas 2.0 and growing a frame row by row is quadratic.
# - The former MLPClassifier branch is dropped: no such model is in `models`
#   and it referenced `X_test_scaler`, which this notebook never defines.
models = [rf, xgboost]
result = []

for model in models:
    names = model.__class__.__name__
    print(names)
    model_pred = model.predict(X_test)
    # A model trained on integer-encoded targets predicts codes; map them back
    # to disease names before comparing with the (non-numeric) test labels.
    if pd.api.types.is_numeric_dtype(model_pred) and not pd.api.types.is_numeric_dtype(y_test):
        model_pred = label_encoder.inverse_transform(model_pred)
    acc = accuracy_score(y_test, model_pred)
    result.append({'Models': names, 'Accuracy': acc * 100})

results = pd.DataFrame(result, columns=['Models', 'Accuracy'])

In [None]:
# Horizontal bar chart comparing the test accuracy of the models.
fig, ax = plt.subplots()
sns.barplot(data=results, x='Accuracy', y='Models', color='r', ax=ax)
ax.set_xlabel('Accuracy %')
ax.set_title('Modellerin Doğruluk Oranları');

In [None]:
# Display the accuracy table for all models.
results