# Start!

## Problem:
Mamy doczynienia z serią danych numerycznych wraz z problemem klasyfikacji binarnej.

Na podstawie opisów cech można wywnioskować, że są one w jakimś stopniu ze sobą powiązane - chociażby szerokość serca czy płuc z ich polem powierzchni.

### Moja początkowa intuicja:
1. Dokonać analizy PCA na cechach, by ograniczyć szum informacji
2. Zastosować klasyfikację knn z cross-examination na hiperparametrze k, by uniknąć under lub overfittingu



In [121]:
import numpy as np
import pandas as pd
from nltk.classify.svm import SvmClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [122]:
# Load DataFrame from CSV
df = pd.read_csv("task_data.csv", sep=",")

# Fix commas in floats (if necessary)
for col in df.columns:
    if df[col].dtype == object:
        df[col] = df[col].str.replace(",", ".")

# Convert columns except 'ID' and label to float
cols_to_float = [col for col in df.columns if col not in ['ID', 'Cardiomegaly']]
df[cols_to_float] = df[cols_to_float].astype(float)

# Encode target labels if necessary
le = LabelEncoder()
y = le.fit_transform(df["Cardiomegaly"])
X = df.drop(columns=["ID", "Cardiomegaly"]).values


In [123]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X, y)
print(y_resampled.shape)


(56,)




In [124]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_standardized = scaler.fit_transform(X_resampled)

# Zdecydowałem się nie używać PCA
X_reduced = X_standardized


In [125]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 50, 100, 200],
    'min_samples_split': [2, 5 ],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
}


X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_resampled)
rfClassifier = RandomForestClassifier()

grid_search = GridSearchCV(
    estimator=rfClassifier,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    scoring='roc_auc'
    )

grid_search.fit(X_train, y_train)

print(grid_search.predict([X_test[0]]))

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)
print("Accuracy:", grid_search.score(X_test, y_test))

rfClassifier = grid_search.best_estimator_


[0]
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Estimator: RandomForestClassifier(n_estimators=200)
Accuracy: 0.9333333333333333


In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
   "n_neighbors": range(1, 11),
    "leaf_size": range(30,200,5),
    "weights": ["uniform", "distance"]
}


X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_resampled)
knClassifier = KNeighborsClassifier()

grid_search = GridSearchCV(
    estimator=knClassifier,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    scoring='roc_auc'
    )

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)
print("Accuracy:", grid_search.score(X_test, y_test))

knClassifier = grid_search.best_estimator_



Best Parameters: {'leaf_size': 30, 'n_neighbors': 8, 'weights': 'distance'}
Best Estimator: KNeighborsClassifier(n_neighbors=8, weights='distance')
Accuracy: 1.0


In [127]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "solver": ["newton-cg", "lbfgs", "liblinear"],
}


X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_resampled)
logisticRegression = LogisticRegression(max_iter=10000)

grid_search = GridSearchCV(
    estimator=logisticRegression,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    scoring='roc_auc'
    )

grid_search.fit(X_train, y_train)

print(grid_search.predict([X_test[0]]), y_test[0])

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)
print("Accuracy:", grid_search.score(X_test, y_test))

logisticRegression = grid_search.best_estimator_



[0] 1
Best Parameters: {'solver': 'newton-cg'}
Best Estimator: LogisticRegression(max_iter=10000, solver='newton-cg')
Accuracy: 0.8541666666666667


In [128]:
import numpy as np

def predict(samples):
    results = []
    results.append([x[1] for x in logisticRegression.predict_proba(samples)])
    results.append(knClassifier.predict(samples))
    results.append(rfClassifier.predict(samples))

    # Korzystamy ze średniej tych modeli
    sum_of_results = np.sum(results, axis=0)/3

    sum_of_results = np.round(sum_of_results, decimals=0)
    return sum_of_results

In [136]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# As data is split randomly, for our last test let's split it again
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_resampled)

predictions = predict(X_test)
actual = y_test

accuracy = accuracy_score(actual, predictions)
precision = precision_score(actual, predictions)
recall = recall_score(actual, predictions)
f1 = f1_score(actual, predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0
