<a href="https://colab.research.google.com/github/dima1115/Machine-learning/blob/main/%D0%94%D0%B0%D1%80%D0%BC%D0%BE%D1%81%D1%82%D1%83%D0%BA_%D0%BF%D1%804_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Лабораторна робота 4
- **Тема:** Задача класифікації
- **Автор:** Дармостук Дмитро, група ФІТ-4-9
- **Варіант:** 6

In [1]:

import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
np.random.seed(6)


## Завдання 1. Breast Cancer

In [2]:

# --- Data: scikit-learn's built-in breast cancer dataset as a DataFrame/Series ---
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

# Hold out 20% of the rows for the final evaluation; the split is stratified
# on the label and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=6,
    stratify=y,
)

# --- Models ---
# Every model is wrapped in a Pipeline whose last step is named 'clf', so the
# grids below can address its hyper-parameters through the 'clf__' prefix.
# Only the logistic regression gets a scaling step in front of it.
pipelines = {
    'LogReg': Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=500)),
    ]),
    'Tree': Pipeline(steps=[
        ('clf', DecisionTreeClassifier(random_state=6)),
    ]),
    'RF': Pipeline(steps=[
        ('clf', RandomForestClassifier(random_state=6)),
    ]),
}

# --- Hyper-parameter grids, keyed by the same names as `pipelines` ---
grids = {
    'LogReg': {
        'clf__C': [0.1, 1.0, 10.0],
        'clf__solver': ['lbfgs'],
    },
    'Tree': {
        'clf__max_depth': [None, 5, 10],
        'clf__min_samples_split': [2, 5, 10],
    },
    'RF': {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [None, 10],
        'clf__min_samples_split': [2, 5],
    },
}

# Tune every pipeline with a 5-fold cross-validated grid search on the training
# split, then evaluate the refitted best estimator on the held-out test split.
res = {}    # model name -> accuracy on the test split
best = {}   # model name -> best estimator returned by the grid search
preds = {}  # model name -> test-split predictions (computed once, reused below)
for name, pipe in pipelines.items():
    grid = GridSearchCV(pipe, grids[name], cv=5, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    best[name] = grid.best_estimator_
    preds[name] = best[name].predict(X_test)
    res[name] = accuracy_score(y_test, preds[name])
print('Accuracy table:\n', pd.Series(res).sort_values(ascending=False))

# Detailed per-model report. The predictions and accuracies cached above are
# reused here instead of calling predict()/accuracy_score() a second time.
for name, pred in preds.items():
    print(f"\n== {name} ==")
    print('Accuracy:', res[name])
    print('Confusion:\n', confusion_matrix(y_test, pred))
    print('Report:\n', classification_report(y_test, pred))


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Accuracy table:
 LogReg    0.991228
RF        0.973684
Tree      0.964912
dtype: float64

== LogReg ==
Accuracy: 0.9912280701754386
Confusion:
 [[41  1]
 [ 0 72]]
Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        42
           1       0.99      1.00      0.99        72

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114


== Tree ==
Accuracy: 0.9649122807017544
Confusion:
 [[41  1]
 [ 3 69]]
Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95        42
           1       0.99      0.96      0.97        72

    accuracy                           0.96       114
   macro avg       0.9

## Завдання 2. Titanic

In [3]:

import pandas as pd
from pathlib import Path

def load_titanic(local_path='/content/titanic.csv',
                 url='https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'):
    """Load the Titanic passenger table from the first source that works.

    Sources are tried in this order:

    1. a CSV file at ``local_path`` (used only if the file exists);
    2. ``seaborn.load_dataset('titanic')``;
    3. a CSV read from ``url``.

    Parameters
    ----------
    local_path : str or os.PathLike, default '/content/titanic.csv'
        Location of an already downloaded CSV copy of the dataset.
    url : str
        Address of the CSV used as the last-resort fallback.

    Returns
    -------
    pandas.DataFrame
        The table exactly as the chosen source provides it. No columns are
        renamed or cleaned here.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raises, either for an existing but unreadable
        local file or when the final ``url`` fallback fails.
    """
    csv_file = Path(local_path)
    if csv_file.exists():
        return pd.read_csv(csv_file)

    try:
        # seaborn is optional, so it is imported lazily inside the try block.
        import seaborn as sns
        return sns.load_dataset('titanic')
    except Exception as exc:
        # Still best-effort (any failure falls through to the URL), but the
        # reason is reported instead of being silently discarded.
        import warnings
        warnings.warn(
            f"seaborn 'titanic' dataset unavailable ({exc!r}); falling back to {url}",
            RuntimeWarning,
            stacklevel=2,
        )

    return pd.read_csv(url)

# Fetch the raw table. The sources can name columns in lower case or with a
# leading capital, so the core columns are first brought to the capitalised form.
titanic = load_titanic()
titanic = titanic.rename(columns={
    'survived': 'Survived',
    'pclass': 'Pclass',
    'sex': 'Sex',
    'age': 'Age',
    'fare': 'Fare',
})

# Keep only the modelling columns that are actually present (either spelling).
wanted = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare',
          'sibsp', 'parch', 'SibSp', 'Parch', 'embarked', 'Embarked']
cols = [name for name in wanted if name in titanic.columns]
df_t = titanic[cols].copy()

# The remaining columns are normalised to lower case, but only when the
# lower-case column does not already exist.
to_lower = {
    upper: lower
    for upper, lower in (('SibSp', 'sibsp'), ('Parch', 'parch'), ('Embarked', 'embarked'))
    if upper in df_t.columns and lower not in df_t.columns
}
if to_lower:
    df_t = df_t.rename(columns=to_lower)

# Encode sex as 0/1; any value outside this mapping becomes NaN.
sex_codes = {'male': 0, 'female': 1, 'Male': 0, 'Female': 1}
df_t['Sex'] = df_t['Sex'].map(sex_codes)

# NOTE(review): the medians and the mode below are computed on the full table,
# i.e. before the train/test split further down. Consider moving imputation
# into the model pipeline so it is fitted on the training rows only.
for col in ('Age', 'Fare', 'sibsp', 'parch'):
    if col not in df_t.columns:
        continue
    df_t[col] = df_t[col].fillna(df_t[col].median())

if 'embarked' in df_t.columns:
    # Fill gaps with the most frequent value, then one-hot encode
    # (the first level is dropped).
    top_embarked = df_t['embarked'].mode().iloc[0]
    df_t['embarked'] = df_t['embarked'].fillna(top_embarked)
    df_t = pd.get_dummies(df_t, columns=['embarked'], drop_first=True)

# Drop any row that still contains a missing value.
df_t = df_t.dropna(axis=0)

from sklearn.model_selection import train_test_split
# The label is the 'Survived' column cast to int; every other column is a feature.
y2 = df_t['Survived'].astype(int)
X2 = df_t.drop(columns=['Survived'])

# Stratified, seeded 80/20 split.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2,
    test_size=0.2,
    random_state=6,
    stratify=y2,
)

# Each model is a Pipeline ending in a step named 'clf' so the grids can use
# the 'clf__' prefix. The logistic regression is preceded by a scaler that is
# configured not to centre the features (with_mean=False).
pipes = {
    'LogReg': Pipeline(steps=[
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', LogisticRegression(max_iter=500)),
    ]),
    'Tree': Pipeline(steps=[
        ('clf', DecisionTreeClassifier(random_state=6)),
    ]),
    'RF': Pipeline(steps=[
        ('clf', RandomForestClassifier(random_state=6)),
    ]),
}

# Hyper-parameter grids, keyed by the same names as `pipes`.
grids = {
    'LogReg': {
        'clf__C': [0.1, 1.0, 10.0],
        'clf__solver': ['lbfgs'],
    },
    'Tree': {
        'clf__max_depth': [None, 5, 10],
        'clf__min_samples_split': [2, 5, 10],
    },
    'RF': {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [None, 10],
        'clf__min_samples_split': [2, 5],
    },
}
# Grid-search every pipeline with 5-fold CV on the training split and record
# the test-split accuracy of the refitted best estimator.
res2, best2 = {}, {}
for name, pipe in pipes.items():
    grid = GridSearchCV(estimator=pipe, param_grid=grids[name], cv=5, n_jobs=-1, verbose=1)
    grid.fit(X2_train, y2_train)
    tuned = grid.best_estimator_
    best2[name] = tuned
    pr = tuned.predict(X2_test)
    res2[name] = accuracy_score(y2_test, pr)

ranking = pd.Series(res2).sort_values(ascending=False)
print('Accuracy table (Titanic):\n', ranking)

# NOTE(review): the winner is picked by accuracy on the test split; selecting
# by the cross-validation score instead would keep the test split untouched.
best2_name = max(res2, key=res2.get)
print('Best model for Titanic:', best2_name)

# Compare true and predicted labels for the first ten test rows.
first_rows = X2_test.iloc[:10]
pr10 = best2[best2_name].predict(first_rows)
print(pd.DataFrame({'y_true': y2_test.iloc[:10].values, 'y_pred': pr10}))


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Accuracy table (Titanic):
 Tree      0.826816
RF        0.821229
LogReg    0.793296
dtype: float64
Best model for Titanic: Tree
   y_true  y_pred
0       0       0
1       0       0
2       0       0
3       1       1
4       0       0
5       1       1
6       1       1
7       0       1
8       0       0
9       1       0
