In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
train = pd.read_csv('./data/train_prep.csv')
test = pd.read_csv('./data/test_prep.csv')

In [4]:
X_train = train.drop('target',axis=1)
y_train = train['target']

In [8]:
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [6, 9, 12, 15],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
    'class_weight': ['balanced_subsample', {0: 1, 1: 7}, {0: 1, 1: 10}, {0: 1, 1: 12},{0: 1, 1: 14}]
}
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1 = make_scorer(f1_score, average='binary')  
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1, cv=stratified_kfold, verbose=3)

grid_search.fit(X_train, y_train)


print("Best parameters found: ", grid_search.best_params_)
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
[CV 1/5] END class_weight=balanced_subsample, max_depth=6, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.163 total time=   1.4s
[CV 2/5] END class_weight=balanced_subsample, max_depth=6, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.164 total time=   1.6s
[CV 3/5] END class_weight=balanced_subsample, max_depth=6, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.160 total time=   1.7s
[CV 4/5] END class_weight=balanced_subsample, max_depth=6, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.166 total time=   1.6s
[CV 5/5] END class_weight=balanced_subsample, max_depth=6, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.177 total time=   1.4s
[CV 1/5] END class_weight=balanced_subsample, max_depth=6, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.164 total time=   3.4s
[CV 2/5] END class_weight=balanced_subsam

In [9]:

print("Best f1 score: ", grid_search.best_score_)

Best f1 score:  0.18380275088625825


In [9]:

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, make_scorer

In [8]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

param_grid = {
    'knn__n_neighbors': np.arange(1, 21)  
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1 = make_scorer(f1_score, average='binary')
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=f1, cv=stratified_kfold, verbose=3)

grid_search.fit(X_train, y_train)


print("Best parameters found: ", grid_search.best_params_)
best_model = grid_search.best_estimator_
print("Best f1 score: ", grid_search.best_score_)
# 이상치 제거해주는게 더효과적일수도

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ................knn__n_neighbors=1;, score=0.173 total time=   2.3s
[CV 2/5] END ................knn__n_neighbors=1;, score=0.186 total time=   1.8s
[CV 3/5] END ................knn__n_neighbors=1;, score=0.192 total time=   1.7s
[CV 4/5] END ................knn__n_neighbors=1;, score=0.177 total time=   1.7s
[CV 5/5] END ................knn__n_neighbors=1;, score=0.165 total time=   1.5s
[CV 1/5] END ................knn__n_neighbors=2;, score=0.131 total time=   1.5s
[CV 2/5] END ................knn__n_neighbors=2;, score=0.129 total time=   1.6s
[CV 3/5] END ................knn__n_neighbors=2;, score=0.138 total time=   1.6s
[CV 4/5] END ................knn__n_neighbors=2;, score=0.135 total time=   1.5s
[CV 5/5] END ................knn__n_neighbors=2;, score=0.116 total time=   1.7s
[CV 1/5] END ................knn__n_neighbors=3;, score=0.150 total time=   1.6s
[CV 2/5] END ................knn__n_neighbors=3

In [15]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis())
])

param_grid = {
    'lda__n_components': np.arange(1, min(X_train.shape[1], len(np.unique(y_train))))  
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1 = make_scorer(f1_score, average='binary')
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=f1, cv=stratified_kfold, verbose=3)

grid_search.fit(X_train, y_train)


print("Best parameters found: ", grid_search.best_params_)
best_model = grid_search.best_estimator_
print("Best f1 score: ", grid_search.best_score_)
# 얘도 이상치 제거해주는게 나을수도

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ...............lda__n_components=1;, score=0.105 total time=   0.9s
[CV 2/5] END ...............lda__n_components=1;, score=0.091 total time=   0.9s
[CV 3/5] END ...............lda__n_components=1;, score=0.133 total time=   0.9s
[CV 4/5] END ...............lda__n_components=1;, score=0.115 total time=   0.8s
[CV 5/5] END ...............lda__n_components=1;, score=0.082 total time=   0.9s
Best parameters found:  {'lda__n_components': 1}
Best f1 score:  0.10499522383639408
