# Kamień milowy 3 (Finalne modele)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Load the voting dataset from a CSV file given by a path relative to the working directory.
congressional_voting_df = pd.read_csv("congressional_voting_dataset.csv")

In [None]:
# Re-encode the variables.
# "?" is now treated as abstaining from the vote rather than as missing data.
# The dict is deliberately NOT called `map`, so the builtin `map` stays usable
# in the rest of the notebook.
value_encoding = {"y": 1, "n": -1, "?": 0, "democrat": 1, "republican": 0}
# With the encoding {"y": 2, "n": 1, "?": 0, "democrat": 1, "republican": 0}
# the results are worse.

columns = congressional_voting_df.columns.to_list()
for column in columns:
    # Series.map turns any value that is not a key of the dict into NaN.
    congressional_voting_df[column] = congressional_voting_df[column].map(value_encoding)

congressional_voting_df.head(5)

## Train-test split

In [None]:
# From milestone 2 we remember that the target is balanced.
X_train, X_test, y_train, y_test = train_test_split(
    congressional_voting_df.drop(columns="political_party"),
    congressional_voting_df["political_party"],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

## Modeling (all variables)

In [None]:
# Maps each model key to its position in the parallel `models` and `params` lists.
models_dict = {
    key: position
    for position, key in enumerate(
        (
            "rfc",
            "xgb",  # eXtreme gradient boosting
            "gbc",  # gradient boosting
            "lr",
        )
    )
}

# Hyper-parameter search spaces, one dict per model.
# The position of each dict must match the index stored in `models_dict`.
params = [
    # Index 0: random forest.
    {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # 'auto' was only an alias of 'sqrt' for classifiers and is deprecated
        # (later removed) in newer scikit-learn, where it makes candidates fail
        # to fit. 'log2' is used as a genuinely different second option.
        'max_features': ['sqrt', 'log2'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    },

    # Index 1: XGBoost.
    {
        'max_depth': [6, 10, 15, 20],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'gamma': [0, 0.25, 0.5, 1.0],
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'n_estimators': [100],
    },

    # Index 2: gradient boosting.
    {
        'n_estimators': [100, 300, 500, 700, 900, 1200, 1500, 1700, 2000],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
        'max_depth': [4, 6, 8, 10],
    },

    # Index 3: logistic regression.
    {
        'penalty': ['l1', 'l2'],
        'C': np.logspace(-4, 4, 20),
        'solver': ['liblinear', 'saga'],
    },
]

# Untuned estimator prototypes used as the starting point of each search.
random_forest_model = RandomForestClassifier(random_state=42)
xgboost_model = XGBClassifier(use_label_encoder=False)
gradient_boosting_model = GradientBoostingClassifier(random_state=0)
logistic_regression_model = LogisticRegression(random_state=0)

# The list order must match the indices stored in `models_dict`.
models = [
    random_forest_model,
    xgboost_model,
    gradient_boosting_model,
    logistic_regression_model,
]

# Accumulates the score frames of every evaluated model.
res = list()

In [None]:
def get_scores(clf, X_test, y_test, model_type):
    """Evaluate a fitted classifier on the test split.

    Args:
        clf: fitted estimator exposing ``score`` and ``predict_proba``.
        X_test: test features.
        y_test: test labels.
        model_type: label stored in the ``clf`` column of the result.

    Returns:
        A one-row DataFrame with the columns ``clf``, ``accuracy`` (the value
        of ``clf.score``) and ``auc`` (ROC AUC computed from the second column
        of ``predict_proba``).
    """
    accuracy = clf.score(X_test, y_test)
    second_class_proba = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, second_class_proba)

    scores = {"clf": [model_type], "accuracy": [accuracy], "auc": [auc]}
    return pd.DataFrame(scores)

In [None]:
def modeling(X_train, y_train, X_test, y_test, model_type):
    """Fit the requested model and score it on the test split.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: test features.
        y_test: test labels.
        model_type: ``"baseline"`` for an untuned logistic regression, or a
            key of ``models_dict`` for a model tuned with a randomized search.

    Returns:
        A tuple ``(fitted_estimator, scores)`` where ``scores`` is the one-row
        DataFrame produced by ``get_scores``.

    Raises:
        KeyError: if ``model_type`` is neither ``"baseline"`` nor a key of
            ``models_dict``.
    """
    if model_type == "baseline":
        lr = LogisticRegression(random_state=0)
        lr.fit(X_train, y_train)
        return (lr, get_scores(lr, X_test, y_test, model_type))

    # `models` and `params` are parallel lists indexed through `models_dict`.
    model_index = models_dict[model_type]
    random_search = RandomizedSearchCV(
        models[model_index],
        params[model_index],
        random_state=0,
        n_jobs=-1,
    )
    random_search.fit(X_train, y_train)

    # With the default refit=True the search has already refitted the best
    # candidate on the whole training set, so no second fit is needed here.
    clf = random_search.best_estimator_

    return (clf, get_scores(clf, X_test, y_test, model_type))

In [None]:
def merge_res_to_df(res):
    """Stack a list of score DataFrames into a single DataFrame.

    Args:
        res: list of DataFrames, stacked row-wise in list order.

    Returns:
        One DataFrame with a fresh 0..n-1 index, or ``None`` when ``res`` is
        empty.
    """
    if len(res) == 0:
        return None

    # A single concat avoids rebuilding the growing frame on every iteration.
    return pd.concat(res, axis=0, ignore_index=True)

### Baseline

In [None]:
# Untuned logistic regression used as the reference point.
baseline, baseline_scores = modeling(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="baseline",
)
res.append(baseline_scores)
baseline_scores

### Random forest

In [None]:
rfc, rfc_scores = modeling(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="rfc",
)
res.append(rfc_scores)
rfc_scores  # worse than the baseline...

### XGBoost

In [None]:
xgb, xgb_scores = modeling(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="xgb",
)
res.append(xgb_scores)
xgb_scores

### Gradient boosting

In [None]:
gbc, gbc_scores = modeling(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="gbc",
)
res.append(gbc_scores)
gbc_scores

### Logistic Regression

In [None]:
lr, lr_scores = modeling(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="lr",
)
res.append(lr_scores)
lr_scores

## Conclusion

### All variables
Ogólnie nie da się uzyskać wyniku lepszego niż 0.98... W porównaniu do kamienia 2 wyniki nieco się poprawiły, prawdopodobnie z powodu zmiany kodowania (i rezygnacji z imputacji `?`).

In [None]:
# Combine the score frames collected in `res` into one table for comparison.
merge_res_to_df(res)