# Kamień milowy 3 (Finalne modele)

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the raw congressional voting records from the CSV next to the notebook.
congressional_voting_df = pd.read_csv(
    filepath_or_buffer="congressional_voting_dataset.csv",
)

In [3]:
# Re-encode the variables numerically.
# "?" is now treated as an abstention from voting (0), not as missing data.
# The target labels are encoded in the same pass: democrat -> 1, republican -> 0.
# The dict is deliberately NOT called `map`, so the built-in `map` stays usable
# in every later cell of the notebook.
vote_encoding = {"y": 1, "n": -1, "?": 0, "democrat": 1, "republican": 0}

# NOTE: the columns are overwritten in place, so this cell must run exactly once
# after the CSV is loaded. On a second run the already-encoded integers are not
# keys of `vote_encoding`, and `Series.map` would turn every value into NaN.
columns = congressional_voting_df.columns.to_list()
for column in columns:
    congressional_voting_df[column] = congressional_voting_df[column].map(vote_encoding)

congressional_voting_df.head(5)

Unnamed: 0,handicapped_infants,water_project_cost_sharing,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa,political_party
0,-1,1,-1,1,1,1,-1,-1,-1,1,0,1,1,1,-1,1,0
1,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,1,-1,0,0
2,0,1,1,0,1,1,-1,-1,-1,-1,1,-1,1,1,-1,-1,1
3,-1,1,1,-1,0,1,-1,-1,-1,-1,1,-1,1,-1,-1,1,1
4,1,1,1,-1,1,1,-1,-1,-1,-1,1,0,1,1,1,1,1


## Train-test split

In [4]:
# From milestone 2 we remember that the target is balanced.
X_train, X_test, y_train, y_test = train_test_split(
    congressional_voting_df.drop(columns="political_party"),
    congressional_voting_df["political_party"],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

## Modeling (all variables)

In [43]:
# Maps a short model name to its position in the parallel `models` / `params` lists.
#   "rfc" -> 0  (random forest classifier)
#   "xbg" -> 1  (eXtreme gradient boosting)
#   "abc" -> 2  (adaptive boost)
# NOTE(review): "xbg" looks like a typo for "xgb"; the key is left as-is because
# callers may already pass it - confirm before renaming.
models_dict = dict(rfc=0, xbg=1, abc=2)

# Hyper-parameter search spaces, one dict per model, looked up by the index
# stored in `models_dict`. Entry 0 is the random-forest search space.
params = [
    {
        "bootstrap": [True, False],
        "max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # "auto" is no longer listed: for RandomForestClassifier it was only an
        # alias of "sqrt", deprecated in scikit-learn 1.1 and removed in 1.3,
        # where every candidate using it fails to fit. "log2" takes its place so
        # the option list keeps the same length.
        "max_features": ["sqrt", "log2"],
        "min_samples_leaf": [1, 2, 4],
        "min_samples_split": [2, 5, 10],
        "n_estimators": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    }
]

# Un-fitted base estimators, positioned to match the indices in `models_dict`.
models = [
    RandomForestClassifier(random_state=42),
]

In [38]:
def get_scores(clf, X_test, y_test):
    """Score a fitted classifier on the given evaluation data.

    Parameters
    ----------
    clf
        Fitted estimator exposing ``score`` and ``predict_proba``.
    X_test, y_test
        Features and true labels to evaluate on.

    Returns
    -------
    pandas.DataFrame
        One row with two columns: ``"accuracy"`` (the value of
        ``clf.score(X_test, y_test)``) and ``"auc"`` (``roc_auc_score`` computed
        from the second column of ``clf.predict_proba(X_test)``).
    """
    accuracy = clf.score(X_test, y_test)
    # Column 1 of predict_proba is used as the score fed to the ROC AUC.
    second_class_proba = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, second_class_proba)
    return pd.DataFrame({"accuracy": [accuracy], "auc": [auc]})

In [47]:
def modeling(X_train, y_train, X_test, y_test, model_type):
    """Fit the requested model on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Evaluation features and labels, forwarded to ``get_scores``.
    model_type : str
        ``"baseline"`` fits a ``LogisticRegression(random_state=0)`` without any
        tuning. Any other value is looked up in ``models_dict``; the matching
        entries of ``models`` and ``params`` are tuned with ``RandomizedSearchCV``.

    Returns
    -------
    tuple
        ``(fitted_estimator, scores)``, where ``scores`` is the one-row frame
        returned by ``get_scores``.

    Raises
    ------
    KeyError
        If ``model_type`` is neither ``"baseline"`` nor a key of ``models_dict``.
    IndexError
        If ``models`` / ``params`` have no entry at the index stored for
        ``model_type``.
    """
    if model_type == "baseline":
        lr = LogisticRegression(random_state=0)
        lr.fit(X_train, y_train)
        return (lr, get_scores(lr, X_test, y_test))

    model_index = models_dict[model_type]
    clf = models[model_index]
    parameter_set = params[model_index]

    random_search = RandomizedSearchCV(clf, parameter_set, random_state=0, n_jobs=-1)
    random_search.fit(X_train, y_train)

    # With the default refit=True the search has already re-trained the best
    # configuration on the full training set, so best_estimator_ is ready to use.
    # Fitting it again would only repeat that (potentially expensive) training.
    clf = random_search.best_estimator_

    return (clf, get_scores(clf, X_test, y_test))

### Baseline

In [40]:
# Fit the untuned logistic-regression baseline and show its test-set scores.
baseline, baseline_scores = modeling(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="baseline",
)
baseline_scores

Unnamed: 0,accuracy,auc
0,0.984733,0.996164


### Random forest

In [46]:
# Tune and fit the random forest, then show its test-set scores.
rfc, rfc_scores = modeling(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="rfc",
)
rfc_scores  # worse than the baseline...

Unnamed: 0,accuracy,auc
0,0.977099,0.995908
