In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

import acquire
import prepare

In [2]:
titanic_df = acquire.get_titanic_data()
train, test, validate = prepare.prep_titanic_data(titanic_df)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
301,1,3,30.189296,2,0,23.25,0,1,1,0
290,1,1,26.0,0,0,78.85,1,0,0,1
779,1,1,43.0,0,1,211.3375,0,0,0,1
356,1,1,22.0,0,1,55.0,0,0,0,1
147,0,3,9.0,2,2,34.375,0,0,0,1


In [3]:
#Creating X and y model-testing dataframes:
X_train = train.drop(columns = ['survived'])
y_train = train.survived

X_validate = validate.drop(columns = ['survived'])
y_validate = validate.survived

X_test = test.drop(columns = ['survived'])
y_test = test.survived

In [19]:
def run_exps(X_train: pd.DataFrame , y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame) -> pd.DataFrame:
    '''
    Lightweight script to test many models and find winners
    :param X_train: training split
    :param y_train: training target vector
    :param X_test: test split
    :param y_test: test target vector
    :return: DataFrame of predictions
    '''
    
    dfs = []
    models = [
          ('LogReg', LogisticRegression()), 
          ('RF', RandomForestClassifier()),
          ('KNN', KNeighborsClassifier()),
          ('SVM', SVC()), 
          ('GNB', GaussianNB()),
          ('XGB', XGBClassifier())
        ]
    results = []
    names = []
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
    target_names = ['malignant', 'benign']
    for name, model in models:
        kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
        cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))
        results.append(cv_results)
        names.append(name)
        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)
        final = pd.concat(dfs, ignore_index=True)
    return final

In [25]:
run_exps(X_train, y_train, X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogReg
              precision    recall  f1-score   support

   malignant       0.80      0.89      0.84       132
      benign       0.78      0.65      0.71        82

    accuracy                           0.79       214
   macro avg       0.79      0.77      0.77       214
weighted avg       0.79      0.79      0.79       214

RF
              precision    recall  f1-score   support

   malignant       0.85      0.89      0.87       132
      benign       0.80      0.74      0.77        82

    accuracy                           0.83       214
   macro avg       0.83      0.82      0.82       214
weighted avg       0.83      0.83      0.83       214

KNN
              precision    recall  f1-score   support

   malignant       0.78      0.79      0.78       132
      benign       0.65      0.65      0.65        82

    accuracy                           0.73       214
   macro avg       0.72      0.72      0.72       214
weighted avg       0.73      0.73      0.73       214

SVM
 

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_weighted,test_recall_weighted,test_f1_weighted,test_roc_auc,model
0,0.028522,0.00658,0.78,0.796183,0.78,0.770642,0.850265,LogReg
1,0.015433,0.004219,0.83,0.831533,0.83,0.830678,0.879136,LogReg
2,0.014549,0.003617,0.82,0.831459,0.82,0.82275,0.867253,LogReg
3,0.011983,0.003132,0.818182,0.82246,0.818182,0.814574,0.860485,LogReg
4,0.011785,0.003122,0.777778,0.780207,0.777778,0.778672,0.817735,LogReg
5,0.074942,0.012431,0.82,0.824571,0.82,0.816671,0.900857,RF
6,0.077353,0.013072,0.79,0.78576,0.79,0.787102,0.84352,RF
7,0.07781,0.013027,0.74,0.772662,0.74,0.745804,0.846593,RF
8,0.075337,0.014307,0.848485,0.848154,0.848485,0.848231,0.893066,RF
9,0.080112,0.013403,0.787879,0.788945,0.787879,0.78833,0.837179,RF


In [23]:
final

NameError: name 'final' is not defined

In [22]:
bootstraps = []
for model in list(set(final.model.values)):
    model_df = final.loc[final.model == model]
    bootstrap = model_df.sample(n=30, replace=True)
    bootstraps.append(bootstrap)
        
bootstrap_df = pd.concat(bootstraps, ignore_index=True)
results_long = pd.melt(bootstrap_df,id_vars=['model'],var_name='metrics', value_name='values')
time_metrics = ['fit_time','score_time'] # fit time metrics
## PERFORMANCE METRICS
results_long_nofit = results_long.loc[~results_long['metrics'].isin(time_metrics)] # get df without fit data
results_long_nofit = results_long_nofit.sort_values(by='values')
## TIME METRICS
results_long_fit = results_long.loc[results_long['metrics'].isin(time_metrics)] # df with fit data
results_long_fit = results_long_fit.sort_values(by='values')

NameError: name 'final' is not defined