In [1]:
import os
import warnings

import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Silence all warnings for the whole notebook. Note that this also hides
# deprecation and pandas copy warnings that may point at real problems.
warnings.filterwarnings("ignore")

# All later paths ("./data/...", "./clase4/...") are relative to the parent of
# the directory the kernel starts in. Change directory only once per kernel
# session: without the guard, re-running this cell would climb one more level
# each time and silently break every relative path below.
if "PROJECT_ROOT" not in globals():
    os.chdir('../')
    PROJECT_ROOT = os.getcwd()


In [54]:
import pickle
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterSampler, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, recall_score, precision_score
import matplotlib.pyplot as plt


In [3]:
# Search space for the random hyper-parameter search.
# Each entry pairs an estimator class ("learner") with a grid of candidate
# values ("params"); one value per key is sampled for every candidate model.
learners = [
    {
        "learner": XGBClassifier,
        "params": {
            "learning_rate": [0.1],
            "colsample_bytree": [0.15, 0.25, 0.5, 0.75, 1],
            "colsample_bylevel": [1],
            "max_depth": [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],
            "gamma": [0, 0.01, 0.05, 0.1, 0.25, 0.5],
            "subsample": [0.5, 0.75, 1],
            "min_child_weight": [1, 5, 10, 15, 25, 50, 100],
            "base_score": [0.52],
            "n_estimators": [50, 100, 200, 300, 500, 1000],
        },
    },
    {
        "learner": RandomForestClassifier,
        "params": {
            "max_depth": [3, 4, 5, 10, 25, None],
            # "sqrt" is what "auto" meant for classifiers; "auto" itself is
            # rejected by recent scikit-learn releases.
            # 1.0 (float) means "all features"; the integer 1 would mean
            # "exactly one feature", which does not fit a list of fractions.
            "max_features": ["sqrt", 0.1, 0.25, 0.5, 0.75, 1.0],
            "min_weight_fraction_leaf": [0, 0.01, 0.001],
            "bootstrap": [True, False],
            "n_jobs": [-1],
            "n_estimators": [50, 100, 200, 300, 500, 1000],
        },
    },
    {
        "learner": ExtraTreesClassifier,
        "params": {
            "max_depth": [None],
            # See RandomForestClassifier above: "sqrt" replaces "auto".
            "max_features": ["sqrt", 0.1],
            "min_weight_fraction_leaf": [0, 0.01, 0.001],
            "bootstrap": [True, False],
            "n_jobs": [-1],
            "n_estimators": [50, 100, 200, 300, 500, 1000],
        },
    },
    {
        "learner": LogisticRegression,
        "params": {
            "penalty": ["l2"],
            # Log-spaced grid. The original listed 100 twice, which doubled its
            # sampling weight; presumably a typo for 1000 — TODO confirm.
            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        },
    },
]


In [4]:
# Load the preprocessed Titanic training set; the path is resolved against the
# current working directory.
train_csv_path = "./data/titanic/02 - preprocesada/train.csv"
train = pd.read_csv(train_csv_path)

In [27]:
# Hold out 10% of the rows as a validation set; the remaining 90% (X, y) is
# what the cross-validated search below trains on.
features = train.drop(columns=["Survived"])
target = train.Survived
X, X_validation, y, y_validation = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=42,
)

In [28]:
# Five stratified folds; shuffling is left off, so no random state is needed.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=False, random_state=None)

In [93]:
# --- Random hyper-parameter search with out-of-fold (OOF) predictions --------
# For every sampled candidate: fit one model per CV fold, pickle each fold
# model, store the OOF probabilities of the positive class and log
# recall / precision / AUC / accuracy in `results` (one row per candidate).
RESULTS_DIR = "./clase4/results"
RESULTS_PATH = "{}/results.xlsx".format(RESULTS_DIR)
RESULT_COLUMNS = ["parameters", "recall", "precision", "areaUnderCurve", "accuracy"]

os.makedirs(RESULTS_DIR, exist_ok=True)

if os.path.exists(RESULTS_PATH):
    # index_col=0 restores the "<Learner>_<n>" run names as the index.
    # The parameters stay in their str() form: the previous eval() executed
    # arbitrary spreadsheet text and left the column as a dict/str mix that
    # could neither be compared reliably nor written back to Excel.
    results = pd.read_excel(RESULTS_PATH, index_col=0)
else:
    results = pd.DataFrame(columns=RESULT_COLUMNS)

# (learner name, str(params)) pairs that were already evaluated.
tried = {
    (str(run_name).rsplit("_", 1)[0], str(used))
    for run_name, used in results["parameters"].items()
}

# Continue numbering after every run already logged or present on disk, so a
# re-execution never overwrites the folder of an earlier candidate.
used_numbers = []
for run_name in list(results.index) + os.listdir(RESULTS_DIR):
    suffix = str(run_name).rsplit("_", 1)[-1]
    if suffix.isdigit():
        used_numbers.append(int(suffix))
first_run = max(used_numbers, default=-1) + 1

new_rows = []
for x in range(0, 100):
    candidate = np.random.choice(learners)
    learnerName = candidate["learner"].__name__
    params = list(ParameterSampler(candidate["params"], 1))[0]

    # Skip configurations that were evaluated before (in this or a prior run).
    signature = (learnerName, str(params))
    if signature in tried:
        continue
    tried.add(signature)

    indexName = "{}_{}".format(learnerName, str(first_run + x))
    run_dir = "{}/{}".format(RESULTS_DIR, indexName)
    os.makedirs(run_dir, exist_ok=True)

    fold_frames = []
    for z, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        learner = candidate["learner"](**params)
        learner.fit(X_train, y_train)

        # Keep the row labels of X so the folds can be re-ordered afterwards.
        aux = X_test[["PassengerId"]].copy()
        aux["probabilities"] = learner.predict_proba(X_test)[:, 1]
        fold_frames.append(aux)

        with open("{}/{}_{}.sav".format(run_dir, indexName, str(z)), 'wb') as model_file:
            pickle.dump(learner, model_file)

    # The folds come back in fold order, which is not the row order of X / y.
    # Re-order to X's rows so the metrics below (and anything that later pairs
    # the stored probabilities with y) compare each prediction with its own label.
    probs = pd.concat(fold_frames).loc[X.index]
    probs.to_csv("{}/probabilities.txt".format(run_dir), index=False)

    predicted_positive = probs.probabilities.values > 0.5
    areaUnderCurve = roc_auc_score(y.values, probs.probabilities.values)
    accuracy = accuracy_score(y.values, predicted_positive)
    recall = recall_score(y.values, predicted_positive)
    precision = precision_score(y.values, predicted_positive)
    row = [str(params), recall, precision, areaUnderCurve, accuracy]
    new_rows.append(pd.DataFrame([row], columns=RESULT_COLUMNS, index=[indexName]))

# Build the final table with a single concat instead of growing it row by row.
frames = ([results] if len(results) else []) + new_rows
if frames:
    results = pd.concat(frames)

# Writing .xlsx needs the optional `openpyxl` package to be installed.
results.to_excel(RESULTS_PATH)




ModuleNotFoundError: No module named 'openpyxl'

In [119]:
# Score the hold-out split with every stored candidate: load the per-fold
# models of each run, average their positive-class probabilities and collect
# one "probabilities_<run>" column per run in X_validationNew.
#
# Only folders that contain "probabilities.txt" are used. This skips
# "results.xlsx" and any other stray entry without hard-coding names.
models = [
    name for name in os.listdir("./clase4/results/")
    if os.path.exists("./clase4/results/{}/probabilities.txt".format(name))
]

X_validationNew = X_validation[["PassengerId"]]
for model in models:
    # Load fold models 0, 1, 2, ... until the next file is missing, instead of
    # assuming there are exactly five of them.
    fold_scores = []
    fold = 0
    while True:
        model_path = "./clase4/results/{}/{}_{}.sav".format(model, model, fold)
        if not os.path.exists(model_path):
            break
        # NOTE(review): pickle.load runs code stored in the file; only load
        # model files produced by this project.
        with open(model_path, 'rb') as model_file:
            fold_model = pickle.load(model_file)
        fold_scores.append(fold_model.predict_proba(X_validation)[:, 1])
        fold += 1

    if not fold_scores:
        # Nothing to average for this run.
        continue

    aux = X_validation[["PassengerId"]].copy()
    aux["probabilities_{}".format(model)] = np.mean(fold_scores, axis=0)
    X_validationNew = pd.merge(X_validationNew, aux[["PassengerId", "probabilities_{}".format(model)]],
                               how="inner", on="PassengerId")

    

In [108]:
# Assemble the stacking training set: one "probabilities_<run>" column per
# stored candidate, joined on PassengerId.
#
# The listing is filtered BEFORE the first element is used. Previously
# models[0] could be "results.xlsx" (os.listdir gives no ordering guarantee),
# which made the first read_csv fail.
models = [
    name for name in os.listdir("./clase4/results/")
    if os.path.exists("./clase4/results/{}/probabilities.txt".format(name))
]
if not models:
    raise FileNotFoundError("No probabilities.txt found under ./clase4/results/")

probabilities = None
for model in models:
    aux = pd.read_csv("./clase4/results/{}/probabilities.txt".format(model))
    aux = aux.rename(columns={"probabilities": "probabilities_{}".format(model)})
    if probabilities is None:
        probabilities = aux
    else:
        probabilities = pd.merge(probabilities, aux, how="inner", on="PassengerId")
    

In [112]:
# Fit the stacking meta-learner on the candidates' out-of-fold probabilities.
# The rows of `probabilities` follow the order of the stored probability
# files, which is not guaranteed to be the row order of `y`. The target is
# therefore looked up by PassengerId instead of being passed positionally.
# Assumes PassengerId is unique within X — TODO confirm.
target_by_passenger = pd.Series(y.values, index=X["PassengerId"].values)
y_meta = target_by_passenger.loc[probabilities["PassengerId"].values].values

learner = XGBClassifier()
learner.fit(probabilities.drop(columns=["PassengerId"]), y_meta)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,titulo_Master,titulo_Miss,titulo_Mr,titulo_Mrs,titulo_Officer,titulo_Royalty,decilesEdad_0,decilesEdad_1,decilesEdad_2,decilesEdad_3,decilesEdad_4,decilesEdad_5,decilesEdad_6,decilesEdad_7,decilesEdad_8,decilesEdad_9,labelEncoderEmbarked
709,710,25.0,1,1,15.2458,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
439,440,31.0,0,0,10.5000,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,2
840,841,20.0,0,0,7.9250,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,2
720,721,6.0,0,1,33.0000,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2
39,40,14.0,1,0,11.2417,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,494,71.0,0,0,49.5042,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
215,216,31.0,1,0,113.2750,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
309,310,30.0,0,0,56.9292,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
822,823,38.0,0,0,0.0000,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2
