In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import brier_score_loss
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

def load_data(csv_file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    csv_file_path
        Anything ``pandas.read_csv`` accepts as its first argument
        (a path or a file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed table, read with the ``read_csv`` defaults.
    """
    frame = pd.read_csv(csv_file_path)
    return frame

def drop_zero_columns(df):
    """Remove the numeric columns of ``df`` whose values are all zero.

    Only columns without any missing value are candidates: columns that
    contain a NaN are set aside before the zero check, so a column made of
    zeros and NaNs is kept.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    tuple
        ``(zero_columns, reduced)`` where ``zero_columns`` is the Index of
        the removed column labels and ``reduced`` is a new frame without
        those columns (``df`` itself is not modified).
    """
    complete_numeric = df.dropna(axis=1).select_dtypes(include=np.number)
    is_all_zero = complete_numeric.eq(0).all(axis=0)
    # Boolean self-indexing keeps only the labels flagged True.
    zero_columns = is_all_zero[is_all_zero].index
    return zero_columns, df.drop(columns=zero_columns)

def remove_rows_with_missing_values_in_categorical(df):
    """Drop every row that has a missing value in a categorical column.

    Columns of dtype ``object``, ``category`` or ``bool`` are treated as
    categorical. Missing values in the remaining columns do not cause a row
    to be dropped.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame with the offending rows removed; the original index
        labels of the surviving rows are preserved.
    """
    categorical_dtypes = ['object', 'category', 'bool']
    categorical_columns = df.select_dtypes(include=categorical_dtypes).columns
    cleaned = df.dropna(subset=categorical_columns)
    return cleaned

def split_set_into_train_test(df, target_variable, test_size = 0.2):
    """Split ``df`` into stratified train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the features and the target column.
    target_variable
        Label of the target column; it is removed from the feature frames.
    test_size
        Forwarded to ``train_test_split`` (default ``0.2``).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The split is stratified on
        the target and uses the fixed ``random_state=42``.
    """
    features = df.drop(columns=target_variable)
    target = df[target_variable]
    parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=42,
        stratify=target,
    )
    return tuple(parts)

def imputation_of_missing_values(X_train, X_test, neighbors = 5):
    """Scale/encode the features, then fill missing values with KNN imputation.

    Numeric columns go through ``StandardScaler`` and columns of dtype
    ``object``/``category``/``bool`` through ``OneHotEncoder``
    (``handle_unknown='ignore'``). The transformed matrix is then passed to a
    ``KNNImputer``. Both the preprocessor and the imputer are fitted on
    ``X_train`` only and merely applied to ``X_test``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; column dtypes of ``X_train`` decide which transformer
        each column receives.
    neighbors : int
        ``n_neighbors`` of the ``KNNImputer`` (default ``5``).

    Returns
    -------
    tuple
        ``(X_train_imputed, X_test_imputed, preprocessor, knn_imputer)`` —
        the two imputed matrices plus the fitted transformer objects so the
        same pipeline can be applied to new data.
    """
    numerical_variables = X_train.select_dtypes(include = np.number).columns
    categorical_variables = X_train.select_dtypes(include = ['object', 'category', 'bool']).columns

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', StandardScaler(), numerical_variables),
            ('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_variables),
        ],
        # The one-hot block is sparse by default; with the default threshold
        # the stacked result can come back as a sparse matrix, which
        # KNNImputer refuses. 0 forces a dense array in every case.
        sparse_threshold = 0,
    )
    knn_imputer = KNNImputer(n_neighbors = neighbors)

    # Fit on the training split only, then apply the fitted objects to test.
    X_train_imputed = knn_imputer.fit_transform(preprocessor.fit_transform(X_train))
    X_test_imputed = knn_imputer.transform(preprocessor.transform(X_test))

    return X_train_imputed, X_test_imputed, preprocessor, knn_imputer

def feature_selection(model, X_train, y_train, X_test, y_test, n_features_to_select=10):
    """Select features with recursive feature elimination (RFE).

    Before the elimination runs, ``model`` is updated in place through
    ``set_params``: its ``scale_pos_weight`` becomes
    ``(len(y_train) - sum(y_train)) / sum(y_train)``.
    NOTE(review): that ratio is the negative/positive count only if
    ``y_train`` is coded 0/1 — confirm against the data.

    Parameters
    ----------
    model
        Estimator exposing ``set_params`` with a ``scale_pos_weight``
        parameter; used as the RFE base estimator.
    X_train, y_train
        Data the selector is fitted on.
    X_test
        Data reduced with the fitted selector.
    y_test
        Accepted but not used by this function.
    n_features_to_select : int
        Number of features RFE keeps (default ``10``).

    Returns
    -------
    tuple
        ``(support, X_train_selected, X_test_selected)`` where ``support`` is
        the selector's ``support_`` mask.
    """
    label_sum = np.sum(y_train)
    remainder = y_train.shape[0] - label_sum
    model.set_params(scale_pos_weight=remainder / label_sum)

    selector = RFE(model, n_features_to_select=n_features_to_select)
    train_selected = selector.fit_transform(X_train, y_train)
    test_selected = selector.transform(X_test)

    return selector.support_, train_selected, test_selected

def ensemble(model1, model2, model3, model4, voting = 'soft', weights = [2,2,1,1]):
    """Build an unfitted ``VotingClassifier`` from four named estimators.

    Parameters
    ----------
    model1, model2, model3, model4
        Sequences whose first item is the estimator's name and whose second
        item is the estimator itself, e.g. ``['logreg', LogisticRegression()]``.
    voting
        Passed to ``VotingClassifier(voting=...)`` (default ``'soft'``).
    weights
        Per-estimator weights passed to ``VotingClassifier(weights=...)``, in
        the same order as the models (default ``[2, 2, 1, 1]``); ``None`` is
        forwarded unchanged.

    Returns
    -------
    VotingClassifier
        The configured, not yet fitted, ensemble.
    """
    estimators = [(member[0], member[1]) for member in (model1, model2, model3, model4)]
    # Copy the weights so the classifier never shares (and can never alter)
    # the default-argument list or the caller's list.
    weights_copy = None if weights is None else list(weights)

    # Honour the caller's arguments: these values used to be hard-coded,
    # which silently ignored both parameters.
    return VotingClassifier(estimators = estimators, voting = voting, weights = weights_copy)

def trained_model(X, y, model1, model2, model3, model4, voting = 'soft', weights = [2,2,1,1]):
    """Build the four-model voting ensemble and fit it on ``(X, y)``.

    Parameters
    ----------
    X, y
        Training features and targets handed to the ensemble's ``fit``.
    model1, model2, model3, model4
        ``[name, estimator]`` pairs, forwarded to ``ensemble``.
    voting, weights
        Forwarded to ``ensemble`` as given (defaults ``'soft'`` and
        ``[2, 2, 1, 1]``).

    Returns
    -------
    The fitted ensemble classifier.
    """
    # Forward the caller's settings; literal values were passed here before,
    # so the `voting` and `weights` arguments had no effect.
    ensemble_clf = ensemble(model1, model2, model3, model4, voting = voting, weights = weights)
    ensemble_clf.fit(X, y)

    return ensemble_clf

def brier_score(X, y, ensemble):
    """Brier score of a fitted classifier on ``(X, y)``.

    Parameters
    ----------
    X
        Features passed to ``ensemble.predict_proba``.
    y
        True labels.
    ensemble
        Fitted classifier exposing ``predict_proba``. Inside this function
        the name shadows the module-level ``ensemble`` builder.

    Returns
    -------
    The value of ``brier_score_loss`` computed from ``y`` and the second
    column (index 1) of the predicted probabilities.
    """
    second_column_proba = ensemble.predict_proba(X)[:, 1]
    return brier_score_loss(y, second_column_proba)

In [2]:
# Train the voting ensemble on the training CSV and score it on a held-out split.

# Load the raw training table.
df = load_data('/Users/danielmilanesperez/Documents/Projects/Baubap/training.csv')

# Preprocessing: drop all-zero columns, drop rows with missing categoricals,
# split, then scale/encode and KNN-impute. `columns`, `preprocesor` (spelling
# kept on purpose) and `knn_imputer` are reused when scoring new data.
columns, df = drop_zero_columns(df)
df = remove_rows_with_missing_values_in_categorical(df)
X_train, X_test, y_train, y_test = split_set_into_train_test(df, 'Target')
X_train, X_test, preprocesor, knn_imputer = imputation_of_missing_values(
    X_train,
    X_test,
    neighbors=5,
)

# Feature selection: RFE driven by an XGBoost classifier. scale_pos_weight
# starts at 1 and is overwritten inside feature_selection().
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=1,
)

support, X_train_selected, X_test_selected = feature_selection(
    model,
    X_train,
    y_train,
    X_test,
    y_test,
    n_features_to_select=25,
)

# Ensemble members, each as a [name, estimator] pair.
logreg = ['logreg', LogisticRegression(C=0.01, penalty='l2', random_state=42)]
nn = [
    'nn',
    MLPClassifier(hidden_layer_sizes=(2, 1), max_iter=1000, alpha=0.001, random_state=42),
]
xgb1 = [
    'xgb1',
    xgb.XGBClassifier(
        colsample_bytree=1.0,
        learning_rate=0.1,
        max_depth=3,
        n_estimators=50,
        subsample=0.9,
        random_state=42,
    ),
]
knn = ['knn', KNeighborsClassifier(n_neighbors=11, p=2, weights='distance')]

# Fit the soft-voting ensemble on the selected training features.
ensemble_clf = trained_model(
    X_train_selected,
    y_train,
    logreg,
    nn,
    xgb1,
    knn,
    voting='soft',
    weights=[2, 2, 1, 1],
)

# Brier score on the held-out split.
score = brier_score(X_test_selected, y_test, ensemble_clf)

print(score)


0.09248407868959824


In [10]:
# Score the evaluation file with the objects fitted in the training cell
# (`columns`, `preprocesor`, `knn_imputer`, `support`, `ensemble_clf`).

# Load the evaluation data.
validation = load_data('/Users/danielmilanesperez/Downloads/data_evaluation.csv')

# Preprocessing. DataFrame.drop returns a new frame, so its result has to be
# kept: the bare call used here before discarded it and was a no-op. A
# separate name leaves `validation` untouched, so the cell can be re-run.
validation_features = validation.drop(columns = columns)
X_preprocesed = preprocesor.transform(validation_features)
X = knn_imputer.transform(X_preprocesed)
# Keep only the columns flagged by the RFE support mask.
X_final = X[:, support]

# Column 1 of predict_proba is the probability of the second class
# (presumably Target == 1 — confirm against ensemble_clf.classes_).
X_pred = ensemble_clf.predict_proba(X_final)
print(X_pred[:, 1])

# Persist the predicted probabilities, one value per line.
np.savetxt('/Users/danielmilanesperez/Downloads/data_evaluation_how-u-doing.csv', X_pred[:,1], delimiter=',')

[0.0447652  0.04768859 0.05099318 0.12977546 0.10097523 0.03918252
 0.10673232 0.25217856 0.13848109 0.09456977 0.13841767 0.15728146
 0.06668847 0.07906028 0.12341708 0.04491834 0.12486109 0.07655304
 0.08007658 0.10564003 0.13405514 0.139841   0.19393361 0.08571837
 0.19400155 0.09373728 0.11977764 0.15766235 0.16346676 0.2111208
 0.04616805 0.25064857 0.1882136  0.103408   0.17584503 0.07039037
 0.13124767 0.24033026 0.18781955 0.25256605 0.20922111 0.15609227
 0.09272003 0.09982888 0.13035699 0.08749442 0.10921712 0.16813727
 0.14145855 0.23168571 0.16460631 0.15398552 0.13951911 0.076558
 0.07016939 0.08650674 0.11910687 0.16536631 0.10378177 0.10617403
 0.40628396 0.08801493 0.10094703 0.19316488 0.14757991 0.12855651
 0.0830367  0.08115062 0.09537716 0.06052994 0.09124032 0.11408032
 0.10126475 0.28150649 0.05715936 0.08246008 0.21086741 0.11561818
 0.0885278  0.09043764 0.28241146 0.20690271 0.24448686 0.09003643
 0.12885218 0.19256578 0.06128373 0.09446117 0.40815211 0.3300974