# Librerias

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import StandardScaler
from collections import Counter
import pandas as pd

# Ranking Original Data

In [4]:
# Load data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Split data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
X = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Create a list to store AUC values
auc_list = []

# Decision Tree
arbol = DecisionTreeClassifier(criterion='entropy', random_state=0)
arbol.fit(X_train, Y_train)
y_pred_test = arbol.predict(X_test)
auc_arbol = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Decision Tree", auc_arbol))

# Random Forest
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
y_pred_test = rfc.predict(X_test)
auc_rfc = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Random Forest", auc_rfc))

# KNN
clf_knn = Pipeline(steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier())])
clf_knn.fit(X_train, Y_train)
y_pred_test = clf_knn.predict(X_test)
auc_knn = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("KNN", auc_knn))

# Ridge Classifier
clf_ridge = RidgeClassifier().fit(X_train, Y_train)
y_pred_test = clf_ridge.predict(X_test)
auc_ridge = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ridge Classifier", auc_ridge))

# Gradient Boosting Classifier
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, Y_train)
y_pred_test = clf_gb.predict(X_test)
auc_gb = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Gradient Boosting", auc_gb))

# Bagging
bagging = BaggingClassifier(RandomForestClassifier())
bagging.fit(X_train, Y_train)
y_pred_test = bagging.predict(X_test)
auc_bagging = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Bagging", auc_bagging))

# MLP Classifier
clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf_mlp.fit(X_train, Y_train)
y_pred_test = clf_mlp.predict(X_test)
auc_mlp = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("MLP Classifier", auc_mlp))

# Ada Boost
clf_ada = AdaBoostClassifier(n_estimators=100, random_state=0)
clf_ada.fit(X_train, Y_train)
y_pred_test = clf_ada.predict(X_test)
auc_ada = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ada Boost", auc_ada))

# Stacking
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('gbc', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0))]
clf_stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf_stack.fit(X_train, Y_train)
y_pred_test = clf_stack.predict(X_test)
auc_stack = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Stacking", auc_stack))

# Sort and print the AUC values

print("\n\n\n")
sorted_auc_list = sorted(auc_list, key=lambda x: x[1], reverse=True)
for model, auc_value in sorted_auc_list:
    print(f"{model} AUC: {auc_value}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)






Gradient Boosting AUC: 0.589476426152644
Random Forest AUC: 0.58661109663975
Ada Boost AUC: 0.58661109663975
Bagging AUC: 0.585178431883303
KNN AUC: 0.5454545454545454
Ridge Classifier AUC: 0.5454545454545454
Stacking AUC: 0.5454545454545454
MLP Classifier AUC: 0.5440218806980984
Decision Tree AUC: 0.5282625683771816


# UnderSample

In [6]:
# Load data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Split data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)

# Undersample the majority class
rus = RandomUnderSampler(random_state=0)
X_resampled, Y_resampled = rus.fit_resample(X, Y)

X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=0)
X = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Create a list to store AUC values
auc_list = []

# Decision Tree
arbol = DecisionTreeClassifier(criterion='entropy', random_state=0)
arbol.fit(X_train, Y_train)
y_pred_test = arbol.predict(X_test)
auc_arbol = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Decision Tree", auc_arbol))

# Random Forest
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
y_pred_test = rfc.predict(X_test)
auc_rfc = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Random Forest", auc_rfc))

# KNN
clf_knn = Pipeline(steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier())])
clf_knn.fit(X_train, Y_train)
y_pred_test = clf_knn.predict(X_test)
auc_knn = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("KNN", auc_knn))

# Ridge Classifier
clf_ridge = RidgeClassifier().fit(X_train, Y_train)
y_pred_test = clf_ridge.predict(X_test)
auc_ridge = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ridge Classifier", auc_ridge))

# Gradient Boosting Classifier
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, Y_train)
y_pred_test = clf_gb.predict(X_test)
auc_gb = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Gradient Boosting", auc_gb))

# Bagging
bagging = BaggingClassifier(RandomForestClassifier())
bagging.fit(X_train, Y_train)
y_pred_test = bagging.predict(X_test)
auc_bagging = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Bagging", auc_bagging))

# MLP Classifier
clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf_mlp.fit(X_train, Y_train)
y_pred_test = clf_mlp.predict(X_test)
auc_mlp = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("MLP Classifier", auc_mlp))

# Ada Boost
clf_ada = AdaBoostClassifier(n_estimators=100, random_state=0)
clf_ada.fit(X_train, Y_train)
y_pred_test = clf_ada.predict(X_test)
auc_ada = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ada Boost", auc_ada))

# Stacking
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('gbc', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0))]
clf_stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf_stack.fit(X_train, Y_train)
y_pred_test = clf_stack.predict(X_test)
auc_stack = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Stacking", auc_stack))

# Sort and print the AUC values

print("\n\n\n")
sorted_auc_list = sorted(auc_list, key=lambda x: x[1], reverse=True)
for model, auc_value in sorted_auc_list:
    print(f"{model} AUC: {auc_value}")


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)






Random Forest AUC: 0.7500000000000001
KNN AUC: 0.7142857142857143
Bagging AUC: 0.7142857142857143
MLP Classifier AUC: 0.6785714285714286
Ridge Classifier AUC: 0.6785714285714285
Stacking AUC: 0.6785714285714285
Decision Tree AUC: 0.6428571428571428
Gradient Boosting AUC: 0.6428571428571428
Ada Boost AUC: 0.6428571428571428


# OverSample

In [8]:
# Load data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Split data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)

# Oversample the minority class
ros = RandomOverSampler(random_state=0)
X_resampled, Y_resampled = ros.fit_resample(X, Y)

X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=0)
X = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Create a list to store AUC values
auc_list = []

# Decision Tree
arbol = DecisionTreeClassifier(criterion='entropy', random_state=0)
arbol.fit(X_train, Y_train)
y_pred_test = arbol.predict(X_test)
auc_arbol = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Decision Tree", auc_arbol))

# Random Forest
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
y_pred_test = rfc.predict(X_test)
auc_rfc = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Random Forest", auc_rfc))

# KNN
clf_knn = Pipeline(steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier())])
clf_knn.fit(X_train, Y_train)
y_pred_test = clf_knn.predict(X_test)
auc_knn = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("KNN", auc_knn))

# Ridge Classifier
clf_ridge = RidgeClassifier().fit(X_train, Y_train)
y_pred_test = clf_ridge.predict(X_test)
auc_ridge = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ridge Classifier", auc_ridge))

# Gradient Boosting Classifier
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, Y_train)
y_pred_test = clf_gb.predict(X_test)
auc_gb = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Gradient Boosting", auc_gb))

# Bagging
bagging = BaggingClassifier(RandomForestClassifier())
bagging.fit(X_train, Y_train)
y_pred_test = bagging.predict(X_test)
auc_bagging = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Bagging", auc_bagging))

# MLP Classifier
clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf_mlp.fit(X_train, Y_train)
y_pred_test = clf_mlp.predict(X_test)
auc_mlp = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("MLP Classifier", auc_mlp))

# Ada Boost
clf_ada = AdaBoostClassifier(n_estimators=100, random_state=0)
clf_ada.fit(X_train, Y_train)
y_pred_test = clf_ada.predict(X_test)
auc_ada = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ada Boost", auc_ada))

# Stacking
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('gbc', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0))]
clf_stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf_stack.fit(X_train, Y_train)
y_pred_test = clf_stack.predict(X_test)
auc_stack = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Stacking", auc_stack))

# Sort and print the AUC values

print("\n\n\n")
sorted_auc_list = sorted(auc_list, key=lambda x: x[1], reverse=True)
for model, auc_value in sorted_auc_list:
    print(f"{model} AUC: {auc_value}")






Random Forest AUC: 0.9958563535911602
Stacking AUC: 0.9958563535911602
Bagging AUC: 0.9861878453038674
Decision Tree AUC: 0.9792817679558011
Gradient Boosting AUC: 0.9765193370165746
Ada Boost AUC: 0.9751381215469613
KNN AUC: 0.9640883977900552
Ridge Classifier AUC: 0.7688180143981249
MLP Classifier AUC: 0.7522769127741503


# SMOTE

In [11]:
# Load data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Split data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)

# Apply SMOTE
smote = SMOTE(random_state=0)
X_resampled, Y_resampled = smote.fit_resample(X, Y)

X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=0)
X = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Create a list to store AUC values
auc_list = []

# Decision Tree
arbol = DecisionTreeClassifier(criterion='entropy', random_state=0)
arbol.fit(X_train, Y_train)
y_pred_test = arbol.predict(X_test)
auc_arbol = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Decision Tree", auc_arbol))

# Random Forest
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
y_pred_test = rfc.predict(X_test)
auc_rfc = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Random Forest", auc_rfc))

# KNN
clf_knn = Pipeline(steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier())])
clf_knn.fit(X_train, Y_train)
y_pred_test = clf_knn.predict(X_test)
auc_knn = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("KNN", auc_knn))

# Ridge Classifier
clf_ridge = RidgeClassifier().fit(X_train, Y_train)
y_pred_test = clf_ridge.predict(X_test)
auc_ridge = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ridge Classifier", auc_ridge))

# Gradient Boosting Classifier
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, Y_train)
y_pred_test = clf_gb.predict(X_test)
auc_gb = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Gradient Boosting", auc_gb))

# Bagging
bagging = BaggingClassifier(RandomForestClassifier())
bagging.fit(X_train, Y_train)
y_pred_test = bagging.predict(X_test)
auc_bagging = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Bagging", auc_bagging))

# MLP Classifier
clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf_mlp.fit(X_train, Y_train)
y_pred_test = clf_mlp.predict(X_test)
auc_mlp = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("MLP Classifier", auc_mlp))

# Ada Boost
clf_ada = AdaBoostClassifier(n_estimators=100, random_state=0)
clf_ada.fit(X_train, Y_train)
y_pred_test = clf_ada.predict(X_test)
auc_ada = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ada Boost", auc_ada))

# Stacking
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('gbc', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0))]
clf_stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf_stack.fit(X_train, Y_train)
y_pred_test = clf_stack.predict(X_test)
auc_stack = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Stacking", auc_stack))

# Sort and print the AUC values

print("\n\n\n")
sorted_auc_list = sorted(auc_list, key=lambda x: x[1], reverse=True)
for model, auc_value in sorted_auc_list:
    print(f"{model} AUC: {auc_value}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)






Random Forest AUC: 0.9704587309559685
Bagging AUC: 0.9606562866231375
Stacking AUC: 0.9574920475472961
Decision Tree AUC: 0.9276829064121882
KNN AUC: 0.9128494893688264
Gradient Boosting AUC: 0.9041101623974552
Ada Boost AUC: 0.8973798761091578
MLP Classifier AUC: 0.8341955466264858
Ridge Classifier AUC: 0.7725179976561192


# ADASYN

In [13]:
# Load data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Split data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)

# Apply ADASYN
adasyn = ADASYN(random_state=0)
X_resampled, Y_resampled = adasyn.fit_resample(X, Y)

X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=0)
X = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Create a list to store AUC values
auc_list = []

# Decision Tree
arbol = DecisionTreeClassifier(criterion='entropy', random_state=0)
arbol.fit(X_train, Y_train)
y_pred_test = arbol.predict(X_test)
auc_arbol = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Decision Tree", auc_arbol))

# Random Forest
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
y_pred_test = rfc.predict(X_test)
auc_rfc = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Random Forest", auc_rfc))

# KNN
clf_knn = Pipeline(steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier())])
clf_knn.fit(X_train, Y_train)
y_pred_test = clf_knn.predict(X_test)
auc_knn = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("KNN", auc_knn))

# Ridge Classifier
clf_ridge = RidgeClassifier().fit(X_train, Y_train)
y_pred_test = clf_ridge.predict(X_test)
auc_ridge = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ridge Classifier", auc_ridge))

# Gradient Boosting Classifier
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, Y_train)
y_pred_test = clf_gb.predict(X_test)
auc_gb = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Gradient Boosting", auc_gb))

# Bagging
bagging = BaggingClassifier(RandomForestClassifier())
bagging.fit(X_train, Y_train)
y_pred_test = bagging.predict(X_test)
auc_bagging = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Bagging", auc_bagging))

# MLP Classifier
clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf_mlp.fit(X_train, Y_train)
y_pred_test = clf_mlp.predict(X_test)
auc_mlp = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("MLP Classifier", auc_mlp))

# Ada Boost
clf_ada = AdaBoostClassifier(n_estimators=100, random_state=0)
clf_ada.fit(X_train, Y_train)
y_pred_test = clf_ada.predict(X_test)
auc_ada = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Ada Boost", auc_ada))

# Stacking
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('gbc', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0))]
clf_stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf_stack.fit(X_train, Y_train)
y_pred_test = clf_stack.predict(X_test)
auc_stack = roc_auc_score(Y_test, y_pred_test)
auc_list.append(("Stacking", auc_stack))

# Sort and print the AUC values

print("\n\n\n")
sorted_auc_list = sorted(auc_list, key=lambda x: x[1], reverse=True)
for model, auc_value in sorted_auc_list:
    print(f"{model} AUC: {auc_value}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)






Random Forest AUC: 0.9689034369885433
Stacking AUC: 0.9573299041384148
Bagging AUC: 0.9535012859480945
Decision Tree AUC: 0.9247135842880524
Gradient Boosting AUC: 0.9128477905073649
KNN AUC: 0.9021802665419687
Ada Boost AUC: 0.9019172317044657
MLP Classifier AUC: 0.8588087444470424
Ridge Classifier AUC: 0.7324058919803601
