# Ejercicio: Clasificación de Quiebras.

In [None]:
pip install imbalanced-learn

In [None]:
import pandas as pd

- Para este ejercicio usaremos la siguiente base de datos: https://www.kaggle.com/datasets/fedesoriano/company-bankruptcy-prediction?resource=download

In [None]:
# Load the dataset from the local CSV file (path is relative to the notebook).
df = pd.read_csv('data/data.csv')
# Preview only the first rows instead of rendering the whole frame.
df.head()

Realiza los siguientes apartados:
- Explora la base de datos: ¿está balanceada?, ¿las features están correlacionadas?, etc.
- Divide la base de datos en test y train utilizando la función train_test_split de sklearn, dejando un 20% de las muestras para test. Esta división tiene que ser la misma para los siguientes ejercicios que realizaremos durante la clase.
- Normaliza los datos usando StandardScaler
- Entrena un modelo GradientBoostingClassifier de sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
- Valida el resultado en el conjunto de test en términos de accuracy (acc), matriz de confusión, curva ROC, etc. ¿Qué observas?
- Rebalancea las muestras de entrenamiento usando la técnica SMOTE. Para ello usa el siguiente código y repite los apartados anteriores. Necesitas instalar el paquete con `pip install imbalanced-learn` (se importa como `imblearn`) y reiniciar el kernel. Puedes encontrar más información sobre el método en: https://imbalanced-learn.org/stable/over_sampling.html#smote-adasyn
```python
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
x_train, y_train = oversample.fit_resample(x_train, y_train)
```


In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes) as a first
# look at the data before modelling.
df.info()

In [None]:
# Target vector: the first column of df, selected by position.
# NOTE(review): presumably the "Bankrupt?" label (per the plot titles used
# later in this notebook) — confirm the column order of the CSV.
y = df.iloc[:, 0]

In [None]:
# Feature matrix: every column of df after the first one, selected by position.
x = df.iloc[:, 1:]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Count the samples in each target class to check whether the classes are balanced.
y.value_counts()

In [None]:
# Bar chart of the number of samples per target class.
# The series is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so `sns.countplot(y)` no longer means "x=y".
sns.countplot(x=y)
plt.title('Target feature - Bankrupt?');  # trailing ';' hides the Text repr

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# classes turn out to be heavily imbalanced (this would change the split).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Report the shape of each split, in the order x_train, y_train, x_test, y_test.
print(*(split.shape for split in (x_train, y_train, x_test, y_test)))

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test information leaks into it.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with 100 depth-1 trees, trained on the scaled training
# split; the fixed random_state makes the fit repeatable.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf = clf.fit(x_train, y_train)

In [None]:
# Evaluate the fitted classifier on the held-out test split
# (for scikit-learn classifiers, score() reports mean accuracy).
clf.score(x_test, y_test)

In [None]:
# Predicted class labels for the test split.
y_pred = clf.predict(x_test)

In [None]:
# Display the predicted labels for a quick visual check.
y_pred

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc

In [None]:
# Confusion matrix (raw counts) of true test labels vs. predicted labels.
cm = confusion_matrix(y_test,y_pred)

In [None]:
# Row-normalize: divide every row by its own total so each row sums to 1.
cm = cm / cm.sum(axis=1, keepdims=True)

In [None]:
# Heatmap of the matrix held in cm, with each cell annotated with its value.
sns.heatmap(cm, annot=True)

In [None]:
# Text report of per-class classification metrics on the test split.
print(classification_report(y_test,y_pred))

In [None]:
# Score the test split and build the ROC curve from column 1 of the
# predicted probabilities (the second class in clf.classes_).
y_pred_prob = clf.predict_proba(x_test)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_pred_prob[:, 1])
auc_value = auc(test_fpr, test_tpr)

# Draw on an explicit Axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(test_fpr, test_tpr, label=f" AUC TEST = {auc_value}")
ax.plot([0, 1], [0, 1], 'g--')  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("AUC(ROC curve)")
ax.legend()
ax.grid(color='black', linestyle='-', linewidth=0.5)
plt.show()


In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the training split only (the test split is left untouched).
# SMOTE draws synthetic samples at random, so a fixed random_state is needed
# for the resampled data — and every metric computed afterwards — to be
# reproducible across runs.
oversample = SMOTE(random_state=0)
x_train, y_train = oversample.fit_resample(x_train, y_train)

In [None]:
# Bar chart of the number of training samples per class after resampling.
# The labels are passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so `sns.countplot(y_train)` no longer means
# "x=y_train".
sns.countplot(x=y_train)
plt.title('Target feature - Bankrupt?');  # trailing ';' hides the Text repr

In [None]:
# Retrain the same gradient-boosting configuration on the current
# (resampled) training data; clf now refers to this new model.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf = clf.fit(x_train, y_train)

In [None]:
# Evaluate the retrained classifier on the same held-out test split
# (for scikit-learn classifiers, score() reports mean accuracy).
clf.score(x_test, y_test)

In [None]:
# Predicted class labels for the test split from the retrained model
# (overwrites the earlier y_pred).
y_pred = clf.predict(x_test)

In [None]:
# Display the predicted labels for a quick visual check.
y_pred

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Text report of per-class classification metrics for the retrained model.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix (raw counts) of true test labels vs. the current predictions.
cm = confusion_matrix(y_test,y_pred)

In [None]:
# Row totals of the count matrix; these are the divisors used when the
# matrix is row-normalized in the following cell.
cm.sum(axis=1)

In [None]:
# Row-normalize: divide every row by its own total so each row sums to 1.
cm = cm / cm.sum(axis=1, keepdims=True)

In [None]:
# Heatmap of the matrix held in cm, with each cell annotated with its value.
sns.heatmap(cm, annot=True)

In [None]:
# Recompute the probabilities with the current (retrained) classifier before
# displaying them; otherwise this cell would show the stale values produced
# by the model that was fitted before resampling.
y_pred_prob = clf.predict_proba(x_test)
y_pred_prob

In [None]:
# Score the test split with the current classifier and build the ROC curve
# from column 1 of the predicted probabilities (the second class in
# clf.classes_).
y_pred_prob = clf.predict_proba(x_test)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_pred_prob[:, 1])
auc_value = auc(test_fpr, test_tpr)

# Draw on an explicit Axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(test_fpr, test_tpr, label=f" AUC TEST = {auc_value}")
ax.plot([0, 1], [0, 1], 'g--')  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("AUC(ROC curve)")
ax.legend()
ax.grid(color='black', linestyle='-', linewidth=0.5)
plt.show()