# Modélisation

## Importation des bibliothèques et du jeu de données

In [13]:
import pandas as pd
import numpy as np
import pickle

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [14]:
# Load the pickled DataFrame from disk.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load files that come from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("datasets/dataset_withoutBadValues.pkl", "rb") as file:
    data = pickle.load(file)

In [15]:
# Sanity check: print the index, column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 693993 entries, 0 to 899161
Data columns (total 25 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Name           693993 non-null  object        
 1   City           693993 non-null  object        
 2   State          693993 non-null  category      
 3   Zip            693993 non-null  object        
 4   Bank           693993 non-null  object        
 5   BankState      693993 non-null  category      
 6   NAICS          693993 non-null  object        
 7   ApprovalDate   693993 non-null  datetime64[ns]
 8   ApprovalFY     693993 non-null  category      
 9   Term           693993 non-null  int64         
 10  NoEmp          693993 non-null  int64         
 11  NewExist       693993 non-null  category      
 12  CreateJob      693993 non-null  int64         
 13  RetainedJob    693993 non-null  int64         
 14  FranchiseCode  693993 non-null  object        
 15  Urban

## Création des sets

In [16]:
# Remove rows containing missing values, then discard the ApprovalDate column.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising a KeyError.
data = data.dropna().drop(columns="ApprovalDate", errors="ignore")

In [17]:
# Target vector: the MIS_Status column. Feature matrix: every other column.
y = data["MIS_Status"]
X = data.drop(columns=["MIS_Status"])

In [18]:
# Group the feature column names by dtype so each group can get its own encoder.
# ApprovalFY has the category dtype but is kept apart in its own group.
year_col = ["ApprovalFY"]
object_cols = X.select_dtypes(include="object").columns.tolist()
cat_cols = X.select_dtypes(include="category").columns.drop(year_col).tolist()
num_cols = X.select_dtypes(include="int64").columns.tolist()

# Sorted list of the distinct ApprovalFY values; it fixes the category order
# handed to the OrdinalEncoder.
unique_years = list(data["ApprovalFY"].unique())
unique_years.sort()

# Frequency encoding: replace every value of an object column with its
# relative frequency in that column.
# NOTE(review): the frequencies are computed on the whole of X, i.e. before the
# train/test split, so held-out rows contribute to the encoding (data leakage).
# Fitting the encoding on the training rows only would avoid this.
for col in object_cols:
    frequency_encoding = X[col].value_counts(normalize=True)
    X[col] = X[col].map(frequency_encoding)

In [19]:
# Hold out 5% of the rows as a test set. The split is shuffled, stratified on
# the target and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
    shuffle=True,
    stratify=y,
)

## Preprocessing

In [20]:
# Disabled draft of a scikit-learn style transformer that learns per-column
# value frequencies in fit() and maps them onto the data in transform().
# It is not active: the frequency encoding is currently applied directly to X
# before the train/test split, and the matching ColumnTransformer entry is
# commented out as well.
# class FrequencyTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, columns):
#         self.columns = columns
#         self.frequency_encodings = {}

#     def fit(self, X, y=None):
#         for col in self.columns:
#             frequency_encoding = X[col].value_counts(normalize=True)
#             self.frequency_encodings[col] = frequency_encoding
#         return self

#     def transform(self, X):
#         X_copy = X.copy()
#         for col in self.columns:
#             frequency_encoding = self.frequency_encodings[col]
#             X_copy[col] = X_copy[col].map(frequency_encoding)
#         return X_copy
    
#     def get_feature_names_out(self, input_features=None):
#         return list(self.columns)  # draft read self.column, which __init__ never sets

In [21]:
# Column-wise preprocessing, one (name, transformer, columns) step per group.

# Categorical columns: one-hot encoding; binary features keep a single column
# and categories unseen during fit are ignored.
onehot_step = (
    "onehot",
    OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"),
    cat_cols,
)

# Integer columns: standard scaling.
scaler_step = ("scaler", StandardScaler(), num_cols)

# ApprovalFY: ordinal encoding following the order of unique_years;
# values unseen during fit are encoded as -1.
ordinal_step = (
    "ordinal",
    OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=-1),
    year_col,
)

# Disabled step: ("frequency", FrequencyTransformer(columns=cat_cols), cat_cols)

# Columns not listed in any step are passed through unchanged, and output
# feature names are not prefixed with the step name.
preprocessing = ColumnTransformer(
    transformers=[onehot_step, scaler_step, ordinal_step],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

## Pipeline

In [22]:
# Seeded random forest: 100 trees, each limited to a depth of 10.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=10,
)

# make_pipeline names each step after its lowercased class name
# ('columntransformer', 'randomforestclassifier'); the feature-importance
# cell looks the steps up by those names.
pipeline_rf = make_pipeline(preprocessing, rf)

# Fit preprocessing and classifier together on the training rows.
pipeline_rf.fit(X_train, y_train)

In [23]:
# Predict on the held-out rows, then print per-class precision, recall and F1.
pred_test_rf = pipeline_rf.predict(X_test)

print(classification_report(y_true=y_test, y_pred=pred_test_rf))

              precision    recall  f1-score   support

           0       0.93      0.34      0.50      7035
           1       0.86      0.99      0.92     27665

    accuracy                           0.86     34700
   macro avg       0.89      0.67      0.71     34700
weighted avg       0.87      0.86      0.83     34700



## Feature importance

In [27]:
# Show the (name, estimator) pairs of the fitted pipeline; the step names are
# needed to look up each stage through named_steps.
print(pipeline_rf.steps)

[('columntransformer', ColumnTransformer(remainder='passthrough',
                  transformers=[('onehot',
                                 OneHotEncoder(drop='if_binary',
                                               handle_unknown='ignore',
                                               sparse_output=False),
                                 ['State', 'BankState', 'NewExist',
                                  'UrbanRural', 'RevLineCr', 'LowDoc', 'Zip_2',
                                  'NAICS_2', 'Franchised', 'RealEstate']),
                                ('scaler', StandardScaler(),
                                 ['Term', 'NoEmp', 'CreateJob', 'RetainedJob',
                                  'GrAppv', 'SBA_Appv']),
                                ('ordinal',
                                 OrdinalEncoder(categories=[['1984', '1987',
                                                             '1988', '1989',
                                                             '1990

In [32]:
# Fetch both fitted stages of the pipeline by their auto-generated step names.
rf_model = pipeline_rf.named_steps['randomforestclassifier']
preprocessed_features = pipeline_rf.named_steps['columntransformer'].get_feature_names_out()

# Pair every post-preprocessing feature name with the forest's importance
# score and rank the features from most to least important.
f = pd.DataFrame(
    {'features': preprocessed_features, 'score': rf_model.feature_importances_}
).sort_values(by='score', ascending=False)

# Display the 20 highest-scoring features.
display(f.head(20))

Unnamed: 0,features,score
242,Term,0.434602
248,ApprovalFY,0.084809
252,Bank,0.07112
247,SBA_Appv,0.05539
241,RealEstate_1,0.043342
254,FranchiseCode,0.042517
245,RetainedJob,0.039632
246,GrAppv,0.036341
107,UrbanRural_0,0.027174
108,UrbanRural_1,0.015956
