# Modelisation

## Importation des librairies et dataset

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
with open("datasets/dataset_withoutBadValues.pkl", "rb") as file:
    data = pickle.load(file)
file.close()

data.rename(columns={'Zip_2': 'Zip2', 'NAICS_2': 'NAICS2'}, inplace=True)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 693993 entries, 0 to 899161
Data columns (total 25 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Name           693993 non-null  object        
 1   City           693993 non-null  object        
 2   State          693993 non-null  category      
 3   Zip            693993 non-null  object        
 4   Bank           693993 non-null  object        
 5   BankState      693993 non-null  category      
 6   NAICS          693993 non-null  object        
 7   ApprovalDate   693993 non-null  datetime64[ns]
 8   ApprovalFY     693993 non-null  category      
 9   Term           693993 non-null  int64         
 10  NoEmp          693993 non-null  int64         
 11  NewExist       693993 non-null  category      
 12  CreateJob      693993 non-null  int64         
 13  RetainedJob    693993 non-null  int64         
 14  FranchiseCode  693993 non-null  object        
 15  Urban

In [4]:
data["MIS_Status"].value_counts(normalize=True)

MIS_Status
1    0.797273
0    0.202727
Name: proportion, dtype: float64

## Création des sets

In [5]:
data = data.drop("ApprovalDate", axis=1)
data = data.drop("ApprovalFY", axis=1)

In [6]:
X = data.drop("MIS_Status", axis=1)
y = data.MIS_Status

In [7]:
# Récupère le nom des colonnes par types
object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns)#.drop(["ApprovalFY"]))
num_cols = list(X.select_dtypes(include=["int64"]).columns)
#year_col = ["ApprovalFY"]

# Ordinal encoding for ApprovalFY
#unique_years = sorted(data["ApprovalFY"].unique())

# Frequency encoding for object columns
for col in object_cols:
    frequency_encoding = X[col].value_counts(normalize=True)
    X[col] = X[col].map(frequency_encoding)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, stratify=y, random_state=42)

### Oversampling

In [138]:
sample_strat = 0.5
oversample = RandomOverSampler(sampling_strategy=sample_strat)

X_over, y_over = oversample.fit_resample(X_train, y_train)

y_over.value_counts(normalize=True)

MIS_Status
1    0.666667
0    0.333333
Name: proportion, dtype: float64

## Preprocessing

### Column Transformer

In [9]:
preprocessing = ColumnTransformer([
        ("onehot", OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"), cat_cols),
        ("scaler", StandardScaler(), num_cols),
        #("ordinal", OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=-1), year_col),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

### Imbalanced computation

In [83]:
# classes = y_train.unique()
# class_weights = compute_class_weight("balanced", classes=classes, y=y_train)

# sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)

In [100]:
catb = CatBoostClassifier(one_hot_max_size=10, verbose=0, random_state=42)

pipeline_cb = make_pipeline(preprocessing, catb)

pipeline_cb.fit(X_over, y_over)

In [101]:
pred_test_cb = pipeline_cb.predict(X_test)

print(classification_report(y_test, pred_test_cb, digits=3))

              precision    recall  f1-score   support

           0      0.873     0.921     0.896      7035
           1      0.980     0.966     0.973     27665

    accuracy                          0.957     34700
   macro avg      0.926     0.943     0.934     34700
weighted avg      0.958     0.957     0.957     34700



## Pipeline

In [10]:
catb = CatBoostClassifier(one_hot_max_size=10, verbose=0, random_state=42) # class_weights={k: class_weights[k] for k in classes}

pipeline_cb = make_pipeline(preprocessing, catb)

pipeline_cb.fit(X_train, y_train)

In [11]:
pred_test_cb = pipeline_cb.predict(X_test)

print(classification_report(y_test, pred_test_cb, digits=3))

              precision    recall  f1-score   support

           0      0.901     0.878     0.889      7035
           1      0.969     0.975     0.972     27665

    accuracy                          0.956     34700
   macro avg      0.935     0.927     0.931     34700
weighted avg      0.955     0.956     0.956     34700



## Feature importance

In [142]:
print(pipeline_cb.steps)

[('columntransformer', ColumnTransformer(remainder='passthrough',
                  transformers=[('onehot',
                                 OneHotEncoder(drop='if_binary',
                                               handle_unknown='ignore',
                                               sparse_output=False),
                                 ['State', 'BankState', 'NewExist',
                                  'UrbanRural', 'RevLineCr', 'LowDoc', 'Zip2',
                                  'NAICS2', 'Franchised', 'RealEstate']),
                                ('scaler', StandardScaler(),
                                 ['Term', 'NoEmp', 'CreateJob', 'RetainedJob',
                                  'GrAppv', 'SBA_Appv'])],
                  verbose_feature_names_out=False)), ('catboostclassifier', <catboost.core.CatBoostClassifier object at 0x7fbd494cbd30>)]


In [143]:
preprocessed_features = pipeline_cb.named_steps['columntransformer'].get_feature_names_out()
cb_model = pipeline_cb.named_steps['catboostclassifier']

aggregated_scores = {}

for i, feature_name in enumerate(preprocessed_features):
    if '_' in feature_name:
        original_feature = feature_name.split('_')[0]
        # Add importance score to the aggregated score for the original feature
        aggregated_scores[original_feature] = aggregated_scores.get(original_feature, 0) + cb_model.feature_importances_[i]
    else:
        # Add importance score directly for non-one-hot encoded features
        aggregated_scores[feature_name] = cb_model.feature_importances_[i]

aggregated_scores_df = pd.DataFrame(aggregated_scores.items(), columns=['features', 'score'])

aggregated_scores_df.sort_values(by=['score'], ascending=False, inplace=True)

display(aggregated_scores_df)

# feat_imp = pd.DataFrame({'features': preprocessed_features, 'score': cb_model.feature_importances_})
# feat_imp.sort_values(by=['score'], ascending=False, inplace=True)
# display(feat_imp.head(20))

Unnamed: 0,features,score
10,Term,54.477971
21,ApprovalFY,8.486229
19,Bank,8.277836
1,BankState,7.145043
4,RevLineCr,6.009952
15,SBA,2.931803
9,RealEstate,2.319449
14,GrAppv,1.846362
0,State,1.701769
7,NAICS2,1.256244


In [102]:
display(preprocessed_features)

array(['State_AK', 'State_AL', 'State_AR', 'State_AZ', 'State_CA',
       'State_CO', 'State_CT', 'State_DC', 'State_DE', 'State_FL',
       'State_GA', 'State_HI', 'State_IA', 'State_ID', 'State_IL',
       'State_IN', 'State_KS', 'State_KY', 'State_LA', 'State_MA',
       'State_MD', 'State_ME', 'State_MI', 'State_MN', 'State_MO',
       'State_MS', 'State_MT', 'State_NC', 'State_ND', 'State_NE',
       'State_NH', 'State_NJ', 'State_NM', 'State_NV', 'State_NY',
       'State_OH', 'State_OK', 'State_OR', 'State_PA', 'State_RI',
       'State_SC', 'State_SD', 'State_TN', 'State_TX', 'State_UT',
       'State_VA', 'State_VT', 'State_WA', 'State_WI', 'State_WV',
       'State_WY', 'BankState_AK', 'BankState_AL', 'BankState_AN',
       'BankState_AR', 'BankState_AZ', 'BankState_CA', 'BankState_CO',
       'BankState_CT', 'BankState_DC', 'BankState_DE', 'BankState_FL',
       'BankState_GA', 'BankState_GU', 'BankState_HI', 'BankState_IA',
       'BankState_ID', 'BankState_IL', 'BankState_