# Modelisation

## Importation des librairies et dataset

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

from catboost import CatBoostClassifier
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.selection import DropFeatures
from imblearn.over_sampling import RandomOverSampler

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data_lg = pd.read_pickle("datasets/dataset.pkl")
data_sm = pd.read_pickle("datasets/dataset_withoutBadValues.pkl")

In [3]:
data = data_sm
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 473218 entries, 0 to 899161
Data columns (total 25 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Name           473218 non-null  object        
 1   City           473218 non-null  object        
 2   State          473218 non-null  category      
 3   Zip            473218 non-null  object        
 4   Bank           473218 non-null  object        
 5   BankState      473218 non-null  category      
 6   NAICS          473218 non-null  object        
 7   ApprovalDate   473218 non-null  datetime64[ns]
 8   ApprovalFY     473218 non-null  category      
 9   Term           473218 non-null  int64         
 10  NoEmp          473218 non-null  int64         
 11  NewExist       473218 non-null  category      
 12  CreateJob      473218 non-null  int64         
 13  RetainedJob    473218 non-null  int64         
 14  FranchiseCode  473218 non-null  object        
 15  Urban

## Création des sets

In [4]:
X = data.drop("MIS_Status", axis=1)
y = data.MIS_Status

In [5]:
cols_drop = ["Name", "City", "Zip", "NAICS", "ApprovalDate", "ApprovalFY", "CreateJob", "RetainedJob", "Franchised"]

X = X.drop(cols_drop, axis=1)

In [6]:
# Récupère le nom des colonnes par types
object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns)#.drop(["ApprovalFY"]))
num_cols = list(X.select_dtypes(include=["int32", "int64"]).columns)
#year_col = ["ApprovalFY"]

# Ordinal encoding for ApprovalFY
#unique_years = sorted(data["ApprovalFY"].unique())

# Frequency encoding for object columns
# for col in object_cols:
#     frequency_encoding = X[col].value_counts(normalize=True)
#     X[col] = X[col].map(frequency_encoding)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, stratify=y, random_state=42)

### Oversampling

In [8]:
data["MIS_Status"].value_counts(normalize=True)

MIS_Status
1    0.77799
0    0.22201
Name: proportion, dtype: float64

In [9]:
sample_strat = 0.65
oversample = RandomOverSampler(sampling_strategy=sample_strat)

X_oversample, y_oversample = oversample.fit_resample(X_train, y_train)

y_oversample.value_counts(normalize=True)

MIS_Status
1    0.606061
0    0.393939
Name: proportion, dtype: float64

## Preprocessing

### Column Transformer

In [10]:
preprocessing = ColumnTransformer([
        ("frequency", CountFrequencyEncoder(encoding_method="frequency", missing_values="ignore"), object_cols),
        ("scaler", StandardScaler(), num_cols),
        #("ordinal", OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=-1), year_col),
        #("onehot", OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"), cat_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

### Getting indices for categorical features after preprocessing

In [11]:
pre_fit = preprocessing.fit(X_train)
fit_cols = pre_fit.get_feature_names_out()

cat_indices = []
for i, col_name in enumerate(fit_cols):
    if col_name in cat_cols:
        cat_indices.append(i)

  return is_numeric(pd.to_numeric(column, errors="ignore"))
  return is_numeric(pd.to_numeric(column, errors="ignore"))


## Pipeline

In [12]:
catb = CatBoostClassifier(one_hot_max_size=70, verbose=0, cat_features=cat_indices, random_state=42)

pipeline_cb = make_pipeline(preprocessing, catb)

### Normal fit

In [None]:
pipeline_cb.fit(X_train, y_train)

  return is_numeric(pd.to_numeric(column, errors="ignore"))
  return is_numeric(pd.to_numeric(column, errors="ignore"))


### Oversample fit

In [13]:
pipeline_cb_oversample = pipeline_cb
pipeline_cb_oversample.fit(X_oversample, y_oversample)

  return is_numeric(pd.to_numeric(column, errors="ignore"))
  return is_numeric(pd.to_numeric(column, errors="ignore"))


## Métriques

### Classification report normal fit

In [14]:
pred_test = pipeline_cb.predict(X_test)

print(classification_report(y_test, pred_test, digits=3))

              precision    recall  f1-score   support

           0      0.866     0.930     0.897      5253
           1      0.980     0.959     0.969     18408

    accuracy                          0.952     23661
   macro avg      0.923     0.945     0.933     23661
weighted avg      0.954     0.952     0.953     23661





### Classification report oversample fit

In [15]:
pred_test_oversample = pipeline_cb_oversample.predict(X_test)

print(classification_report(y_test, pred_test_oversample, digits=3))

              precision    recall  f1-score   support

           0      0.866     0.930     0.897      5253
           1      0.980     0.959     0.969     18408

    accuracy                          0.952     23661
   macro avg      0.923     0.945     0.933     23661
weighted avg      0.954     0.952     0.953     23661





## Feature importance

In [16]:
print(pipeline_cb.steps)

[('columntransformer', ColumnTransformer(remainder='passthrough',
                  transformers=[('frequency',
                                 CountFrequencyEncoder(encoding_method='frequency',
                                                       missing_values='ignore'),
                                 ['Bank', 'FranchiseCode']),
                                ('scaler', StandardScaler(),
                                 ['Term', 'NoEmp', 'GrAppv', 'SBA_Appv'])],
                  verbose_feature_names_out=False)), ('catboostclassifier', <catboost.core.CatBoostClassifier object at 0x7fc26243faf0>)]


In [17]:
preprocessed_features = pipeline_cb_oversample.named_steps['columntransformer'].get_feature_names_out()
cb_model = pipeline_cb_oversample.named_steps['catboostclassifier']

# aggregated_scores = {}

# for i, feature_name in enumerate(preprocessed_features):
#     if '_' in feature_name:
#         original_feature = feature_name.split('_')[0]
#         # Add importance score to the aggregated score for the original feature
#         aggregated_scores[original_feature] = aggregated_scores.get(original_feature, 0) + cb_model.feature_importances_[i]
#     else:
#         # Add importance score directly for non-one-hot encoded features
#         aggregated_scores[feature_name] = cb_model.feature_importances_[i]

# aggregated_scores_df = pd.DataFrame(aggregated_scores.items(), columns=['features', 'score'])

# aggregated_scores_df.sort_values(by=['score'], ascending=False, inplace=True)

# display(aggregated_scores_df)

feat_imp = pd.DataFrame({'features': preprocessed_features, 'score': cb_model.feature_importances_})
feat_imp.sort_values(by=['score'], ascending=False, inplace=True)
display(feat_imp)

Unnamed: 0,features,score
2,Term,56.415278
0,Bank,11.384459
7,BankState,10.940816
10,RevLineCr,3.925834
9,UrbanRural,3.618882
5,SBA_Appv,3.462549
6,State,2.542238
4,GrAppv,2.208656
13,NAICS2,1.661048
12,Zip2,1.25661


## Exportation du model

In [18]:
with open("model/model_cb.pkl", "wb") as f:
  pickle.dump(pipeline_cb_oversample, f)
f.close()