# Modelisation

## Importation des librairies et dataset

In [85]:
import pandas as pd
import numpy as np
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [86]:
with open("datasets/dataset.pkl", "rb") as file:
    data = pickle.load(file)
file.close()

In [87]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 897167 entries, 0 to 899163
Data columns (total 25 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Name           897153 non-null  object        
 1   City           897137 non-null  object        
 2   State          897154 non-null  category      
 3   Zip            897167 non-null  object        
 4   Bank           895661 non-null  object        
 5   BankState      895654 non-null  category      
 6   NAICS          897167 non-null  object        
 7   ApprovalDate   897167 non-null  datetime64[ns]
 8   ApprovalFY     897167 non-null  category      
 9   Term           897167 non-null  int64         
 10  NoEmp          897167 non-null  int64         
 11  NewExist       897033 non-null  category      
 12  CreateJob      897167 non-null  int64         
 13  RetainedJob    897167 non-null  int64         
 14  FranchiseCode  897167 non-null  object        
 15  Urban

## Création des sets

In [88]:
data = data.dropna()

data = data.drop("ApprovalDate", axis=1)

In [89]:
X = data.drop("MIS_Status", axis=1)
y = data.MIS_Status

object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns.drop(["ApprovalFY"]))
num_cols = list(X.select_dtypes(include=["int64"]).columns)

# year_col = ["ApprovalFY"]

for col in object_cols:
    frequency_encoding = X[col].value_counts(normalize=True)
    X[col] = X[col].map(frequency_encoding)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, stratify=y, random_state=42)

## Preprocessing

In [90]:
preprocessing = ColumnTransformer([
        ("passthrough", "passthrough", object_cols),
        ("onehot", OneHotEncoder(drop="if_binary", sparse_output=False), cat_cols),
        ("scaler", StandardScaler(), num_cols),
        # ("ordinal", OrdinalEncoder(handle_unknown="error"), year_col)
    ]
)

## Pipeline

In [91]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

pipeline_rf = make_pipeline(preprocessing, rf)

pipeline_rf.fit(X_train, y_train)

In [92]:
pred_test_rf = pipeline_rf.predict(X_test)

print(classification_report(y_test, pred_test_rf))

ValueError: Found unknown categories ['1968'] in column 0 during transform