# Importer les packages

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import sweetviz as sv
import missingno as msno
import time 
import sys
import os
import csv

import requests
import re

from sklearn.compose import make_column_selector, ColumnTransformer, make_column_transformer
# sklearn.compose: The sklearn.compose module is a submodule of the sklearn library for machine learning in Python. It provides functions for creating complex preprocessing and modeling pipelines.
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PolynomialFeatures,RobustScaler
#sklearn.preprocessing: The sklearn.preprocessing module is a submodule of the sklearn library that provides functions for preprocessing data, such as scaling and normalizing features, imputing missing values, and encoding categorical variables.
from sklearn.linear_model import Ridge,LinearRegression,Lasso, ElasticNet
# sklearn.linear_model: The sklearn.linear_model module is a submodule of the sklearn library that provides functions for fitting linear models for regression and classification.
from sklearn.pipeline import make_pipeline
# sklearn.pipeline: The sklearn.pipeline module is a submodule of the sklearn library that provides functions for creating and working with pipelines of transformers and models.
from sklearn.model_selection import train_test_split,GridSearchCV,learning_curve, RandomizedSearchCV, cross_val_score, KFold
# sklearn.model_selection: The sklearn.model_selection module is a submodule of the sklearn library that provides functions for splitting data into training and test sets, evaluating models using cross-validation, and hyperparameter tuning.
from sklearn.dummy import DummyRegressor
# sklearn.dummy: The sklearn.dummy module is a submodule of the sklearn library that provides simple dummy models for regression and classification.


from sklearn.impute import SimpleImputer
import numpy as np

from sklearn.ensemble import RandomForestClassifier
import pickle


# Lire les données

In [10]:
df=pd.read_csv("/home/apprenant/Documents/DATA/loan_project/SBAnational.csv")

df['ApprovalFY'] = df['ApprovalFY'].replace('A', '', regex=True).astype(int)

  df=pd.read_csv("/home/apprenant/Documents/DATA/loan_project/SBAnational.csv")


# Nettoyage des données

In [11]:
# create a list of all numeric columns
columns_to_transform_to_int = ["DisbursementGross","BalanceGross","ChgOffPrinGr","GrAppv","SBA_Appv"]


for col in columns_to_transform_to_int:
    df[col] = df[col].str.replace("$", "")
    df[col] = df[col].str.replace(",", "")
    df[col] = df[col].astype(float)
    

df['Term'] = df['Term'].astype(int)

df['LowDoc'] = df['LowDoc'].replace({'0': 'N'})
df = df[df['LowDoc'].isin(['N','Y'])]
df['RevLineCr'] = df['RevLineCr'].replace({'0': 'N', 'T':'Y'})
df = df[df['RevLineCr'].isin(['N','Y'])]

dictionnaire =  {"11":"Agriculture, forestry, fishing and hunting","72":"Accommodation and food services","21":"Mining, quarrying, and oil and gas extraction","22":
"Utilities","23":"Construction","31":"Manufacturing","32":"Manufacturing","33":"Manufacturing","42":"Wholesale trade","44":"Retail trade","45":"Retail trade","48":" Transportation and warehousing", "49":"Transportation and warehousing", "51":"Information","52":"Finance and insurance", "53":"Real estate and rental and leasing","54":"Professional, scientific, and technical services","55":"Management of companies and enterprises","56":"Administrative and support and waste management and remediation services","61":"Educational services","62":"Health care and social assistance","71":"Arts, entertainment, and recreation","81":"Other services (except public administration)", "92": "Public administration", "0" : "Other"}

df['NAICS'] = df['NAICS'].astype(str).str[:2]

df["NAICS"] = df["NAICS"].map(dictionnaire)

cols_to_drop = ['LoanNr_ChkDgt', 'Name', 'City', 'Zip', 'Bank', 'BankState', 'ApprovalDate', 'RetainedJob','ChgOffDate', 'DisbursementDate', 'DisbursementGross', 'BalanceGross', 'SBA_Appv', 'ChgOffPrinGr']
df_cleaned = df.copy()
df_cleaned.drop(columns=cols_to_drop, inplace=True)

df_cleaned.dropna(subset=['MIS_Status'], inplace=True)


  df[col] = df[col].str.replace("$", "")


# Préparation à la modélisation

In [12]:
y = df_cleaned.MIS_Status
X = df_cleaned.drop(columns=['MIS_Status'])

# Pipeline

In [13]:
def make_pipeline_to_ML(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.8, random_state=42)
    numerical_features = make_column_selector(dtype_include=np.number)
    categorical_features = make_column_selector(dtype_exclude= np.number)
    numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'),StandardScaler(with_mean=False))
    categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),OneHotEncoder(handle_unknown='ignore'))
    preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                    (categorical_pipeline, categorical_features)
                                    )
    return preprocessor, X_train, X_test, y_train, y_test


# Create the pipeline
preprocessor, X_train, X_test, y_train, y_test = make_pipeline_to_ML(X,y)

# Logistic Regression
![Logistic](https://miro.medium.com/max/1400/0*q3wC98LKNGFSFwdG.gif)

# les paramètres 
- `C` : C est un coefficient de régularisation qui contrôle la force de la pénalité dans la régression logistique. Plus C est grand, plus la régularisation est faible, ce qui peut conduire à un modèle plus complexe et à un ajustement sur-adapté.
- `penalty` : spécifie la forme de la pénalité pour la régularisation, 'l1' pour la pénalité L1 (régularisation Lasso) et 'l2' pour la pénalité L2 (régularisation Ridge).
- `solver` : 'saga' est un algorithme de résolution utilisé pour la régression logistique.
- `max_iter` : spécifie le nombre maximal d'itérations pour la convergence de l'algorithme.
- `fit_intercept` : spécifie si le modèle doit inclure une constante (intercept) ou non.
- `tol` : spécifie la tolérance pour la convergence de l'algorithme, c'est-à-dire l'erreur maximale admissible.

In [14]:
from sklearn.linear_model import LogisticRegression

# Add LogisticRegression to the pipeline
pipeline = make_pipeline(preprocessor, LogisticRegression(max_iter=10000, class_weight='balanced', n_jobs=-1))
print("pipeline ... OK")
print("---"*20)
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)
print("---"*20)
print("Fit ... OK")
# Make predictions on the test data
y_pred = pipeline.predict(X_test)
print("---"*20)
print("Predict ... OK")
score = pipeline.score(X_test, y_test)
print('Accuracy:', score)

from sklearn.metrics import classification_report, confusion_matrix, f1_score

y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

print("f1_score_average : Macro :",f1_score(y_test, y_pred, average='macro'))

pipeline ... OK
------------------------------------------------------------
------------------------------------------------------------
Fit ... OK
------------------------------------------------------------
Predict ... OK
Accuracy: 0.7151141513948709
              precision    recall  f1-score   support

      CHGOFF       0.36      0.82      0.50     31281
       P I F       0.95      0.69      0.80    146334

    accuracy                           0.72    177615
   macro avg       0.66      0.76      0.65    177615
weighted avg       0.84      0.72      0.75    177615

[[ 25701   5580]
 [ 45020 101314]]
f1_score_average : Macro : 0.6520556851690158


In [15]:
from sklearn.linear_model import LogisticRegression

# Add LogisticRegression to the pipeline
pipeline = make_pipeline(preprocessor, LogisticRegression(C=1.0, penalty='l2', solver='lbfgs', max_iter=10000, class_weight='balanced', n_jobs=-1))

print("pipeline ... OK")
print("---"*20)
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)
print("---"*20)
print("Fit ... OK")
# Make predictions on the test data
y_pred = pipeline.predict(X_test)
print("---"*20)
print("Predict ... OK")
score = pipeline.score(X_test, y_test)
print('Accuracy:', score)

from sklearn.metrics import classification_report, confusion_matrix, f1_score

y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

print("f1_score_average : Macro :",f1_score(y_test, y_pred, average='macro'))

pipeline ... OK
------------------------------------------------------------
------------------------------------------------------------
Fit ... OK
------------------------------------------------------------
Predict ... OK
Accuracy: 0.7151141513948709
              precision    recall  f1-score   support

      CHGOFF       0.36      0.82      0.50     31281
       P I F       0.95      0.69      0.80    146334

    accuracy                           0.72    177615
   macro avg       0.66      0.76      0.65    177615
weighted avg       0.84      0.72      0.75    177615

[[ 25701   5580]
 [ 45020 101314]]
f1_score_average : Macro : 0.6520556851690158


# Sauvegarder pkl

In [16]:
import pickle

# Save the model to a file
with open("LogisREG.pkl", "wb") as file:
    pickle.dump(pipeline, file)