In [10]:
# notebook_training.ipynb

import pandas as pd
import numpy as np
import joblib  # Pour sauvegarder le modèle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns


In [11]:
# 1. Load the dataset (the path is relative to the current working directory)
df = pd.read_csv(filepath_or_buffer="AER_credit_card_data.csv")

In [17]:
# Dataset overview.
# Only the last expression of a cell is echoed, so the shape is printed
# explicitly: a bare `df.shape` in the middle of the cell displays nothing.
print(df.shape)  # (number of rows, number of columns)
# Column names, non-null counts and dtypes (info() prints its report itself)
df.info()
# Python type of `df`, echoed as the cell's result
type(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   int64  
 1   reports      1319 non-null   int64  
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   int64  
 7   selfemp      1319 non-null   int64  
 8   dependents   1319 non-null   int64  
 9   months       1319 non-null   int64  
 10  majorcards   1319 non-null   int64  
 11  active       1319 non-null   int64  
dtypes: float64(4), int64(8)
memory usage: 123.8 KB


pandas.core.frame.DataFrame

In [18]:
# Column names. They are printed explicitly because a bare expression that is
# not the last line of a cell is evaluated and then silently discarded.
print(list(df.columns))
# Dtype of every column (last expression, echoed as the cell's result)
df.dtypes

card             int64
reports          int64
age            float64
income         float64
share          float64
expenditure    float64
owner            int64
selfemp          int64
dependents       int64
months           int64
majorcards       int64
active           int64
dtype: object

In [19]:
# Count the missing values in each column
df.isnull().sum()

card           0
reports        0
age            0
income         0
share          0
expenditure    0
owner          0
selfemp        0
dependents     0
months         0
majorcards     0
active         0
dtype: int64

In [20]:
# Flag every row that repeats an earlier row, then count flagged vs. unflagged
duplicate_mask = df.duplicated()
duplicate_mask.value_counts()

False    1319
Name: count, dtype: int64

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the recorded output lists card/owner/selfemp, and this cell's
# execution count is higher than that of the encoding cell placed further down,
# so it was presumably run after the yes/no encoding. On a fresh
# "Restart & Run All" those columns may still be text and be left out — confirm.
df.describe()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,0.775588,0.456406,33.213103,3.365376,0.068732,185.057071,0.440485,0.068992,0.993935,55.267627,0.817286,6.996967
std,0.417353,1.345267,10.142783,1.693902,0.094656,272.218917,0.496634,0.253536,1.247745,66.271746,0.386579,6.305812
min,0.0,0.0,0.166667,0.21,0.000109,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,25.41667,2.24375,0.002316,4.583333,0.0,0.0,0.0,12.0,1.0,2.0
50%,1.0,0.0,31.25,2.9,0.038827,101.2983,0.0,0.0,1.0,30.0,1.0,6.0
75%,1.0,0.0,39.41667,4.0,0.093617,249.0358,1.0,0.0,2.0,72.0,1.0,11.0
max,1.0,14.0,83.5,13.5,0.90632,3099.505,1.0,1.0,6.0,540.0,1.0,46.0


In [None]:
# Correlation matrix of the numeric columns.
# `plt` and `sns` are provided by the imports cell at the top of the notebook,
# so the import is not repeated here.
numeric_corr = df.select_dtypes(include=['number']).corr()

# Explicit figure/axes instead of the implicit pyplot state machine, so the
# heatmap is guaranteed to be drawn on the figure created here.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_corr, annot=True, fmt=".2f", ax=ax)
# The trailing ';' keeps the cell from echoing the repr of the returned object
ax.set_title("Correlation matrix of the numeric features");

NameError: name 'plt' is not defined

In [12]:
# 2. Encoding: turn the yes/no flags into 1/0 integers.
# A plain `.map({'yes': 1, 'no': 0})` is not safe to re-run: once a column
# holds 1/0, mapping it again finds no matching key and replaces every value
# with NaN. Unexpected labels would likewise become NaN without any warning.
binary_cols = ['card', 'owner', 'selfemp']
yes_no_codes = {'yes': 1, 'no': 0}
for col in binary_cols:
    # Already numeric (e.g. the cell was executed before): leave it untouched
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map(yes_no_codes)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail loudly instead of propagating NaN into the model
        unexpected = sorted(df.loc[unmapped, col].astype(str).unique())
        raise ValueError(
            f"Column '{col}' contains values other than 'yes'/'no': {unexpected}"
        )
    df[col] = encoded.astype('int64')

In [13]:
# 3. Split into features (X) and target (y), then into train / test sets
# NOTE(review): the AER CreditCard documentation describes `share` as the ratio
# of monthly credit card expenditure to yearly income, i.e. a quantity derived
# from the target. Keeping it (together with `income`) in X may leak the target
# and inflate the R2 score — confirm that this is intended.
y = df["expenditure"]
X = df.drop(columns=["expenditure"])

# 80 % train / 20 % test, with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# 4. Build the pipeline (scaler followed by the SVM regressor)
# The pipeline runs the scaler step before the SVR step on every fit/predict,
# so callers never have to scale the data themselves.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svm', SVR()),
    ]
)

In [15]:
# 5. Hyper-parameter optimisation (grid search), as required by the assignment
# Keys are prefixed with 'svm__' so they are routed to the 'svm' pipeline step.
param_grid = dict(
    svm__C=[1, 10, 100, 1000],        # regularisation parameter
    svm__kernel=['rbf', 'linear'],    # kernel type
    svm__epsilon=[0.1, 0.2, 0.5],     # error margin (epsilon)
)

print("Démarrage de l'optimisation...")
# 3-fold cross-validated search scored on R2, with n_jobs=-1 for parallel fits
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Report the winning combination and its score on the held-out test set
print(f"Meilleurs paramètres : {grid.best_params_}")
print(f"Score R2 sur le test : {grid.score(X_test, y_test):.4f}")

Démarrage de l'optimisation...
Meilleurs paramètres : {'svm__C': 1000, 'svm__epsilon': 0.1, 'svm__kernel': 'rbf'}
Score R2 sur le test : 0.9150


In [16]:
# 6. Persist the final model
# The object written to disk is the best pipeline found by the grid search,
# scaler step included, stored in a single .pkl file.
joblib.dump(value=grid.best_estimator_, filename='model_svm.pkl')
print("Modèle sauvegardé sous 'model_svm.pkl' ✅")

Modèle sauvegardé sous 'model_svm.pkl' ✅
