In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Atelier SVM: 

Nous devons faire une étude des données churn.

Les données importés depuis un fichier CSV contiennent 13 variables explicatives (inputs) et une variable cible (output) (Exited).

L'objectif majeur de cette étude est d'assister l'entreprise en construisant un modèle décisif basé sur un Support Vector Machine (SVM). 

Nous chercherons à élaborer un modèle SVM performant, capable de prédire avec précision si un client est enclin au churn ou non, afin d'aider l'entreprise à prendre des mesures proactives.

# Step I: Upload Data

In [5]:
data = pd.read_csv('churn_problem.csv')


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [7]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


# Step II- Data Preparation

In [8]:
# Supprimer les colonnes insignificatives 
data1 = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
data1.shape

(10000, 11)

In [9]:
# Transformer les données catégorielles
data1 = pd.get_dummies(data1, columns=['Geography', 'Gender'], drop_first=True)


In [10]:
# Gestion des valeurs manquantes
data1 = data1.dropna()
data1.head


<bound method NDFrame.head of       CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0             619   42       2       0.00              1          1   
1             608   41       1   83807.86              1          0   
2             502   42       8  159660.80              3          1   
3             699   39       1       0.00              2          0   
4             850   43       2  125510.82              1          1   
...           ...  ...     ...        ...            ...        ...   
9995          771   39       5       0.00              2          1   
9996          516   35      10   57369.61              1          1   
9997          709   36       7       0.00              1          0   
9998          772   42       3   75075.31              2          1   
9999          792   28       4  130142.79              1          1   

      IsActiveMember  EstimatedSalary  Exited  Geography_Germany  \
0                  1        101348.88       1    

# Step III: Data Modelisation

In [11]:
# Division des données en variables inputs (X) et variable de sortie Y
X = data1.drop('Exited', axis=1)
y = data1['Exited']

In [12]:
from sklearn.model_selection import train_test_split
# Division des données en un training set et testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
from sklearn.preprocessing import StandardScaler
#Standarization des données
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step VI: SVM & ROC Curve

In [None]:
#Application de l'SVM avec les différents Kernels

from sklearn.svm import SVC
#  SVM with Linear Kernel
svm_linear = SVC(kernel='linear', probability=True)
svm_linear.fit(X_train, y_train)

# SVM with Polynomial Kernel
svm_poly = SVC(kernel='poly', degree=3, probability=True)
svm_poly.fit(X_train, y_train)

# SVM with RBF Kernel
svm_rbf = SVC(kernel='rbf', probability=True)
svm_rbf.fit(X_train, y_train)

# SVM with Sigmoid Kernel
svm_sigmoid = SVC(kernel='sigmoid', probability=True)
svm_sigmoid.fit(X_train, y_train)


In [None]:
#Evaluation des modèles avec le ROC curve pour chaque modèle
from sklearn.metrics import roc_curve, auc

# Fonction de la méthode d'évaluation ROC curve
def plot_roc_curve(y_true, y_prob, label):
    fpr, tpr, _ = roc_curve(y_true, y_prob[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')

# Predire les probabilités pour chaque modèle
y_prob_linear = svm_linear.predict_proba(X_test)
y_prob_poly = svm_poly.predict_proba(X_test)
y_prob_rbf = svm_rbf.predict_proba(X_test)
y_prob_sigmoid = svm_sigmoid.predict_proba(X_test)  # Add prediction for Sigmoid SVM

# Modéliser les graphes pour chaque modèle
plot_roc_curve(y_test, y_prob_linear, 'Linear SVM')
plot_roc_curve(y_test, y_prob_poly, 'Poly SVM')
plot_roc_curve(y_test, y_prob_rbf, 'RBF SVM')
plot_roc_curve(y_test, y_prob_sigmoid, 'Sigmoid SVM')  # Add ROC curve for Sigmoid SVM

plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guessing')

# Personalisation du plot
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()