In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')

import os


In [2]:
data = pd.read_csv('churn_predictor.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [33]:
#fonction de chargement des donn√©es

def load_data(file_path='churn_predictor_lowercase.csv'):
    """Charge le dataset de churn"""
    try:
        df = pd.read_csv(file_path)
    
        print(f"‚úÖ Dataset charg√©: {df.shape}")
        print(f"\nColonnes: {df.columns.tolist()}")
        return df
    except FileNotFoundError:
        print(f"‚ùå Fichier {file_path} non trouv√©!")
        return None
    
# Charger les donn√©es
data = load_data()  

‚úÖ Dataset charg√©: (7043, 21)

Colonnes: ['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents', 'tenure', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'monthlycharges', 'totalcharges', 'churn']


In [29]:
# Charger votre CSV
df = pd.read_csv('churn_predictor.csv')

# Afficher les colonnes
print("Colonnes actuelles:")
print(df.columns.tolist())

# Les convertir en minuscules
df.columns = df.columns.str.lower()

# Sauvegarder
df.to_csv('churn_predictor_lowercase.csv', index=False)
print("\n‚úÖ Fichier sauvegard√© avec colonnes en minuscules")

Colonnes actuelles:
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

‚úÖ Fichier sauvegard√© avec colonnes en minuscules


In [34]:

#fonction de pr√©traitement des donn√©es
def preprocess_data(df, is_training=True, encoders=None, scaler=None):
    """Pr√©traite les donn√©es pour le mod√®le"""
    df = df.copy()
    
    # Forcer toutes les colonnes en minuscules pour coh√©rence
    df.columns = df.columns.str.lower()
    
    # Supprimer customerID si pr√©sent (non utile pour la pr√©diction)
    if 'customerid' in df.columns:
        df = df.drop('customerid', axis=1)
    
    # S√©parer features et target
    if 'churn' in df.columns:
        X = df.drop('churn', axis=1)
        y = df['churn']
    else:
        X = df
        y = None
    
    # Identifier colonnes num√©riques et cat√©gorielles
    numeric_features = ['tenure']
    categorical_features = ['gender', 'seniorcitizen', 'partner', 'dependents', 
                           'phoneservice', 'multiplelines', 'internetservice', 
                           'onlinesecurity', 'onlinebackup', 'deviceprotection',
                           'techsupport', 'streamingtv', 'streamingmovies',
                           'contract', 'paperlessbilling', 'paymentmethod']
    
    # Filtrer les colonnes qui existent r√©ellement
    numeric_features = [col for col in numeric_features if col in X.columns]
    categorical_features = [col for col in categorical_features if col in X.columns]
    
    # Ajouter TotalCharges et MonthlyCharges s'ils existent
    for col in ['totalcharges', 'monthlycharges']:
        if col in X.columns and col not in numeric_features:
            numeric_features.append(col)
    
    print(f"\nüìä Features num√©riques: {numeric_features}")
    print(f"üìä Features cat√©gorielles: {categorical_features}")
    
    if is_training:
        # Encoder les variables cat√©gorielles
        encoders = {}
        for col in categorical_features:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))
            encoders[col] = le
        
        # G√©rer TotalCharges si c'est une string
        if 'totalcharges' in X.columns:
            X['totalcharges'] = pd.to_numeric(X['totalcharges'], errors='coerce')
            X['totalcharges'].fillna(X['totalcharges'].median(), inplace=True)
        
        # Normaliser les variables num√©riques
        scaler = StandardScaler()
        if numeric_features:
            X[numeric_features] = scaler.fit_transform(X[numeric_features])
        
        return X, y, encoders, scaler
    else:
        # Utiliser les encoders et scaler existants
        for col in categorical_features:
            if col in encoders:
                # G√©rer les nouvelles cat√©gories
                try:
                    X[col] = encoders[col].transform(X[col].astype(str))
                except ValueError:
                    # Si nouvelle cat√©gorie, utiliser la cat√©gorie la plus fr√©quente
                    X[col] = 0
        
        # G√©rer TotalCharges si c'est une string
        if 'totalcharges' in X.columns:
            X['totalcharges'] = pd.to_numeric(X['totalcharges'], errors='coerce')
            X['totalcharges'].fillna(0, inplace=True)
        
        if numeric_features:
            X[numeric_features] = scaler.transform(X[numeric_features])
        
        return X
preprocess_data(data)


üìä Features num√©riques: ['tenure', 'totalcharges', 'monthlycharges']
üìä Features cat√©gorielles: ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']


(      gender  seniorcitizen  partner  dependents    tenure  phoneservice  \
 0          0              0        1           0 -1.277445             0   
 1          1              0        0           0  0.066327             1   
 2          1              0        0           0 -1.236724             1   
 3          1              0        0           0  0.514251             0   
 4          0              0        0           0 -1.236724             1   
 ...      ...            ...      ...         ...       ...           ...   
 7038       1              0        1           1 -0.340876             1   
 7039       0              0        1           1  1.613701             1   
 7040       0              0        1           1 -0.870241             0   
 7041       1              1        1           0 -1.155283             1   
 7042       1              0        0           0  1.369379             1   
 
       multiplelines  internetservice  onlinesecurity  onlinebackup  \
 0 

In [35]:
def train_model(file_path='churn_predictor.csv'):
    """Entra√Æne le mod√®le de pr√©diction de churn"""
    print("=" * 60)
    print("üöÄ ENTRA√éNEMENT DU MOD√àLE DE PR√âDICTION DE CHURN")
    print("=" * 60)
    
    # Charger les donn√©es
    df = load_data(file_path)
    if df is None:
        return None, None, None, None
    
    print(f"\nüìà Distribution du Churn:")
    if 'churn' in df.columns:
        print(df['churn'].value_counts())
        print(f"\nTaux de churn: {df['churn'].value_counts(normalize=True)[1]*100:.2f}%")
    
    # Afficher des statistiques
    print(f"\nüìä Statistiques descriptives:")
    if 'tenure' in df.columns:
        print(f"Tenure moyenne: {df['tenure'].mean():.2f} mois")
    if 'monthlycharges' in df.columns:
        print(f"Charges mensuelles moyennes: ${df['monthlycharges'].mean():.2f}")
    
    # Pr√©traitement
    print("\nüîß Pr√©traitement des donn√©es...")
    X, y, encoders, scaler = preprocess_data(df, is_training=True)
    
    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    print(f"\n‚úÇÔ∏è Split des donn√©es:")
    print(f"   Training set: {X_train.shape}")
    print(f"   Test set: {X_test.shape}")
    
    # Entra√Æner le mod√®le
    print("\nü§ñ Entra√Ænement du Random Forest...")
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'
    )
    model.fit(X_train, y_train)
    
    # √âvaluation
    print("\nüìà √âVALUATION DU MOD√àLE")
    print("=" * 60)
    
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nüéØ Accuracy: {accuracy:.4f}")
    
    # Classification Report
    print("\nüìä Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))
    
    # ROC-AUC Score
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"\nüéØ ROC-AUC Score: {roc_auc:.4f}")
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nüî¢ Confusion Matrix:")
    print(f"   True Negatives: {cm[0][0]}")
    print(f"   False Positives: {cm[0][1]}")
    print(f"   False Negatives: {cm[1][0]}")
    print(f"   True Positives: {cm[1][1]}")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nüîù Top 10 Features les plus importantes:")
    print(feature_importance.head(10).to_string(index=False))
    
    # Sauvegarder le mod√®le et les objets de pr√©traitement
    print("\nüíæ Sauvegarde du mod√®le et des pr√©processeurs...")
    joblib.dump(model, 'churn_model.pkl')
    joblib.dump(encoders, 'encoders.pkl')
    joblib.dump(scaler, 'scaler.pkl')
    joblib.dump(X.columns.tolist(), 'feature_names.pkl')
    
    print("\n‚úÖ Mod√®le sauvegard√© avec succ√®s!")
    print("=" * 60)
    
    return model, encoders, scaler, X.columns.tolist()


model, encoders, scaler, features = train_model()

print(f"\n Noms des colonnes sauvegard√©s : {features}")


üöÄ ENTRA√éNEMENT DU MOD√àLE DE PR√âDICTION DE CHURN
‚úÖ Dataset charg√©: (7043, 21)

Colonnes: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

üìà Distribution du Churn:

üìä Statistiques descriptives:
Tenure moyenne: 32.37 mois

üîß Pr√©traitement des donn√©es...

üìä Features num√©riques: ['tenure', 'totalcharges', 'monthlycharges']
üìä Features cat√©gorielles: ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']

‚úÇÔ∏è Split des donn√©es:
   Training set: (5634, 19)
   Test set: (1409, 19)

ü§ñ Entra√Æ