In [4]:
# Import necessary libraries
import pandas as pd
import pickle
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from feature_engineering import apply_feature_engineering  # Import des fonctions de feature engineering

# Load dataset
df = pd.read_csv('2767ML_assignment1_data.csv')

# Define categorical and numerical columns
categorical_cols = ['State', 'International plan', 'Voice mail plan']
numerical_cols = ['Account length', 'Number vmail messages', 'Customer service calls', 
                  'MonthlyCharges', 'TotalMinutes', 'TotalCalls',
                  'DayMinutesPct', 'EveMinutesPct', 'NightMinutesPct', 'IntlMinutesPct',
                  'DayChargesPct', 'EveChargesPct', 'NightChargesPct', 'IntlChargesPct',
                  'DayCallsPct', 'EveCallsPct', 'NightCallsPct', 'IntlCallsPct']

# Separate features (X) and target (y)
X = df.drop(columns=['Churn'])  # Features
y = df['Churn']  # Target

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply FunctionTransformer to apply the feature engineering function
feature_transformer = FunctionTransformer(apply_feature_engineering)

# Create a ColumnTransformer to apply different preprocessing to different columns
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        # Scale numerical features
        ('num', StandardScaler(), numerical_cols),
        # One-hot encode categorical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Créer un pipeline unique qui inclut le feature engineering, le prétraitement et le modèle XGBoost
full_pipeline = Pipeline([
    ('feature_engineering', feature_transformer),  # Custom feature engineering
    ('preprocessor', preprocessing_pipeline),  # Prétraitement (scaling + encoding)
    ('classifier', XGBClassifier(random_state=42))  # Modèle XGBoost
])

# Entraîner le pipeline unique sur les données d'entraînement
full_pipeline.fit(X_train, y_train)

# Sauvegarder le pipeline unique dans un fichier pickle
with open('full_pipeline.pkl', 'wb') as f:
    pickle.dump(full_pipeline, f)

# Appliquer le pipeline unique aux données de test pour évaluer le modèle
y_pred = full_pipeline.predict(X_test)

# Afficher les métriques d'évaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

       False       0.97      1.00      0.98       512
        True       0.97      0.83      0.90        88

    accuracy                           0.97       600
   macro avg       0.97      0.91      0.94       600
weighted avg       0.97      0.97      0.97       600

Accuracy: 0.9716666666666667


In [5]:
!pip freeze > requirements.txt

In [6]:
import pandas as pd
import pickle
from feature_engineering import apply_feature_engineering

# Charger les nouvelles données
new_data = pd.read_excel("2767ML_assignment1_externalvalidation_data_toStudents.xls")

# Charger le pipeline unique (prétraitement + modèle XGBoost)
with open('full_pipeline.pkl', 'rb') as f:
    full_pipeline = pickle.load(f)

# Faire des prédictions avec le pipeline unique
predictions = full_pipeline.predict(new_data)

# Obtenir les probabilités prédites
probabilities = full_pipeline.predict_proba(new_data)[:, 1]  # Probabilités pour la classe 1

# Afficher les prédictions
print("Churn Predictions:", predictions)
print("Churn Probabilities:", probabilities)

# Apply different thresholds
thresholds = [0.99, 0.5, 0.3]
for threshold in thresholds:
    predictions = (probabilities > threshold).astype(int)
    print(f"Churn Predictions (threshold = {threshold}): ", predictions)

Churn Predictions: [1 0 0]
Churn Probabilities: [0.9996063  0.9367366  0.00161879]
Churn Predictions (threshold = 0.99):  [1 0 0]
Churn Predictions (threshold = 0.5):  [1 1 0]
Churn Predictions (threshold = 0.3):  [1 1 0]
