In [None]:
import numpy as np
import pandas as pd
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold, RandomizedSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
from xgboost import XGBClassifier

In [None]:
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences (\m, \C, \c), which newer Python versions warn about.
# NOTE(review): this is a machine-specific absolute path; point DATA_PATH at
# your local copy of the CSV before running.
DATA_PATH = r'E:\mlmodel deploy\CHURNPREDICTION\churndata.csv'
churn_data=pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded data.
churn_data.head()

In [None]:
# Column dtypes as inferred by read_csv.
churn_data.dtypes

In [None]:
# Remove the column-count display limit so wide frames are shown in full.
pd.set_option("display.max_columns",None)

In [None]:
# Preview again now that every column is displayed.
churn_data.head()

In [None]:
# Per-column non-null counts, dtypes and memory usage.
churn_data.info()

In [None]:
# Number of non-null entries in each column.
churn_data.notnull().sum()

In [None]:
# Drop the identifier column. errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because the column is already gone.
churn_data=churn_data.drop(columns='customerID', errors='ignore')

In [None]:
numerical_features=['MonthlyCharges','TotalCharges','tenure']

# Print the distinct values of every column that is not one of the numeric ones.
for col in churn_data.columns:
    if col in numerical_features:
        continue
    print(col,churn_data[col].unique())

In [None]:
# Check dtypes before converting TotalCharges to a numeric column.
churn_data.dtypes

In [None]:
##churn_data['TotalCharges']=pd.to_numeric(churn_data['TotalCharges'],errors='coerce')

In [None]:
# Parse TotalCharges as numbers; blank strings and anything unparseable become NaN.
total_raw = churn_data['TotalCharges'].replace(' ', np.nan)
total_num = pd.to_numeric(total_raw, errors='coerce')

# First fallback for missing totals: MonthlyCharges * tenure.
total_num = total_num.fillna(churn_data['MonthlyCharges'] * churn_data['tenure'])

# Last resort for anything still missing: the column median.
churn_data['TotalCharges'] = total_num.fillna(total_num.median())


In [None]:
# churn_data['TotalCharges']=churn_data['TotalCharges'].replace(' ','0')
# churn_data['TotalCharges']=churn_data['TotalCharges'].astype('float')
# len(churn_data[churn_data['TotalCharges']==''])

In [None]:
# Sanity check after the numeric conversion: count the values that are still missing.
# (Comparing the now-numeric column to '' can never match, so that check was
# always 0 regardless of the data.)
churn_data['TotalCharges'].isna().sum()

In [None]:
# Distinct TotalCharges values after cleaning.
churn_data['TotalCharges'].unique()

In [None]:
# Re-check dtypes after the TotalCharges conversion.
churn_data.dtypes

In [None]:
# Class balance of the target column.
print(churn_data['Churn'].value_counts())

EDA [EXPLORATORY DATA ANALYSIS]

In [None]:
# Summary statistics for the numeric columns.
churn_data.describe()

In [None]:
# First rows of the cleaned frame.
churn_data.head()

In [None]:
# Last rows of the cleaned frame.
churn_data.tail()

In [None]:
def plot_histograph(df,column_name):
    """Draw a histogram with a KDE overlay for one column and mark its mean and median.

    Parameters
    ----------
    df : DataFrame holding the data.
    column_name : name of the column in ``df`` to plot.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # plt.figure (lowercase) creates the figure through pyplot and makes it
    # current, so the requested size applies to the plot drawn below.
    # plt.Figure(...) only built a detached object that nothing drew on.
    plt.figure(figsize=(5,5))
    sns.histplot(df[column_name], kde=True)
    plt.title(f"distribution of {column_name}")

    col_mean=df[column_name].mean()
    col_median=df[column_name].median()

    # Vertical reference lines for the two central-tendency measures.
    plt.axvline(col_mean, color='red',label='mean')
    plt.axvline(col_median, color='blue',label='median')

    plt.legend()

    plt.show()

In [None]:
# Distribution of MonthlyCharges with mean/median markers.
plot_histograph(churn_data , 'MonthlyCharges')

In [None]:
# Distribution of tenure with mean/median markers.
plot_histograph(churn_data , 'tenure')

In [None]:
# Distribution of TotalCharges with mean/median markers.
plot_histograph(churn_data , 'TotalCharges')


In [None]:
def box_plot(df,column_name):
    """Draw a vertical seaborn boxenplot of one column and display it.

    Parameters
    ----------
    df : DataFrame holding the data.
    column_name : name of the column in ``df`` to plot.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(5,5))
    # NOTE(review): this calls sns.boxenplot although the function name and the
    # title say "box plot" - confirm which plot type is intended.
    sns.boxenplot(y=df[column_name])
    plt.title(f"box plot of {column_name}")
    plt.ylabel(column_name)
    # Must be called: a bare `plt.show` is just an attribute lookup and does nothing.
    plt.show()

In [None]:
# Spread and outliers of tenure.
box_plot(churn_data,'tenure')

In [None]:
# Spread and outliers of TotalCharges.
box_plot(churn_data,'TotalCharges')

In [None]:
# Box plot of MonthlyCharges, drawn with sns.boxplot directly rather than via
# the box_plot helper (which uses sns.boxenplot).
sns.boxplot(churn_data['MonthlyCharges'])

In [None]:
# Pairwise correlations between the three numeric columns, shown as an annotated heatmap.
numeric_corr = churn_data[['tenure','MonthlyCharges','TotalCharges']].corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# List all column names.
churn_data.columns

In [None]:
# Columns to treat as categorical in the count plots: SeniorCitizen first
# (presumably numeric-coded, so select_dtypes would not pick it up - verify),
# followed by every object-dtype column.
object_dtype_columns = churn_data.select_dtypes(include='object').columns.to_list()
object_columns = ['SeniorCitizen', *object_dtype_columns]

In [None]:
# Show the categorical column list that was just built.
object_columns

In [None]:
# dtypes for cross-checking against the categorical column list.
churn_data.dtypes

In [None]:
# Preview rows before plotting the categorical columns.
churn_data.head()

In [None]:
# Columns that the count-plot loop iterates over.
object_columns

In [None]:
# One count plot per categorical column.
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue`; confirm against the installed version.
for col in object_columns:
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.countplot(x=col, data=churn_data, palette="viridis", edgecolor="black", ax=ax)
    ax.set_title(f"Count of {col}", fontsize=12, fontweight='bold')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    

In [None]:
# (rows, columns) of the working frame.
churn_data.shape

In [None]:
# Encode the target as integers (Yes -> 1, No -> 0).
# An explicit mapping plus astype(int) guarantees an integer dtype; relying on
# `replace` to downcast an object column is deprecated in recent pandas and
# would leave an object-dtype target.
# The 1/0 identity entries keep the cell safe to re-run, and astype(int) fails
# loudly if any unexpected label turned into NaN.
churn_data['Churn'] = churn_data['Churn'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0}).astype(int)

In [None]:
# Confirm the Churn column now holds the encoded values.
churn_data.head()

In [None]:
# Print every column name together with its distinct values.
for cols, column_values in churn_data.items():
    print(cols, column_values.unique())

In [None]:
# Distinct contract types (the keys needed for the ordinal mapping).
churn_data['Contract'].unique()

In [None]:
# Categorical column list, shown for reference before encoding.
object_columns

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Contract lengths in increasing order; the position in this list is the ordinal code.
CONTRACT_ORDER = ['Month-to-month', 'One year', 'Two year']
ordinal_map = {label: rank for rank, label in enumerate(CONTRACT_ORDER)}

# Values missing from the mapping become NaN.
churn_data['Contract_ord'] = churn_data['Contract'].map(ordinal_map)



In [None]:
# Columns that get standard-scaled.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Columns that get one-hot encoded.
# SeniorCitizen is included here: the ColumnTransformer drops any column that
# is not listed in one of its transformers, so leaving it out silently removed
# it from the model inputs.
# 'Contract' is intentionally absent - it enters the model through Contract_ord.
categorical_features = [
    'gender','SeniorCitizen','Partner','Dependents','PhoneService','MultipleLines',
    'InternetService','OnlineSecurity','OnlineBackup','DeviceProtection',
    'TechSupport','StreamingTV','StreamingMovies','PaperlessBilling',
    'PaymentMethod'
]

In [None]:

# Each entry is (name, transformer, columns). Output columns follow this order:
# scaled numerics, then one-hot indicators, then the untouched ordinal column.
scale_numeric = ('num', StandardScaler(), numeric_features)
encode_categorical = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_features,
)
keep_ordinal = ('ord', 'passthrough', ['Contract_ord'])

preprocessor = ColumnTransformer([scale_numeric, encode_categorical, keep_ordinal])

In [None]:
# Target vector: the Churn column.
y=churn_data['Churn']

In [None]:
# Display the target series.
y

In [None]:
# Feature frame: every column except the target.
x=churn_data.drop(columns=['Churn'])

In [None]:
# Display the feature frame.
x

In [None]:
# 80/20 split, stratified on the target so both parts keep the same class
# proportions; random_state fixes the shuffle for reproducibility.
x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,test_size=0.2,random_state=42)

In [None]:
# Display the training features.
x_train

In [None]:
# Display the training target.
y_train

In [None]:
# Class counts in the training split (motivates the oversampling step).
y_train.value_counts()

SMOTE [Synthetic Minority Oversampling Technique]

In [None]:
from imblearn.over_sampling import SMOTENC
from sklearn.base import clone

# In the pipelines built later, SMOTENC runs AFTER `preprocessor`, so it sees
# the transformed matrix, not raw x_train. Categorical indices must therefore
# be positions in that matrix. Positions taken from x_train.columns pointed at
# the wrong columns (e.g. position 0 is scaled tenure after transformation).
#
# ColumnTransformer emits its blocks in declaration order:
#   [scaled numeric columns | one-hot indicator columns | Contract_ord]
# so everything after the numeric block is categorical.
n_numeric = len(numeric_features)
# Fit a throw-away clone only to learn the transformed width; `preprocessor`
# itself stays unfitted for use inside the pipelines.
n_transformed = clone(preprocessor).fit_transform(x_train).shape[1]
# NOTE(review): assumes every CV training fold sees the same categories as the
# full training set (so the width is constant) - confirm for rare categories.
# Each one-hot indicator is treated as its own two-level categorical feature.
cat_indices = list(range(n_numeric, n_transformed))
smote = SMOTENC(categorical_features=cat_indices, random_state=42)


In [None]:
# Show the column positions handed to SMOTENC as categorical.
cat_indices

In [None]:
# (rows, columns) of the full frame, including the added Contract_ord column.
churn_data.shape

In [None]:

# Candidate classifiers keyed by display name; all share the same seed.
_estimator_classes = {
    'decision tree': DecisionTreeClassifier,
    'random forest': RandomForestClassifier,
    'xgboost': XGBClassifier,
}
models = {
    label: estimator_cls(random_state=42)
    for label, estimator_cls in _estimator_classes.items()
}

In [None]:
# Mean cross-validated ROC-AUC per model name, filled in by the CV loop.
cv_scores = dict()

In [None]:
# dtypes of the training features going into the pipeline.
print(x_train.dtypes)

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline

# Score each candidate with 5-fold cross-validation. The sampler sits inside
# the pipeline, so it is fitted on the training part of each fold only.
for model_name, model in models.items():
    steps = [('preprocessor', preprocessor), ('smote', smote), ('clf', model)]
    pipe = ImbPipeline(steps)
    scores = cross_val_score(pipe, x_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
    mean_auc = np.mean(scores)
    cv_scores[model_name] = mean_auc
    print(f"{model_name} - Mean ROC-AUC: {mean_auc:.4f}")

In [None]:
# Pick the candidate with the highest mean ROC-AUC (first one wins on ties).
best_model_name, _best_auc = max(cv_scores.items(), key=lambda entry: entry[1])
best_model = models[best_model_name]
print(f"\n Best baseline model: {best_model_name}")

In [None]:
# Hyper-parameter search spaces for the tree models; keys are prefixed with
# the pipeline step name ('clf').
_tree_search_spaces = {
    'random forest': {
        'clf__n_estimators': [100, 200, 400],
        'clf__max_depth': [6, 10, 20, None],
        'clf__min_samples_leaf': [1, 2, 4]
    },
    'decision tree': {
        'clf__max_depth': [5, 10, 20, None],
        'clf__min_samples_leaf': [1, 2, 4]
    },
}

# Any other model name falls through to the xgboost space.
_xgboost_search_space = {
    'clf__n_estimators': [100, 200, 400],
    'clf__max_depth': [3, 6, 10],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__subsample': [0.8, 1.0]
}

param_dist = _tree_search_spaces.get(best_model_name, _xgboost_search_space)


In [None]:
# Full pipeline around the winning baseline model.
tuning_steps = [
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('clf', best_model),
]
pipe = ImbPipeline(tuning_steps)

# Randomized search: 10 sampled configurations, 5-fold CV, scored by ROC-AUC.
search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)
search.fit(x_train, y_train)

In [None]:
# Baseline mean ROC-AUC per model, for comparison with the tuned result.
cv_scores

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

# Held-out evaluation of the tuned pipeline: class-1 probabilities for the
# ranking metrics, hard labels for the report.
y_proba = search.predict_proba(x_test)[:, 1]
y_pred = search.predict(x_test)

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=3))

test_roc_auc = roc_auc_score(y_test, y_proba)
test_pr_auc = average_precision_score(y_test, y_proba)
print("ROC-AUC:", test_roc_auc)
print("PR-AUC :", test_pr_auc)

In [None]:
import shap, matplotlib.pyplot as plt

# Pull the fitted pieces out of the tuned pipeline. SHAP explains the
# classifier on its own, so the test data must be preprocessed first.
best_pipe = search.best_estimator_
final_model = best_pipe.named_steps['clf']
fitted_preprocessor = best_pipe.named_steps['preprocessor']
X_trans = fitted_preprocessor.transform(x_test)

# Attach the transformer's output names so the plots show real feature labels
# instead of "Feature 0", "Feature 1", ...
X_named = pd.DataFrame(X_trans, columns=fitted_preprocessor.get_feature_names_out())

# The plain array stays as the background data, so the model is still called
# with arrays like the ones it was trained on.
explainer = shap.Explainer(final_model.predict, X_trans)
# Explain the first 200 rows only; passing the DataFrame carries the names through.
shap_values = explainer(X_named.iloc[:200])

# Title is set after drawing in both plots so it lands on the SHAP axes.
shap.summary_plot(shap_values, show=False)
plt.title("SHAP Feature Importance")
plt.show()

shap.summary_plot(shap_values, max_display=10, show=False)
plt.title("Top 10 Feature Importances (SHAP)")
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import cross_val_predict

# Tune the decision threshold on out-of-fold TRAINING predictions. Choosing it
# on the test set and then reporting metrics on the same test set would give
# optimistically biased results.
oof_proba = cross_val_predict(
    search.best_estimator_, x_train, y_train,
    cv=5, method='predict_proba', n_jobs=-1
)[:, 1]
prec, rec, thr = precision_recall_curve(y_train, oof_proba)

# prec/rec have one more entry than thr (the final point has no threshold),
# so drop it to keep the F1 array aligned with thr.
f1_scores = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-8)

# Threshold with the best F1
best_idx = np.argmax(f1_scores)
best_threshold = thr[best_idx]

print(f"\n🔹 Best Threshold (by F1): {best_threshold:.3f}")
print(f"Precision: {prec[best_idx]:.3f} | Recall: {rec[best_idx]:.3f} | F1: {f1_scores[best_idx]:.3f}")

# Plot the precision-recall curve (out-of-fold training predictions)
plt.figure(figsize=(7,5))
plt.plot(rec, prec, color="blue", label=f"PR Curve (AUC = {auc(rec, prec):.3f})")
plt.scatter(rec[best_idx], prec[best_idx], color="red", s=80, label=f"Best threshold = {best_threshold:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall Curve")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Hard labels from the tuned threshold: 1 where the churn probability reaches it.
y_pred_optimal = np.where(y_proba >= best_threshold, 1, 0)

# Report at the new threshold; the two AUC values are threshold-independent.
print("\n=== Classification Report (Optimal Threshold) ===")
print(classification_report(y_test, y_pred_optimal, digits=3))

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("PR-AUC :", average_precision_score(y_test, y_proba))

# Confusion matrix at the tuned threshold
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred_optimal,
    cmap='Blues',
    values_format='d',
)
plt.title("Confusion Matrix at Optimal Threshold")
plt.show()


PICKLE MODEL


In [None]:
import pickle

In [None]:
import pickle

model_path = "churn_prediction_model.pkl"

# Save the tuned pipeline (preprocessing, sampler and classifier) to disk.
with open(model_path, "wb") as out_file:
    pickle.dump(search.best_estimator_, out_file)

# Load it back to confirm the saved file is usable.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open(model_path, "rb") as in_file:
    model = pickle.load(in_file)


In [None]:
ordinal_map = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}

# A single hand-written customer record used to smoke-test the saved model.
# Contract_ord is derived from the same contract label via ordinal_map.
customer_record = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 12,
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'Yes',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'Contract_ord': ordinal_map['Month-to-month'],
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 75.3,
    'TotalCharges': 900.5,
}

# One-row frame; columns follow the dict's insertion order.
new_customer = pd.DataFrame([customer_record])



In [None]:
# Same contract-to-rank mapping that was used on the training data.
ordinal_map = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}

# Derive Contract_ord from the Contract label (the preprocessor selects this column).
contract_rank = new_customer['Contract'].map(ordinal_map)
new_customer = new_customer.assign(Contract_ord=contract_rank)


In [None]:
# Predicted class label for the single new customer, using the reloaded pipeline.
# NOTE(review): predict() applies the model's default decision rule, not the
# tuned best_threshold - confirm which one is intended here.
prediction=model.predict(new_customer)[0]


In [None]:
# Predicted probability of class 1 (churn) for the same customer.
probability=model.predict_proba(new_customer)[0][1]


In [None]:

# Report the label as Yes/No and the churn probability rounded to 3 decimals.
churn_label = "Yes" if prediction == 1 else "No"
print("Prediction:", churn_label)
print("Probability:", round(probability, 3))
