In [22]:
import numpy as np
import pandas as pd

In [23]:
# Read gym_churn_us.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv("gym_churn_us.csv")

In [24]:
# Display the column labels of the loaded frame.
df.columns

Index(['gender', 'Near_Location', 'Partner', 'Promo_friends', 'Phone',
       'Contract_period', 'Group_visits', 'Age',
       'Avg_additional_charges_total', 'Month_to_end_contract', 'Lifetime',
       'Avg_class_frequency_total', 'Avg_class_frequency_current_month',
       'Churn'],
      dtype='object')

In [25]:
# Print the index range plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   gender                             4000 non-null   int64  
 1   Near_Location                      4000 non-null   int64  
 2   Partner                            4000 non-null   int64  
 3   Promo_friends                      4000 non-null   int64  
 4   Phone                              4000 non-null   int64  
 5   Contract_period                    4000 non-null   int64  
 6   Group_visits                       4000 non-null   int64  
 7   Age                                4000 non-null   int64  
 8   Avg_additional_charges_total       4000 non-null   float64
 9   Month_to_end_contract              4000 non-null   float64
 10  Lifetime                           4000 non-null   int64  
 11  Avg_class_frequency_total          4000 non-null   float

In [26]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,gender,Near_Location,Partner,Promo_friends,Phone,Contract_period,Group_visits,Age,Avg_additional_charges_total,Month_to_end_contract,Lifetime,Avg_class_frequency_total,Avg_class_frequency_current_month,Churn
0,1,1,1,1,0,6,1,29,14.22747,5.0,3,0.020398,0.0,0
1,0,1,0,0,1,12,1,31,113.202938,12.0,7,1.922936,1.910244,0
2,0,1,1,0,1,1,0,28,129.448479,1.0,2,1.859098,1.736502,0
3,0,1,1,1,1,12,1,33,62.669863,12.0,2,3.205633,3.357215,0
4,1,1,1,1,1,1,0,26,198.362265,1.0,3,1.113884,1.120078,0


In [27]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each; the original index labels are retained.
df = df.drop_duplicates(keep="first")

In [28]:
# Count missing values per column.
print(df.isna().sum())

gender                               0
Near_Location                        0
Partner                              0
Promo_friends                        0
Phone                                0
Contract_period                      0
Group_visits                         0
Age                                  0
Avg_additional_charges_total         0
Month_to_end_contract                0
Lifetime                             0
Avg_class_frequency_total            0
Avg_class_frequency_current_month    0
Churn                                0
dtype: int64


In [29]:
# Columns used as model inputs (a subset of the frame's columns).
selected_features = [
    "Contract_period", "Age", "Lifetime",
    "Avg_class_frequency_current_month", "Avg_class_frequency_total",
    "Avg_additional_charges_total",
    "Group_visits", "Promo_friends",
]

# Feature matrix and the binary target column.
X, y = df[selected_features], df["Churn"]


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
pip install xgboost




In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier

In [33]:
# Candidate classifiers keyed by display name. Seeds are fixed on the
# estimators that are given a random_state here.
models = dict(
    [
        ("Logistic Regression", LogisticRegression(max_iter=1000)),
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
        ("SVM", SVC()),
        ("KNN", KNeighborsClassifier()),
        ("XGBoost", XGBClassifier(random_state=42, eval_metric="logloss")),
    ]
)

In [34]:
# Models trained on the standardized features; every other model is trained
# on the raw feature values.
scaled_model_names = ("Logistic Regression", "SVM", "KNN")

# One [name, accuracy, F1] row per model, scored on the held-out test split.
results = []

for name, model in models.items():
    use_scaled = name in scaled_model_names
    train_inputs = X_train_scaled if use_scaled else X_train
    test_inputs = X_test_scaled if use_scaled else X_test

    # fit() returns the estimator itself, so the prediction can be chained.
    y_pred = model.fit(train_inputs, y_train).predict(test_inputs)

    acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
    results.append([name, acc, f1])

In [36]:
# Tabulate the per-model scores, best F1 first.
results_df = pd.DataFrame(
    results, columns=["Model", "Accuracy", "F1-Score"]
).sort_values("F1-Score", ascending=False)

results_df

Unnamed: 0,Model,Accuracy,F1-Score
3,Gradient Boosting,0.94125,0.886199
6,XGBoost,0.93875,0.882494
4,SVM,0.935,0.872549
2,Random Forest,0.92625,0.857831
0,Logistic Regression,0.92375,0.8523
5,KNN,0.89625,0.79198
1,Decision Tree,0.89,0.789474


In [37]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Base estimator whose hyperparameters are tuned below.
xgb = XGBClassifier(eval_metric="logloss", random_state=42)

# Discrete candidate values for each hyperparameter.
param_dist = dict(
    n_estimators=[100, 200],
    max_depth=[3, 4],
    learning_rate=[0.01, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1],
    min_child_weight=[1, 3],
)

# Evaluate 5 sampled combinations with 5-fold cross-validation, ranked by F1,
# using all available cores; the seed makes the sampling repeatable.
random_search = RandomizedSearchCV(
    xgb,
    param_dist,
    n_iter=5,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# The search is fit on the unscaled training features.
random_search.fit(X_train, y_train)

In [38]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Take the estimator from the search's best parameter setting and score it on
# the held-out test split (unscaled features, matching how it was fit).
best_xgb = random_search.best_estimator_
y_pred = best_xgb.predict(X_test)

print("Best Parameters:", random_search.best_params_)
for label, metric in (("Accuracy:", accuracy_score), ("F1-Score:", f1_score)):
    print(label, metric(y_test, y_pred))
print(classification_report(y_test, y_pred))

Best Parameters: {'subsample': 1.0, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 1.0}
Accuracy: 0.94
F1-Score: 0.8840579710144928
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       588
           1       0.91      0.86      0.88       212

    accuracy                           0.94       800
   macro avg       0.93      0.92      0.92       800
weighted avg       0.94      0.94      0.94       800



In [20]:
import joblib

# Persist the tuned XGBoost estimator and the fitted StandardScaler.
# NOTE(review): best_xgb was fit and evaluated on the unscaled features
# (X_train / X_test); the scaler was only used for the Logistic Regression,
# SVM and KNN comparison runs. Do not apply scaler.pkl to inputs before
# calling the saved XGBoost model.
# NOTE(review): this cell's execution count (20) precedes the cells above
# (22-38); re-run it after the search so the saved model matches the results.
model_path, scaler_path = "final_xgboost_churn_model.pkl", "scaler.pkl"
joblib.dump(best_xgb, model_path)
joblib.dump(scaler, scaler_path)

['scaler.pkl']