In [1]:
import pandas as pd

In [3]:
# Read the Telco customer-churn workbook into a DataFrame.
# NOTE(review): this is an absolute path (presumably a Colab upload location);
# confirm it before running the notebook in another environment.
df = pd.read_excel(io='/content/Telco_customer_churn.xlsx')

In [4]:
# Peek at the first two rows to check that the workbook loaded as expected.
df.head(n=2)

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved


In [5]:
# Columns retained for modelling: the charge/tenure fields, the service and
# contract descriptors, and finally the 'Churn Value' column.
columns = (
    ['Total Charges', 'Tenure Months', 'Monthly Charges']
    + ['Contract', 'Payment Method']
    + ['Online Security', 'Tech Support', 'Internet Service']
    + ['Dependents']
    + ['Churn Value']
)

# Narrow the frame to exactly those columns, in the order listed above.
df = df.loc[:, columns]

In [6]:
# Display the column index to confirm which columns the frame now holds.
df.columns

Index(['Total Charges', 'Tenure Months', 'Monthly Charges', 'Contract',
       'Payment Method', 'Online Security', 'Tech Support', 'Internet Service',
       'Dependents', 'Churn Value'],
      dtype='object')

In [8]:
# Count missing values per column.
# Note: isna() only detects None/NaN/NaT; blank strings such as ' ' are not
# counted as missing here.
df.isna().sum()

Unnamed: 0,0
Total Charges,0
Tenure Months,0
Monthly Charges,0
Contract,0
Payment Method,0
Online Security,0
Tech Support,0
Internet Service,0
Dependents,0
Churn Value,0


In [46]:
# Show five randomly chosen rows. No random_state is passed, so the rows
# displayed will differ from run to run.
df.sample(n=5)

Unnamed: 0,Total Charges,Tenure Months,Monthly Charges,Contract,Payment Method,Online Security,Tech Support,Internet Service,Dependents,Churn Value
1335,673.1,7,94.7,Month-to-month,Credit card (automatic),Yes,No,Fiber optic,No,1
3590,274.35,5,55.8,Month-to-month,Mailed check,Yes,No,DSL,No,0
4153,4310.35,71,61.4,Two year,Credit card (automatic),No,Yes,DSL,No,0
4592,235.0,3,69.15,Month-to-month,Bank transfer (automatic),No,No,DSL,Yes,0
1670,655.85,30,19.65,Month-to-month,Bank transfer (automatic),No internet service,No internet service,No,No,1


In [7]:
# Split the frame into the target series (y) and the remaining feature
# columns (X). drop() returns a new frame, so df itself is left unchanged.
y = df['Churn Value']
X = df.drop(columns=['Churn Value'])

In [20]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# 'Total Charges' is meant to be numeric but may be read as object dtype
# (e.g. when the column contains blank strings), so force it into the
# numerical group. The override is applied only when the column is actually
# present in X; otherwise a non-existent column name would end up in
# numerical_cols and break any later column selection.
if 'Total Charges' in X.columns:
    if 'Total Charges' in categorical_cols:
        categorical_cols.remove('Total Charges')
    if 'Total Charges' not in numerical_cols:
        numerical_cols.append('Total Charges')

print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

Numerical columns: ['Tenure Months', 'Monthly Charges', 'Total Charges']
Categorical columns: ['Contract', 'Payment Method', 'Online Security', 'Tech Support', 'Internet Service', 'Dependents']


In [29]:
import numpy as np

In [30]:
# Coerce every numerical column to a numeric dtype in a single pass.
# errors='coerce' turns any value that cannot be parsed as a number --
# including blank strings such as ' ' -- into NaN, so a separate
# blank-to-NaN replacement step is unnecessary. The resulting NaNs are left
# in place for a later imputation step to fill.
X[numerical_cols] = X[numerical_cols].apply(pd.to_numeric, errors='coerce')


In [33]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [34]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),  # handles unexpected NaN
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: one-hot encode; handle_unknown='ignore' means categories
# not seen during fit do not raise at transform time.
categorical_steps = [('encoder', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [35]:
# End-to-end model: preprocessing followed by a logistic-regression classifier.
# The step names ('preprocessor', 'logreg') are what the hyperparameter grid
# keys refer to, so they must stay as they are.
classifier = LogisticRegression(max_iter=1000)
pipe = Pipeline(steps=[('preprocessor', preprocessor), ('logreg', classifier)])

In [36]:
# Hyperparameter grid for the 'logreg' pipeline step; keys follow the
# '<step name>__<parameter>' convention.
regularisation_strengths = [0.01, 0.1, 1, 10]
l2_capable_solvers = ['lbfgs', 'liblinear']  # both support l2

param_grid = {
    'logreg__C': regularisation_strengths,
    'logreg__penalty': ['l2'],
    'logreg__solver': l2_capable_solvers,
}

In [37]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [38]:
# 5-fold cross-validated search over param_grid, ranking candidates by accuracy.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [39]:
# Report the winning hyperparameter combination and its cross-validated score.
for label, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best CV Score:", grid_search.best_score_),
):
    print(label, value)

Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2', 'logreg__solver': 'lbfgs'}
Best CV Score: 0.8051090542016481


In [40]:
# Score the fitted search object on the held-out test split.
test_score = grid_search.score(X=X_test, y=y_test)
print("Test Accuracy:", test_score)

Test Accuracy: 0.801277501774308


In [41]:
import joblib

In [42]:
# Persist the best pipeline found by the search (preprocessing + classifier)
# so it can be reloaded for inference without retraining.
best_pipeline = grid_search.best_estimator_
joblib.dump(best_pipeline, "churn_pipeline.pkl")
print("Pipeline saved as churn_pipeline.pkl")

Pipeline saved as churn_pipeline.pkl


In [43]:
# Reload the persisted pipeline from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a source you trust.
pipeline = joblib.load("churn_pipeline.pkl")

In [44]:
# A single hypothetical customer, using the same feature columns the model
# was trained on.
new_data = pd.DataFrame([{
    'Total Charges': 100.5,
    'Tenure Months': 12,
    'Monthly Charges': 45.0,
    'Contract': 'Month-to-month',
    'Payment Method': 'Electronic check',
    'Online Security': 'No',
    'Tech Support': 'No',
    'Internet Service': 'DSL',
    'Dependents': 'No'
}])

# Make sure 'Total Charges' is numeric; unparseable entries become NaN.
# No fillna is done here on purpose:
#   * `new_data[col].fillna(..., inplace=True)` is a chained in-place call that
#     raises a FutureWarning and stops working in pandas 3.0;
#   * the median of the inference batch is the wrong fill value (and is NaN
#     for a single missing row) -- the pipeline's own median imputer, fitted on
#     the training data, is what should fill any NaN at predict time.
new_data['Total Charges'] = pd.to_numeric(new_data['Total Charges'], errors='coerce')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_data['Total Charges'].fillna(new_data['Total Charges'].median(), inplace=True) # Fill potential NaNs


In [45]:
# Run the reloaded pipeline on the example customer and show the predicted
# churn label.
prediction = pipeline.predict(X=new_data)
print("Prediction:", prediction)

Prediction: [0]
