In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw Telco customer-churn export (path is relative to the
# notebook's working directory).
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()

# Last expression of the cell: rendered by the notebook as a rich table.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
# Clean the loaded frame. Every step below is idempotent, so re-running this
# cell leaves `df`, `X` and `y` unchanged. (Re-applying a plain
# "'Yes' -> 1, else 0" mapping to an already-binary Churn column would turn
# every label into 0.)

# Convert TotalCharges to numeric, coercing unparseable values to NaN, then
# drop rows with any missing value for simplicity.
df = df.assign(
    TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce')
).dropna()

# Convert the target 'Churn' to binary (1 for 'Yes', 0 otherwise), but only
# while it still holds the raw labels; a numeric column is already encoded.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df = df.assign(Churn=df['Churn'].eq('Yes').astype('int64'))

# Define features (X) and target (y); customerID and the target itself are
# excluded from the feature matrix.
X = df.drop(columns=['customerID', 'Churn'])
y = df['Churn']

In [5]:
# One-hot encode the categorical feature columns. drop_first=True omits the
# first level of each encoded column.
X_encoded = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic-regression classifier on the training split. fit() returns
# the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Evaluate on the held-out split: predict labels, then compare them with the
# true labels to get the fraction classified correctly.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.7875


In [13]:
import joblib

# Final "production" fit: re-encode the full feature set and train on ALL
# rows, so the shipped model learns from every example we have.
final_X_encoded = pd.get_dummies(X, drop_first=True)
final_model = LogisticRegression(max_iter=5000).fit(final_X_encoded, y)

# Persist the fitted estimator.
joblib.dump(final_model, 'churn_model.joblib')

# Persist the training column names in their exact order. This is CRITICAL:
# new data must be arranged into these same columns before prediction.
# NOTE(review): pd.get_dummies keeps no fitted state, and drop_first=True drops
# whichever level comes first in the frame being encoded -- confirm the
# inference code reindexes to these columns and encodes small batches safely.
model_columns = final_X_encoded.columns.tolist()
joblib.dump(model_columns, 'model_columns.joblib')

print("Model and columns saved to disk.")

Model and columns saved to disk.
