# Importing Libraries

In [2]:
import pandas as pd  # Data manipulation
import numpy as np   # Numerical operations (optional)
from sklearn.model_selection import train_test_split  # Data splitting
from sklearn.preprocessing import StandardScaler  # Feature scaling
from sklearn.linear_model import LogisticRegression  # Logistic regression model
from sklearn.metrics import accuracy_score  # Evaluation metric
from sklearn.utils import resample  # Upsampling/Downsampling


# Load the Dataset


In [5]:
churnData = pd.read_csv('/Customer-Churn.csv')
churnData.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [6]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [7]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [9]:
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(churnData['TotalCharges'].mean())
churnData.isnull().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
OnlineSecurity,0
OnlineBackup,0
DeviceProtection,0
TechSupport,0


In [10]:
features = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
X = churnData[features]
y = churnData['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)


# Model

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7842441447835344

In [14]:
class_distribution = y.value_counts(normalize=True) * 100
class_distribution

Unnamed: 0_level_0,proportion
Churn,Unnamed: 1_level_1
0,73.463013
1,26.536987


# imbalance

In [15]:
from sklearn.utils import resample

# Combine features and target
data_combined = pd.concat([pd.DataFrame(X_scaled, columns=features), y.reset_index(drop=True)], axis=1)

# Separate majority and minority classes
majority = data_combined[data_combined['Churn'] == 0]
minority = data_combined[data_combined['Churn'] == 1]

# Upsample minority class
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)

# Combine and separate
data_upsampled = pd.concat([majority, minority_upsampled])
X_upsampled = data_upsampled[features]
y_upsampled = data_upsampled['Churn']


In [16]:
# Downsample majority class
majority_downsampled = resample(majority, replace=False, n_samples=len(minority), random_state=42)

# Combine and separate
data_downsampled = pd.concat([majority_downsampled, minority])
X_downsampled = data_downsampled[features]
y_downsampled = data_downsampled['Churn']


In [17]:
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)
logreg_up = LogisticRegression(random_state=42)
logreg_up.fit(X_train_up, y_train_up)

y_pred_up = logreg_up.predict(X_test_up)
accuracy_up = accuracy_score(y_test_up, y_pred_up)
accuracy_up


0.7323671497584541

In [18]:
X_train_down, X_test_down, y_train_down, y_test_down = train_test_split(X_downsampled, y_downsampled, test_size=0.2, random_state=42)
logreg_down = LogisticRegression(random_state=42)
logreg_down.fit(X_train_down, y_train_down)

y_pred_down = logreg_down.predict(X_test_down)
accuracy_down = accuracy_score(y_test_down, y_pred_down)
accuracy_down


0.7553475935828877

In [21]:
# Train and evaluate on upsampled data
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)
logreg_up = LogisticRegression(random_state=42)
logreg_up.fit(X_train_up, y_train_up)
y_pred_up = logreg_up.predict(X_test_up)
accuracy_up = accuracy_score(y_test_up, y_pred_up)

# Train and evaluate on downsampled data
X_train_down, X_test_down, y_train_down, y_test_down = train_test_split(X_downsampled, y_downsampled, test_size=0.2, random_state=42)
logreg_down = LogisticRegression(random_state=42)
logreg_down.fit(X_train_down, y_train_down)
y_pred_down = logreg_down.predict(X_test_down)
accuracy_down = accuracy_score(y_test_down, y_pred_down)

accuracy_up, accuracy_down


(0.7323671497584541, 0.7553475935828877)

The balanced datasets provide more reliable results, even though the accuracy is slightly lower.
