In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

: 

# *IMPORTING DATASET AND BASIC ANALYSIS*

In [None]:
# Load the Telco customer-churn CSV (path is relative to the notebook's working directory)
# and preview the first rows.
data=pd.read_csv("data/WA_Fn-UseC_-Telco-Customer-Churn.csv")
data.head()

In [None]:
# First rows of the dataset (same preview as the loading cell above).
data.head()

In [None]:
# Last rows of the dataset, to check that the file ends cleanly.
data.tail()

In [None]:
# Summary statistics; with default arguments pandas reports the numeric columns only.
data.describe()

In [None]:
# Column dtypes, non-null counts and memory footprint.
data.info()

In [None]:
# Column labels of the raw frame.
data.columns

In [None]:
# dtype of every column; object columns are treated as categorical in the cells below.
data.dtypes

Relative frequency (proportion) of each value in every categorical column

In [None]:
# Print the share of each value for every object-typed column,
# skipping the customerID column.
for column in data.columns:
    if column=='customerID' or data[column].dtype!=object:
        continue
    print(data[column].value_counts(normalize=True))
    print()
        

# Handling missing data

In [None]:
# Convert TotalCharges to a numeric dtype; errors='coerce' turns any value that
# cannot be parsed as a number into NaN, which is imputed in a later cell.
data["TotalCharges"]=pd.to_numeric(data["TotalCharges"],errors='coerce')
data["TotalCharges"]

In [None]:
# Distinct values of SeniorCitizen and how often each occurs.
data['SeniorCitizen'].value_counts()

In [None]:
# Store SeniorCitizen as strings so it is picked up with the object-typed
# (categorical) columns rather than the numeric ones further down.
senior_as_text = data['SeniorCitizen'].astype(str)
data['SeniorCitizen'] = senior_as_text

In [None]:
# Number of missing values per column (TotalCharges may now contain NaN from the coercion above).
data.isna().sum()

In [None]:
# Impute missing TotalCharges with the column median, then re-check the NaN counts.
# NOTE(review): the median is taken over the full dataset, i.e. before the
# train/test split performed later in the notebook.
total_charges_median=data["TotalCharges"].median()
data["TotalCharges"]=data["TotalCharges"].fillna(total_charges_median)
data.isna().sum()

# Check class balance

In [None]:
# Proportion of each Churn label, used to judge how imbalanced the target is.
churn_labels=data['Churn']
yes=churn_labels.value_counts(normalize=True)
yes

# Univariate Analysis and plots

In [None]:
# Distribution of MonthlyCharges, split by churn outcome.
sns.histplot(
    data,
    x='MonthlyCharges',
    hue='Churn',
    bins=40,
    kde=True,
    palette='Set1',
)

In [None]:
# Distribution of tenure, split by churn outcome.
sns.histplot(
    data,
    x='tenure',
    hue='Churn',
    bins=40,
    kde=True,
    palette='Set3',
)

In [None]:
# Distribution of TotalCharges, split by churn outcome.
sns.histplot(
    data,
    x='TotalCharges',
    hue='Churn',
    bins=50,
    kde=True,
    palette='Set2',
)

# Bivariate Analysis 

**Categorical vs Target**

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that gender.
# (normalize=True would instead give each cell's share of the whole dataset,
# which mixes category size with churn rate.)
pd.crosstab(data['gender'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that Dependents group.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['Dependents'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that SeniorCitizen group.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['SeniorCitizen'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that Partner group.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['Partner'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that PhoneService group.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['PhoneService'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that DeviceProtection group.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['DeviceProtection'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that OnlineBackup group.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['OnlineBackup'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that PaperlessBilling group.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['PaperlessBilling'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that TechSupport group.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['TechSupport'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that Contract type.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['Contract'],data['Churn'],normalize='index')

In [None]:
# Row-normalised: each row sums to 1 and shows the churn rate within that PaymentMethod.
# (normalize=True would instead give each cell's share of the whole dataset.)
pd.crosstab(data['PaymentMethod'],data['Churn'],normalize='index')

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by the Churn label.
columns=[*data.select_dtypes(include=np.number).columns,'Churn']
sns.pairplot(data[columns],hue='Churn')

**Numeric vs Target**

In [None]:
# Average MonthlyCharges for each Churn label.
data['MonthlyCharges'].groupby(data['Churn']).mean()


In [None]:
# Average tenure for each Churn label.
data['tenure'].groupby(data['Churn']).mean()

In [None]:
# Average TotalCharges for each Churn label.
data['TotalCharges'].groupby(data['Churn']).mean()

**Categorical vs Numeric**

In [None]:
# Mean TotalCharges for every (Churn, PaperlessBilling) combination.
pd.crosstab(
    index=data['Churn'],
    columns=data['PaperlessBilling'],
    aggfunc='mean',
    values=data['TotalCharges'],
)

In [None]:
# Mean TotalCharges for every (Churn, MultipleLines) combination.
pd.crosstab(
    index=data['Churn'],
    columns=data['MultipleLines'],
    aggfunc='mean',
    values=data['TotalCharges'],
)

In [None]:
# Mean TotalCharges for every (Churn, StreamingMovies) combination.
pd.crosstab(
    index=data['Churn'],
    columns=data['StreamingMovies'],
    aggfunc='mean',
    values=data['TotalCharges'],
)

**Target and Feature splitting**

In [None]:
# Target is the Churn column; every other column is a candidate feature.
y=data["Churn"]
X=data.drop(columns="Churn")

**Train Test Splitting**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 35% of the rows for testing. stratify=y keeps the Churn class
# proportions the same in both partitions, so an imbalanced target is not
# skewed further by an unlucky random split.
X_train,X_test,Y_train,Y_test=train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=12,
    stratify=y,
)

**Dropping the customerID column: it is only a row identifier, so it carries no information about whether a customer churns**

In [None]:
# Remove the customerID column from the training features.
# Re-binding (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: running it a second time no longer raises KeyError.
X_train=X_train.drop(columns="customerID",errors="ignore")

# Split up numeric and categoric data

In [None]:
# Column labels of the numeric and the object-typed (categorical) training features.
numeric_cols=X_train.select_dtypes(include=np.number).columns
non_numeric_cols=X_train.select_dtypes(include=object).columns

# Explicit copies: both frames are modified in later cells (capping, column drops),
# and working on independent copies avoids SettingWithCopyWarning and any doubt
# about whether X_train itself is being changed.
cat_data=X_train[non_numeric_cols].copy()
num_data=X_train[numeric_cols].copy()

**Check for skewness and Kurtosis**

In [None]:
num_data.skew()
# Rule of thumb: a transformation is needed for heavily skewed columns (skew above 1 or below -1).
# Author's conclusion for this data: no transformation needed.

In [None]:
num_data.kurt()
# Rule of thumb used here: leptokurtic (heavy-tailed) columns need a transformation;
# platykurtic columns (kurt < 0) are left untouched.
# NOTE(review): pandas' kurt() reports excess (Fisher) kurtosis, where a normal
# distribution scores 0, so the classic "greater than 3" cutoff corresponds to
# "greater than 0" on this scale - confirm which threshold is intended.

In [None]:
# Outlier check: box plots of the numeric training columns.
sns.boxplot(num_data)
# Author's observation: a few outliers are present in TotalCharges.

In [None]:
# Capping bounds: 5th and 95th percentile of TotalCharges, computed on the
# training data only so the test set does not influence them.
lower_limit, upper_limit = num_data['TotalCharges'].quantile([0.05, 0.95])

# Apply the same bounds to train and test. assign() builds new frames instead of
# writing into a slice, which avoids SettingWithCopyWarning on both objects.
num_data = num_data.assign(
    TotalCharges=num_data['TotalCharges'].clip(lower=lower_limit, upper=upper_limit)
)
X_test = X_test.assign(
    TotalCharges=X_test['TotalCharges'].clip(lower=lower_limit, upper=upper_limit)
)

In [None]:
# Re-plot TotalCharges to confirm the effect of the capping applied above.
sns.boxplot(num_data['TotalCharges'])

# Cardinality Analysis

In [None]:
# Cardinality analysis: number of distinct values in each categorical column.
# Built in one vectorised step; growing a DataFrame with pd.concat inside a loop
# is quadratic, leaves every row with index 0, and warns about concatenating an
# empty frame on recent pandas versions.
cardinal_vals=(
    cat_data.nunique()
    .rename_axis('column_name')
    .reset_index(name='unique_count_entries')
)
cardinal_vals
# Few unique values per column, so one-hot encoding is used for all of them.

# Encoding categorical variables
**Label Encoder for target variable**

**One Hot Encoder for categorical variable**

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoding of the categorical training columns.
# handle_unknown='ignore' makes transform() emit all-zero indicator columns for a
# category that was not seen during fit, instead of raising when the test set is
# encoded later. The training output is unaffected by this setting.
ohe=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
cat_data_encoded=ohe.fit_transform(cat_data)
encoded_cols=ohe.get_feature_names_out(cat_data.columns)
# Wrap the array in a DataFrame that keeps the original row index and gets
# one named column per (feature, category) pair.
cat_data_encoded=pd.DataFrame(cat_data_encoded,columns=encoded_cols,index=cat_data.index)
cat_data_encoded

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the target classes from the training labels, then replace the labels
# with their integer codes. The fitted encoder is reused for the test labels.
label=LabelEncoder()
label.fit(Y_train)
Y_train=label.transform(Y_train)

# Feature Selection

**Chi 2 test for categorical**

**Pearson Correlation for numeric**

In [None]:
from sklearn.feature_selection import chi2

# Chi-squared test of each one-hot column against the encoded target.
Chi_scores,chi2_pvalues=chi2(cat_data_encoded,Y_train)
# One row per encoded column, holding its p-value.
pvalues=pd.DataFrame({'pvalue':chi2_pvalues},index=cat_data_encoded.columns)
pvalues.sort_values('pvalue',ascending=False)

In [None]:
# Encoded columns whose chi-squared p-value is below 0.05 are kept as significant.
# (A NaN p-value compares False and therefore lands in the non-significant set.)
significant_mask=pvalues['pvalue']<0.05
imp_cat=set(pvalues.index[significant_mask])
# Everything that was tested but is not significant. Taken from the p-value table
# rather than from cat_data_encoded.columns, so the result stays the same even
# after the columns have been dropped from that frame.
non_imp_cat=set(pvalues.index).difference(imp_cat)
non_imp_cat

**Dropping unnecessary columns**

In [None]:
# Remove the non-significant one-hot columns from the training frame (in place).
cat_data_encoded.drop(columns=list(non_imp_cat),inplace=True)

**Correlation Matrix**

In [None]:
# Pairwise Pearson correlation between the numeric training features.
corr=num_data.corr(method='pearson')
corr

In [None]:
# Annotated heat map of the correlation matrix.
sns.heatmap(data=corr,cmap="coolwarm",annot=True)

In [None]:
# Correlation-based selection among the numeric features.
# `corr` holds feature-to-feature correlations, so a high |r| marks redundancy,
# not importance. A feature that is strongly correlated with a feature that comes
# before it in the matrix is treated as redundant and dropped; the rest are kept.
# (Previously the redundant features were the only ones kept, and the independent
# ones were discarded.)
threshold=0.5

# Positions of features redundant with an earlier feature (lower triangle scan).
redundant_positions=set()
for i in range(len(corr)):
    for j in range(i):
        if abs(corr.iloc[i,j])>=threshold:
            redundant_positions.add(i)

# Names come from `corr` itself so they always match the matrix that was scanned.
non_imp_feat={corr.columns[pos] for pos in redundant_positions}
imp_feat=[name for name in corr.columns if name not in non_imp_feat]
# Positions of the retained features, kept under the original variable name.
imp_nums={pos for pos in range(len(corr)) if pos not in redundant_positions}
imp_feat

**Dropping unnecessary columns**

In [None]:
# Remove the numeric columns flagged in non_imp_feat (in place), then show the row count.
num_data.drop(columns=list(non_imp_feat),inplace=True)
len(num_data)

# Scaling numeric variables

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training numerics and keep the result as a DataFrame
# with the original column labels and row index.
ss=StandardScaler()
num_encoded=pd.DataFrame(
    ss.fit_transform(num_data),
    index=num_data.index,
    columns=num_data.columns,
)
num_encoded

# Combining processed numeric and categoric data

In [None]:
# Final training matrix: selected one-hot columns next to the scaled numeric columns,
# aligned on the row index.
train_parts=[cat_data_encoded,num_encoded]
X_train_final=pd.concat(train_parts,axis="columns")
X_train_final

# Processing test data

In [None]:
# Encode the test categoricals with the encoder fitted on the training data.
cat_test=X_test[non_numeric_cols]
encoded_columns=ohe.get_feature_names_out(cat_test.columns)
cat_test_encoded=pd.DataFrame(
    ohe.transform(cat_test),
    index=cat_test.index,
    columns=encoded_columns,
)
cat_test_encoded

In [None]:
# Encode the test labels with the encoder that was fitted on the training labels.
# Y_test is overwritten, so this cell is meant to run once per kernel session.
Y_test=label.transform(Y_test)

In [None]:
# Preview of the raw (not yet encoded or scaled) test features.
X_test.head()

In [None]:
# One-hot encoded test categoricals, before the non-significant columns are removed.
cat_test_encoded

In [None]:
# Drop the same non-significant one-hot columns that were removed from the training frame.
cat_test_encoded.drop(columns=list(non_imp_cat),inplace=True)
cat_test_encoded

In [None]:
# Numeric test columns, reduced to the features listed in imp_feat.
dropped_numeric=set(numeric_cols).symmetric_difference(imp_feat)
num_test=X_test[numeric_cols].drop(columns=list(dropped_numeric))
num_test

In [None]:
# Scale the test numerics with the scaler fitted on the training data.
num_test_scaled=pd.DataFrame(
    ss.transform(num_test),
    index=num_test.index,
    columns=num_test.columns,
)
num_test_scaled

In [None]:
# Final test matrix: selected one-hot columns next to the scaled numeric columns,
# aligned on the row index.
test_parts=[cat_test_encoded,num_test_scaled]
X_test_final=pd.concat(test_parts,axis="columns")
X_test_final

# Model Training and Tuning using Logistic Regression

In [None]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Values shared by the sub-grids of the search space.
c_values=np.logspace(-4,4,10)
class_weights=[None,'balanced']

# One sub-grid per penalty type, each listing the solvers tried with that penalty.
param_grid=[
    dict(C=c_values,penalty=['l1'],solver=['liblinear','saga'],class_weight=class_weights),
    dict(C=c_values,penalty=['l2'],solver=['lbfgs','liblinear','saga'],class_weight=class_weights),
    dict(penalty=[None],solver=['saga','lbfgs'],class_weight=class_weights),
]

# 6-fold cross-validated grid search, scored on accuracy and run on all cores.
# NOTE(review): accuracy is the selection metric although class_weight='balanced'
# is part of the grid - confirm this is the metric that should drive the choice.
logreg=LogisticRegression(random_state=12,max_iter=10000)
grid_search=GridSearchCV(logreg,param_grid,scoring='accuracy',cv=6,n_jobs=-1)

# Fit on the training data and report the winning configuration.
grid_search.fit(X_train_final,Y_train)
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the refitted best estimator on the held-out test set.
y_pred=grid_search.predict(X_test_final)
acc=accuracy_score(Y_test,y_pred)
print("Test Accuracy:", acc)
print("\nConfusion Matrix:\n", confusion_matrix(Y_test, y_pred))
print("\nClassification Report:\n", classification_report(Y_test, y_pred))