#### [Machine Learning Model Evaluations and Predictions]
   * [KNN]
   * [Decision Tree Classifier]
   * [Random Forest]

# Import Libraries

In [12]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score,classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

### Reading CSV data file

In [4]:
df = pd.read_csv('tel_churn.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [5]:
df = df.drop('Unnamed: 0',axis = 1)

In [6]:
df.head(2)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0


# *Model Building Process*

### Split Dependent and Independent variables

In [7]:
x = df.drop('Churn',axis=1)

In [8]:
x.head(2)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0


In [9]:
y = df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

# *1. K-Nearest Neighbors*

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [14]:
# Standardize the features(mean=0,std=1)
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [17]:
# Split Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)

In [18]:
# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)

In [19]:
# Make Prediction
y_pred = knn.predict(X_test)

In [20]:
# Evaluate the model
ac = accuracy_score(y_test,y_pred)
print("Accuracy Score = ",ac)
print()
cm = confusion_matrix(y_test,y_pred)
print("Confusion Matrix = \n",cm)
print()
cr = classification_report(y_test,y_pred)
print("Classification Report = \n",cr)
print()

Accuracy Score =  0.7448471926083866

Confusion Matrix = 
 [[863 170]
 [189 185]]

Classification Report = 
               precision    recall  f1-score   support

           0       0.82      0.84      0.83      1033
           1       0.52      0.49      0.51       374

    accuracy                           0.74      1407
   macro avg       0.67      0.67      0.67      1407
weighted avg       0.74      0.74      0.74      1407




# *2. Decision Tree Classifier*

In [21]:
dt = DecisionTreeClassifier(criterion = 'gini',random_state = 100,max_depth=6, min_samples_leaf=8)

In [22]:
dt.fit(X_train,y_train)

In [23]:
y_pred = dt.predict(X_test)

In [25]:
dt.score(X_test,y_test)

0.7668798862828714

In [26]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1033
           1       0.56      0.59      0.57       374

    accuracy                           0.77      1407
   macro avg       0.70      0.71      0.71      1407
weighted avg       0.77      0.77      0.77      1407



###### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

###### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

###### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

### Let's over-sample the data using SMOTE

In [None]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [31]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2,random_state=0)

In [32]:
dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [33]:
dt_smote.fit(xr_train,yr_train)
yr_predict = dt_smote.predict(xr_test)

In [35]:
# Check Performance Now,
model_score_r = dt_smote.score(xr_test, yr_test)
print('Score = ',model_score_r)
print('Classification Report = \n',metrics.classification_report(yr_test, yr_predict))

Score =  0.9475494411006019
Classification Report = 
               precision    recall  f1-score   support

           0       0.96      0.92      0.94       512
           1       0.94      0.97      0.95       651

    accuracy                           0.95      1163
   macro avg       0.95      0.95      0.95      1163
weighted avg       0.95      0.95      0.95      1163



In [36]:
cm = metrics.confusion_matrix(yr_test, yr_predict)
print('Confusion Matrix = \n',cm)

Confusion Matrix = 
 [[473  39]
 [ 22 629]]


In [38]:
ac = metrics.accuracy_score(yr_test, yr_predict)
print('Accuracy Score = \n',ac)

Accuracy Score = 
 0.9475494411006019


#### Now we can see quite better results, i.e. Accuracy: 94.75 %, and a very good recall, precision & f1 score for minority class.



#### Let's try with some other classifier.

# *3. Random Forest Classifier*

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
rf = RandomForestClassifier(n_estimators=100,criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)

In [41]:
# fit the model
rf.fit(X_train,y_train)

In [42]:
y_pred = rf.predict(X_test)

In [43]:
rf.score(X_test,y_test)

0.7789623312011372

In [44]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86      1033
           1       0.63      0.42      0.50       374

    accuracy                           0.78      1407
   macro avg       0.72      0.66      0.68      1407
weighted avg       0.76      0.78      0.76      1407



In [45]:
print(accuracy_score(y_test, y_pred))

0.7789623312011372


In [46]:
print(confusion_matrix(y_test, y_pred))

[[939  94]
 [217 157]]


### Let's use over-sampled data using SMOTE 

In [None]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [None]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2,random_state=0)

In [47]:
rf_smote=RandomForestClassifier(n_estimators=100,criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)

In [48]:
rf_smote.fit(xr_train,yr_train)
yr_predict = dt_smote.predict(xr_test)

In [49]:
# Check Performance Now,
model_score_r = rf_smote.score(xr_test, yr_test)
print('Score = ',model_score_r)
print('Classification Report = \n',metrics.classification_report(yr_test, yr_predict))

Score =  0.945829750644884
Classification Report = 
               precision    recall  f1-score   support

           0       0.96      0.92      0.94       512
           1       0.94      0.97      0.95       651

    accuracy                           0.95      1163
   macro avg       0.95      0.95      0.95      1163
weighted avg       0.95      0.95      0.95      1163



In [50]:
cm = metrics.confusion_matrix(yr_test, yr_predict)
print('Confusion Matrix = \n',cm)

Confusion Matrix = 
 [[473  39]
 [ 22 629]]


In [51]:
ac = metrics.accuracy_score(yr_test, yr_predict)
print('Accuracy Score = \n',ac)

Accuracy Score = 
 0.9475494411006019


#### With RF Classifier, also we are able to get quite good results, infact better than Decision Tree :)

#### Performing PCA

In [52]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train)
xr_test_pca = pca.transform(xr_test)
explained_variance = pca.explained_variance_ratio_

In [53]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [54]:
model.fit(xr_train_pca,yr_train)

In [55]:
yr_predict_pca = model.predict(xr_test_pca)

In [57]:
model_score_r_pca = model.score(xr_test_pca, yr_test)

In [58]:
print(model_score_r_pca)

0.7214101461736887


In [60]:
print(metrics.classification_report(yr_test, yr_predict_pca))

              precision    recall  f1-score   support

           0       0.72      0.60      0.66       512
           1       0.72      0.81      0.77       651

    accuracy                           0.72      1163
   macro avg       0.72      0.71      0.71      1163
weighted avg       0.72      0.72      0.72      1163



##### With PCA, we couldn't see any better results, hence let's finalise the model which was created by RF Classifier.

# *Conclusion*

In this project, I achieved a 94.75% accuracy using the Random Forest model with the SMOTE technique on the Telecom Customer Churning dataset. Preprocessing and balancing the data were important steps, ensuring reliable and accurate classification. 

# --------------------------------------------Done----------------------------------