In [None]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
print("Libraries imported successfully.....")

In [None]:
#Importing data
df = pd.read_csv("telecom_customer_churn.csv")

print("Data imported successfully.....")

In [None]:
#Separate numerical and categorical columns

from preprolib import myfunctions
num_cols = []
cat_cols = []

ignore_list = ['Zip Code', 'Longitude', 'Latitude', 
                'Customer ID', 'Churn Category', 
                'Churn Reason', 'Customer Status', 'City']

myfunctions.cat_or_num(df, ignore_list, num_cols, cat_cols)

label = 'Customer Status'


In [None]:
plt.figure(figsize=(10, 6 * len(cat_cols)))

for idx, cat_col in enumerate(cat_cols, start=1):
    plt.subplot(len(cat_cols), 1, idx)
    sns.countplot(data=df, x=cat_col, hue='Customer Status', palette='pastel')
    plt.title(f"Count of 'Stayed', 'Churn' or 'Joined' for {cat_col}")
    plt.xlabel(cat_col)
    plt.ylabel('Count')

plt.tight_layout()
from scipy.stats import chi2_contingency
plt.show()

In [None]:
#distributions for all numeric variables 
for i in df[num_cols]:
    plt.hist(df[num_cols][i])
    plt.title(i)
    plt.show()

In [None]:
print(df[num_cols].corr())
sns.heatmap(df[num_cols].corr())

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Define the features and label
features = cat_cols + num_cols
label = 'Customer Status'

# Convert the label column to ordinal categories
label_encoder = OrdinalEncoder()
y = label_encoder.fit_transform(df[label].values.reshape(-1, 1))

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], y, test_size=0.3, random_state=0)

# Define a pipeline for numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define a pipeline for categorical columns
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a ColumnTransformer to apply the pipeline to the numeric and categorical columns
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

# Fit the preprocessor to the training data and transform both the training and test data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print('Training Set: %d, Test Set: %d \n' % (len(X_train), len(X_test)))

# Print the transformed DataFrames
print("X_train_Transformed:\n", X_train_transformed)
print("\nX_test_Transformed:\n", X_test_transformed)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier#
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score


from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

#mcm = confusion_matrix(y_test, predictions)

lr_model = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=10000).fit(X_train_transformed, y_train)
kn_model = KNeighborsClassifier().fit(X_train_transformed, y_train)
dt_model = DecisionTreeClassifier().fit(X_train_transformed, y_train)
rf_model = RandomForestClassifier().fit(X_train_transformed, y_train)
nb_model = GaussianNB().fit(X_train_transformed, y_train)
ab_model = AdaBoostClassifier().fit(X_train_transformed, y_train)
#svc_model = SVC().fit(X_train_transformed,y_train)


models = [
    ('Logistic Regression', lr_model),
    ('K-Nearest Neighbors', kn_model),
    ('Decision Tree', dt_model),
    ('Random Forest', rf_model),
    ('Gaussian Naive Bayes', nb_model),
    ('AdaBoost', ab_model)
]


In [46]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Define lists to store model evaluation results


def evaluate_model(curr_model, model_name, X_test, y_test):
    model_names = []
    recall_scores = []
    f1_scores = []
    precision_scores = []
    accuracy_scores = []
    print(f'\nCurrent model is: {model_name}')
    predictions = curr_model.predict(X_test)

    # Calculate the necessary metrics
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')

    # Append model name and metric scores to the respective lists
    model_names.append(model_name)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print classification report
    print('\nClassification Report:')
    print(classification_report(y_test, predictions))

# Example usage
for model_name, model in models:
    evaluate_model(model, model_name, X_test_transformed, y_test)

# Create a DataFrame to store the model evaluation results
model_evaluation_df = pd.DataFrame({
    'Model': model_names,
    'Recall': recall_scores,
    'F1-score': f1_scores,
    'Precision': precision_scores,
    'Accuracy': accuracy_scores
})
model_evaluation_df.sort_index()


Classification Report:
                 Model    Recall  F1-score  Precision  Accuracy
0  Logistic Regression  0.825367  0.825931   0.827337  0.825367


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)



Classification Report:
                 Model    Recall  F1-score  Precision  Accuracy
0  K-Nearest Neighbors  0.750592  0.749405   0.754928  0.750592

Classification Report:
           Model    Recall  F1-score  Precision  Accuracy
0  Decision Tree  0.773781  0.777225   0.782173  0.773781

Classification Report:
           Model    Recall  F1-score  Precision  Accuracy
0  Random Forest  0.836252   0.83073   0.829757  0.836252

Classification Report:
                  Model    Recall  F1-score  Precision  Accuracy
0  Gaussian Naive Bayes  0.730241  0.745078   0.790164  0.730241

Classification Report:
      Model    Recall  F1-score  Precision  Accuracy
0  AdaBoost  0.751065  0.693595   0.668214  0.751065


  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
# Print the DataFrame with model evaluation results
# Sort the DataFrame by accuracy in descending order
model_evaluation_df.sort_values(by='Accuracy', ascending=False, inplace=True)
model_evaluation_df.reset_index(drop=True, inplace=True)

print("\nModel Evaluation:")
print(model_evaluation_df)


NameError: name 'model_evaluation_df' is not defined