In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the raw churn dataset and preview its first rows.
# The path is relative, so the CSV has to be in the kernel's working directory.
DATA_PATH = './Expresso_churn_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: './Expresso_churn_dataset.csv'

In [None]:
# Summarize the frame: index range, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154048 entries, 0 to 2154047
Data columns (total 19 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         object 
 1   REGION          object 
 2   TENURE          object 
 3   MONTANT         float64
 4   FREQUENCE_RECH  float64
 5   REVENUE         float64
 6   ARPU_SEGMENT    float64
 7   FREQUENCE       float64
 8   DATA_VOLUME     float64
 9   ON_NET          float64
 10  ORANGE          float64
 11  TIGO            float64
 12  ZONE1           float64
 13  ZONE2           float64
 14  MRG             object 
 15  REGULARITY      int64  
 16  TOP_PACK        object 
 17  FREQ_TOP_PACK   float64
 18  CHURN           int64  
dtypes: float64(12), int64(2), object(5)
memory usage: 312.2+ MB


In [None]:
### DATA CLEANING
# Discard the columns that are not used in the rest of this notebook.
columns_to_discard = ['user_id', 'TENURE', 'TOP_PACK']
df = df.drop(columns=columns_to_discard)

In [None]:
# Fill missing values in the categorical (object-dtype) columns with each
# column's most frequent value.
categorical_columns = df.select_dtypes(include=['object']).columns

for col in categorical_columns:
    # mode() returns every tied value; take the first one.
    most_frequent = df[col].mode()[0]
    # Assign the filled Series back to the frame. Calling fillna(inplace=True)
    # on df[col] is a chained assignment: pandas 2.x warns about it and, with
    # Copy-on-Write enabled, it only changes a temporary and leaves df untouched.
    df[col] = df[col].fillna(most_frequent)



In [None]:
# Fill missing values in the numeric columns with each column's median.
# NOTE(review): the int64/float64 selection also picks up the CHURN target if it
# is still in the frame; confirm that imputing it is intended.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_columns:
    # Assign the result back instead of df[col].fillna(..., inplace=True):
    # the chained in-place form triggers a pandas 2.x FutureWarning and does
    # not modify df once Copy-on-Write is enabled.
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Count the missing values left in each column; every entry should be 0 after imputation.
df.isna().sum()

REGION            0
MONTANT           0
FREQUENCE_RECH    0
REVENUE           0
ARPU_SEGMENT      0
FREQUENCE         0
DATA_VOLUME       0
ON_NET            0
ORANGE            0
TIGO              0
ZONE1             0
ZONE2             0
MRG               0
REGULARITY        0
FREQ_TOP_PACK     0
CHURN             0
dtype: int64

In [None]:
# Persist the cleaned frame as CSV, without writing the index as a column.
cleaned_path = 'cleaned_data.csv'
df.to_csv(cleaned_path, index=False)

In [None]:
# Encode the categorical columns as integer codes.
#
# A separate LabelEncoder is fitted for every column and kept in
# `label_encoders` (column name -> fitted encoder). Refitting one shared encoder
# overwrites its classes_ on each iteration, so only the last column's mapping
# would survive the loop.
le = LabelEncoder()  # keeps `le` defined even when there are no categorical columns
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

    # Show the mapping of original labels to encoded numbers.
    # A label's code is its position in the encoder's classes_ array.
    print(f"Encoding for column '{col}':")
    for number, label in enumerate(le.classes_):
        print(f"  '{label}' : {number}")
    print("-" * 40)

# After the loop `le` still refers to the encoder of the last column processed,
# as it did before; use `label_encoders` to reach the encoder of any column.


Encoding for column 'REGION':
  'DAKAR' : 0
  'DIOURBEL' : 1
  'FATICK' : 2
  'KAFFRINE' : 3
  'KAOLACK' : 4
  'KEDOUGOU' : 5
  'KOLDA' : 6
  'LOUGA' : 7
  'MATAM' : 8
  'SAINT-LOUIS' : 9
  'SEDHIOU' : 10
  'TAMBACOUNDA' : 11
  'THIES' : 12
  'ZIGUINCHOR' : 13
----------------------------------------
Encoding for column 'MRG':
  'NO' : 0
----------------------------------------


In [None]:
### Separate the prediction target from the input features
target_column = 'CHURN'
labels = df[target_column]
features = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
### Train the Random Forest Classifier
# 100 trees with a fixed seed so the fitted forest is reproducible.
# n_jobs=-1 fits (and later queries) the trees on all available CPU cores;
# the trees themselves are unchanged because their seeds derive from random_state.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# Fit the model to the training data
rf_classifier.fit(X_train, y_train)
# Predict a class label for every row of the held-out test set
y_pred = rf_classifier.predict(X_test)

In [None]:
# Inspect the first 10 predictions as a quick sanity check.
y_pred[:10]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# Report per-class precision, recall and F1 for the test-set predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.93      0.92    349773
           1       0.66      0.61      0.64     81037

    accuracy                           0.87    430810
   macro avg       0.79      0.77      0.78    430810
weighted avg       0.87      0.87      0.87    430810



In [None]:
# Persist the label encoder and the trained model with pickle.
import pickle


def _save_pickle(obj, path):
    """Serialize `obj` to the file at `path` using pickle."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# NOTE(review): `le` is rebound/refit for each categorical column in the encoding
# loop, so the object saved here only knows the classes of the last column
# encoded; confirm that is enough for whoever loads label_encoder.pkl.
_save_pickle(le, 'label_encoder.pkl')
_save_pickle(rf_classifier, 'rf_classifier.pkl')


In [None]:
# Streamlit web app for churn prediction.
# NOTE: a Streamlit script is meant to be launched with `streamlit run <script>.py`;
# executing this cell directly in the notebook kernel does not serve the app.
import streamlit as st
import pandas as pd
import pickle


# Load the trained model. Unpickling can execute arbitrary code, so only load
# a file produced by this notebook or obtained from a trusted source.
with open('rf_classifier.pkl', 'rb') as model_file:
    rf_classifier = pickle.load(model_file)

# Load the raw dataset; here it is only used to derive the choices and ranges
# offered by the input widgets.
df = pd.read_csv('./Expresso_churn_dataset.csv')

# Create a Streamlit app
st.title("Churn Prediction App")
st.write("This app predicts whether a customer will churn or not based on their data.")


# Create input fields for the user to enter data.
# Offer the regions that actually occur in the dataset (missing values excluded)
# instead of placeholder names, so a selection can be matched to the training data.
region_options = sorted(df["REGION"].dropna().unique())
region = st.selectbox("Region", options=region_options)

# Bounds are cast to plain float so min_value and max_value always share one type.
montant = st.number_input(
    "Montant",
    min_value=float(df["MONTANT"].min()),
    max_value=float(df["MONTANT"].max()),
)
# The recharge-frequency column is spelled FREQUENCE_RECH in the dataset.
frequency_rech = st.slider(
    "Frequency of Recharge",
    min_value=float(df["FREQUENCE_RECH"].min()),
    max_value=float(df["FREQUENCE_RECH"].max()),
)
revenue = st.number_input(
    "Revenue",
    min_value=float(df["REVENUE"].min()),
    max_value=float(df["REVENUE"].max()),
)

# TODO: collect the remaining model features, encode them like the training
# data and pass them to rf_classifier.predict to display a prediction.






2025-04-07 11:53:41.847 
  command:

    streamlit run /Users/samswift/miniconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
