In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import os

In [2]:
# Resolve the project layout relative to the notebook's working directory.
current_dir = os.getcwd()

# The parent of the working directory is treated as the project root.
project_root_dir = os.path.dirname(current_dir)

# Data folders: <root>/data/raw holds the input, <root>/data/processed the cleaned output.
data_dir = os.path.join(project_root_dir, 'data')
raw_dir = os.path.join(data_dir, 'raw')
processed_dir = os.path.join(data_dir, 'processed')
# Results folder: <root>/results
results_dir = os.path.join(project_root_dir, 'results')
# Docs folder: <root>/docs
docs_dir = os.path.join(project_root_dir, 'docs')

# Create every project folder that does not exist yet.
# makedirs also creates missing parents, so data_dir is covered by raw_dir/processed_dir;
# exist_ok=True makes the cell safe to re-run.
for directory in (raw_dir, processed_dir, results_dir, docs_dir):
    os.makedirs(directory, exist_ok=True)

## Read in the data

In [3]:

# Path to the raw Excel workbook inside the raw-data folder
telecom_data_filename = os.path.join(raw_dir, "Telecom Churn Rate Dataset.xlsx")

# Load the workbook; cells containing '?' are read as missing values (NaN)
telecom_df = pd.read_excel(
    telecom_data_filename,
    na_values='?'
)

# Trim leading/trailing whitespace from every string cell; non-string cells pass through unchanged.
# Series.map is applied column by column instead of DataFrame.applymap, which is
# deprecated since pandas 2.1 and emits a FutureWarning there.
telecom_df = telecom_df.apply(
    lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
)

# Preview the first ten rows
telecom_df.head(10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,One year,No,Mailed check,56.95,1889.5,0,0,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,0,0,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,3,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,0,0,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,0,0,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,0,0,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,Month-to-month,No,Mailed check,29.75,301.9,0,0,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,0,2,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,0,0,No


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple
telecom_df.shape

(7043, 23)

In [5]:
# Per-column summary: column name, non-null count and dtype
telecom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Data cleaning

In [6]:

# Path to the raw Excel workbook inside the raw-data folder
telecom_data_filename = os.path.join(raw_dir, "Telecom Churn Rate Dataset.xlsx")

# Reload the data from disk so the cleaning steps below start from the raw file
telecom_df = pd.read_excel(
    telecom_data_filename,
    na_values='?'     # treat '?' cells as missing values
)

# Strip leading/trailing spaces from every string cell; non-string cells pass through unchanged.
# Series.map is applied column by column instead of DataFrame.applymap, which is
# deprecated since pandas 2.1 and emits a FutureWarning there.
telecom_df = telecom_df.apply(
    lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
)

# Replace the column labels with a consistent naming scheme.
# NOTE: the assignment is positional, so it relies on the workbook keeping its
# current column order; pandas raises ValueError if the number of names differs
# from the number of columns, but a reordering would go unnoticed.
telecom_df.columns = [
    "customer_ID", "gender", "Senior_Citizen", "Partner", "Dependents",
    "tenure", "Phone_Service", "MultipleLines", "Internet_Service",
    "Online_Security", "Online_Backup", "Device_Protection", "Tech_Support",
    "Streaming_TV", "Streaming_Movies", "Contract", "Paper_less_Billing",
    "Payment_Method", "Monthly_Charges", "Total_Charges", "num_Admin_Tickets",
    "num_Tech_Tickets", "Churn"
]




In [7]:
# Preview the first ten rows to check the new column labels
telecom_df.head(10)

Unnamed: 0,customer_ID,gender,Senior_Citizen,Partner,Dependents,tenure,Phone_Service,MultipleLines,Internet_Service,Online_Security,...,Streaming_TV,Streaming_Movies,Contract,Paper_less_Billing,Payment_Method,Monthly_Charges,Total_Charges,num_Admin_Tickets,num_Tech_Tickets,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,One year,No,Mailed check,56.95,1889.5,0,0,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,0,0,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,3,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,0,0,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,0,0,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,0,0,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,Month-to-month,No,Mailed check,29.75,301.9,0,0,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,0,2,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,0,0,No


## Understanding the dataset

In [8]:
# Spell out the 'DSL' abbreviation in Internet_Service; every other value is left as it is
telecom_df['Internet_Service'] = telecom_df['Internet_Service'].replace(
    to_replace='DSL',
    value='Digital Subscriber Line',
)

In [9]:
# Display the frame to check the relabelled Internet_Service values
# (pandas truncates the rendering to the first and last rows)
telecom_df

Unnamed: 0,customer_ID,gender,Senior_Citizen,Partner,Dependents,tenure,Phone_Service,MultipleLines,Internet_Service,Online_Security,...,Streaming_TV,Streaming_Movies,Contract,Paper_less_Billing,Payment_Method,Monthly_Charges,Total_Charges,num_Admin_Tickets,num_Tech_Tickets,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,Digital Subscriber Line,No,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,Digital Subscriber Line,Yes,...,No,No,One year,No,Mailed check,56.95,1889.5,0,0,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,Digital Subscriber Line,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,0,0,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,Digital Subscriber Line,Yes,...,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,0,3,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,0,0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,Digital Subscriber Line,Yes,...,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,0,0,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,0,5,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,Digital Subscriber Line,Yes,...,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,0,0,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,0,0,Yes


## Deal with Missing Values

In [10]:
# Number of missing (NaN) entries in each column
missing_per_column = telecom_df.isna().sum(axis="index")
missing_per_column

customer_ID           0
gender                0
Senior_Citizen        0
Partner               0
Dependents            0
tenure                0
Phone_Service         0
MultipleLines         0
Internet_Service      0
Online_Security       0
Online_Backup         0
Device_Protection     0
Tech_Support          0
Streaming_TV          0
Streaming_Movies      0
Contract              0
Paper_less_Billing    0
Payment_Method        0
Monthly_Charges       0
Total_Charges         0
num_Admin_Tickets     0
num_Tech_Tickets      0
Churn                 0
dtype: int64

In [11]:
# Select the rows that contain at least one missing value
# (the result is an empty frame when nothing is missing)
has_missing = telecom_df.isna().any(axis="columns")
missing_rows = telecom_df.loc[has_missing]
missing_rows

Unnamed: 0,customer_ID,gender,Senior_Citizen,Partner,Dependents,tenure,Phone_Service,MultipleLines,Internet_Service,Online_Security,...,Streaming_TV,Streaming_Movies,Contract,Paper_less_Billing,Payment_Method,Monthly_Charges,Total_Charges,num_Admin_Tickets,num_Tech_Tickets,Churn


In [12]:
# Grand total of missing cells: per-column counts summed over all columns
total_missing_cells = telecom_df.isna().sum().sum()
total_missing_cells

0

In [13]:
# Number of rows that exactly repeat an earlier row across all columns
telecom_df.duplicated().sum()

0

## Save the clean dataset

In [14]:
pip install openpyxl


Note: you may need to restart the kernel to use updated packages.


In [16]:
# Make sure the output folder is present before writing (no-op when it already exists)
os.makedirs(processed_dir, exist_ok=True)

# Write the cleaned frame to the processed-data folder, leaving out the index column.
# NOTE(review): 'Tecom.xlsx' looks like a typo for 'Telecom.xlsx' — confirm that nothing
# downstream reads this exact name before renaming it.
final_file = os.path.join(processed_dir, 'Tecom.xlsx')
telecom_df.to_excel(excel_writer=final_file, index=False)

In [None]:
pwd