#### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
### AUTHOR - CHANDAN D. CHAUDHARI

#### DATASETLINK - https://raw.githubusercontent.com/chandanc5525/Dataset/main/Data/telecom_churn.csv 
#### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [1]:
!pip install klib

Defaulting to user installation because normal site-packages is not writeable


### PROBLEM STATEMENT

Customer churn is a major problem and one of the most important concerns for large companies. Because churn directly affects revenue, especially in the telecom field, companies seek ways to predict which customers are likely to churn. Therefore, finding the factors that increase customer churn is important so that the necessary actions can be taken to reduce it.

In [2]:
# Importing Python Libraries
import numpy as np
import pandas as pd

# Importing Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and runtime
# warnings raised by pandas/seaborn — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the telecom churn dataset from the raw GitHub location below.
# URL holds the base directory; the CSV file name is appended at read time.

URL = "https://raw.githubusercontent.com/chandanc5525/Dataset/main/Data/"

df = pd.read_csv(URL + 'telecom_churn.csv')

# Preview 10 randomly chosen rows. A fixed random_state makes the preview
# identical on every Restart & Run All instead of changing per execution.
df.sample(10, random_state=42)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
2138,WY,124,415,No,No,0,178.3,102,30.31,235.0,120,19.98,239.7,119,10.79,10.9,1,2.94,3,False
1331,OK,101,408,No,No,0,209.6,107,35.63,228.8,96,19.45,172.4,85,7.76,7.6,2,2.05,3,False
1493,MT,126,415,No,Yes,30,153.4,90,26.08,151.4,97,12.87,153.8,97,6.92,12.8,4,3.46,4,True
448,GA,75,415,No,No,0,117.3,114,19.94,201.1,61,17.09,107.9,82,4.86,12.2,3,3.29,1,False
2746,CO,107,415,No,No,0,204.5,108,34.77,162.4,110,13.8,155.0,102,6.98,13.4,1,3.62,3,False
1538,WV,113,415,No,No,0,61.2,111,10.4,92.3,88,7.85,197.4,114,8.88,13.7,3,3.7,5,True
76,DC,82,415,No,No,0,300.3,109,51.05,181.0,100,15.39,270.1,73,12.15,11.7,4,3.16,0,True
2336,NY,122,415,No,Yes,37,163.0,107,27.71,312.8,118,26.59,200.0,85,9.0,11.6,5,3.13,1,False
1438,ND,60,510,No,No,0,203.2,99,34.54,235.8,131,20.04,224.9,112,10.12,15.1,6,4.08,2,False
1870,AZ,43,415,No,No,0,179.3,97,30.48,252.7,126,21.48,227.5,114,10.24,8.0,5,2.16,0,False


In [4]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(3333, 20)

In [5]:
# Count the missing (NaN/None) entries in each column

df.isna().sum()

State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     0
dtype: int64

In [6]:
# Descriptive statistics for every column: include='all' summarises both the
# categorical columns (unique/top/freq) and the numeric ones (mean/std/quartiles)

df.describe(include='all')

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
count,3333,3333.0,3333.0,3333,3333,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333
unique,51,,,2,2,,,,,,,,,,,,,,,2
top,WV,,,No,No,,,,,,,,,,,,,,,False
freq,106,,,3010,2411,,,,,,,,,,,,,,,2850
mean,,101.064806,437.182418,,,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856,
std,,39.822106,42.37129,,,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491,
min,,1.0,408.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0,
25%,,74.0,408.0,,,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0,
50%,,101.0,415.0,,,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0,
75%,,127.0,510.0,,,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0,


In [7]:
# Column labels of the dataset (the features plus the 'Churn' column)

df.columns

Index(['State', 'Account length', 'Area code', 'International plan',
       'Voice mail plan', 'Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls',
       'Churn'],
      dtype='object')

In [8]:
# Print a concise summary of the dataset: index range, column names,
# non-null counts and the dtype of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   3333 non-null   object 
 1   Account length          3333 non-null   int64  
 2   Area code               3333 non-null   int64  
 3   International plan      3333 non-null   object 
 4   Voice mail plan         3333 non-null   object 
 5   Number vmail messages   3333 non-null   int64  
 6   Total day minutes       3333 non-null   float64
 7   Total day calls         3333 non-null   int64  
 8   Total day charge        3333 non-null   float64
 9   Total eve minutes       3333 non-null   float64
 10  Total eve calls         3333 non-null   int64  
 11  Total eve charge        3333 non-null   float64
 12  Total night minutes     3333 non-null   float64
 13  Total night calls       3333 non-null   int64  
 14  Total night charge      3333 non-null   

In [9]:
# Inspect the non-numeric columns: for each one, print its name, its
# distinct values, and how often each value occurs.
features = df[['State','International plan','Voice mail plan','Churn']]
separator = '-----------------------'
for column_name in features.columns:
    column_values = features[column_name]
    print(separator)
    print(column_name)
    print(column_values.unique())
    print(column_values.value_counts())
    print(separator)

-----------------------
State
['KS' 'OH' 'NJ' 'OK' 'AL' 'MA' 'MO' 'LA' 'WV' 'IN' 'RI' 'IA' 'MT' 'NY'
 'ID' 'VT' 'VA' 'TX' 'FL' 'CO' 'AZ' 'SC' 'NE' 'WY' 'HI' 'IL' 'NH' 'GA'
 'AK' 'MD' 'AR' 'WI' 'OR' 'MI' 'DE' 'UT' 'CA' 'MN' 'SD' 'NC' 'WA' 'NM'
 'NV' 'DC' 'KY' 'ME' 'MS' 'TN' 'PA' 'CT' 'ND']
WV    106
MN     84
NY     83
AL     80
WI     78
OH     78
OR     78
WY     77
VA     77
CT     74
MI     73
ID     73
VT     73
TX     72
UT     72
IN     71
MD     70
KS     70
NC     68
NJ     68
MT     68
CO     66
NV     66
WA     66
RI     65
MA     65
MS     65
AZ     64
FL     63
MO     63
NM     62
ME     62
ND     62
NE     61
OK     61
DE     61
SC     60
SD     60
KY     59
IL     58
NH     56
AR     55
GA     54
DC     54
HI     53
TN     53
AK     52
LA     51
PA     45
IA     44
CA     34
Name: State, dtype: int64
-----------------------
-----------------------
International plan
['No' 'Yes']
No     3010
Yes     323
Name: International plan, dtype: int64
-----------------------
-------

### OBSERVATIONS

1. THE ABOVE DATASET CONTAINS 3333 ROWS AND 20 COLUMNS
2. NULL VALUES ARE ABSENT IN THE GIVEN DATASET
3. THE CHURN COLUMN IS BOOLEAN, SO WE NEED TO CONVERT IT INTO NUMERICAL FORM