In [28]:
import pandas as pd
import matplotlib.pyplot as plt

In [29]:
# Load the raw dataset from the project's data directory (path is relative to the notebook)
raw_csv_path = '../data/raw/Dataset_ATS_v2.csv'
df = pd.read_csv(raw_csv_path)
# Last expression of the cell: preview of the first rows
df.head()

Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,Contract,MonthlyCharges,Churn
0,Female,0,No,1,No,No,DSL,Month-to-month,25,Yes
1,Male,0,No,41,Yes,No,DSL,One year,25,No
2,Female,0,Yes,52,Yes,No,DSL,Month-to-month,19,No
3,Female,0,No,1,Yes,No,DSL,One year,76,Yes
4,Male,0,No,67,Yes,No,Fiber optic,Month-to-month,51,No


In [30]:
# Report the dimensions of the loaded frame
df_rows = df.shape[0]  # number of records
df_cols = df.shape[1]  # number of columns
print(f'Dataset size: {df_rows} rows × {df_cols} columns')

Dataset size: 7043 rows × 10 columns


In [31]:
# Show the dtype pandas assigned to each column
column_dtypes = df.dtypes
print(column_dtypes)

gender             object
SeniorCitizen       int64
Dependents         object
tenure              int64
PhoneService       object
MultipleLines      object
InternetService    object
Contract           object
MonthlyCharges      int64
Churn              object
dtype: object


In [32]:
# Descriptive statistics (centre, spread, range) for the numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns
print('\nSummary statistics for numerical features:')

# Pick the statistics of interest from describe(), in display order
stat_rows = ['mean', '50%', 'std', 'min', 'max']
numeric_summary = df[numeric_cols].describe().loc[stat_rows]

# Transpose so each feature is a row; describe() labels the median as '50%'
print(numeric_summary.T.rename(columns={'50%': 'median'}))


Summary statistics for numerical features:
                     mean  median        std   min    max
SeniorCitizen    0.162147     0.0   0.368612   0.0    1.0
tenure          32.371149    29.0  24.559481   0.0   72.0
MonthlyCharges  64.758768    70.0  30.091650  18.0  119.0


In [33]:
# Cardinality of every categorical (object / category dtype) column
cat_cols = df.select_dtypes(include=['object', 'category']).columns
print('\nCount of unique values for each categorical attribute:')
unique_value_counts = df[cat_cols].nunique()
print(unique_value_counts)


Count of unique values for each categorical attribute:
gender             2
Dependents         2
PhoneService       2
MultipleLines      2
InternetService    2
Contract           3
Churn              2
dtype: int64


In [34]:
# Most frequent values for each categorical variable (top 3 per column)
print('\nFrequent values for categorical variables:')
for col in cat_cols:
    print(f'\n{col}:')
    # dropna=False: value_counts() leaves NaN out by default, which would make
    # missing entries invisible in this profile; count them like any other value.
    print(df[col].value_counts(dropna=False).head(3))


Frequent values for categorical variables:

gender:
gender
Male      3555
Female    3488
Name: count, dtype: int64

Dependents:
Dependents
No     4933
Yes    2110
Name: count, dtype: int64

PhoneService:
PhoneService
Yes    6361
No      682
Name: count, dtype: int64

MultipleLines:
MultipleLines
No     4072
Yes    2971
Name: count, dtype: int64

InternetService:
InternetService
DSL            3947
Fiber optic    3096
Name: count, dtype: int64

Contract:
Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: count, dtype: int64

Churn:
Churn
No     5174
Yes    1869
Name: count, dtype: int64


In [35]:
# Flag rows that exactly repeat an earlier row; the first occurrence is not flagged
is_dup_row = df.duplicated()
dup_rows_count = is_dup_row.sum()
print(f'Identify duplicate rows in the dataset: {dup_rows_count}')


# Preview the first few flagged rows
print(df.loc[is_dup_row].head())

Identify duplicate rows in the dataset: 302
    gender  SeniorCitizen Dependents  tenure PhoneService MultipleLines  \
78    Male              0         No       1          Yes            No   
97    Male              0         No       1          Yes            No   
352   Male              0         No       1          Yes            No   
398   Male              0         No       1          Yes            No   
402   Male              0         No       1          Yes           Yes   

    InternetService        Contract  MonthlyCharges Churn  
78              DSL  Month-to-month              20    No  
97              DSL  Month-to-month              20    No  
352     Fiber optic  Month-to-month              20    No  
398             DSL  Month-to-month              20    No  
402     Fiber optic  Month-to-month              20   Yes  


In [36]:
# For each column, count the values that repeat an earlier value in that column
dup_counts = (
    df.apply(lambda series: series.duplicated().sum())
      .to_frame(name='duplicate_count')
)
print('\nFrequency of duplicated values in each attribute:')
print(dup_counts)


Frequency of duplicated values in each attribute:
                 duplicate_count
gender                      7041
SeniorCitizen               7041
Dependents                  7041
tenure                      6970
PhoneService                7041
MultipleLines               7041
InternetService             7041
Contract                    7040
MonthlyCharges              6942
Churn                       7041


In [37]:
# Candidate primary keys: single columns with no repeated values and no missing values
candidate_keys = [
    col
    for col in dup_counts.index
    if dup_counts.loc[col, 'duplicate_count'] == 0 and not df[col].isna().any()
]
print('\nCandidate primary key columns:')
print(candidate_keys)


Candidate primary key columns:
[]
