In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dtale
import datetime

import warnings
warnings.filterwarnings('ignore')

In [22]:
# read data
df_demograph = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='CustomerDemographic')

In [23]:
# create columns
df_demograph.columns = df_demograph.iloc[0]
df_demograph = df_demograph.iloc[1:]

In [24]:
df_demograph.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 1 to 4000
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   customer_id                          4000 non-null   object
 1   first_name                           4000 non-null   object
 2   last_name                            3875 non-null   object
 3   gender                               4000 non-null   object
 4   past_3_years_bike_related_purchases  4000 non-null   object
 5   DOB                                  3913 non-null   object
 6   job_title                            3494 non-null   object
 7   job_industry_category                3344 non-null   object
 8   wealth_segment                       4000 non-null   object
 9   deceased_indicator                   4000 non-null   object
 10  default                              3698 non-null   object
 11  owns_car                             4000 n

In [25]:
df_demograph = df_demograph.set_index('customer_id')

In [18]:
dtale.show(df_demograph)



In [27]:
# convert past_3_years_bike_related_purchases and customer_id to int
df_demograph['past_3_years_bike_related_purchases'] = df_demograph['past_3_years_bike_related_purchases'].astype(int)

In [28]:
# clean gender
df_demograph['gender'].replace({'U': 'Unknown', 'F': 'Female', 'Femal': 'Female', 'M': 'Male'}, inplace=True)
df_demograph['gender'].value_counts()

gender
Female     2039
Male       1873
Unknown      88
Name: count, dtype: int64

In [29]:
# create new column age and fill Nan
df_demograph['DOB'] = pd.to_datetime(df_demograph['DOB'])
current_date = datetime.datetime.now()
df_demograph['age'] = (current_date - df_demograph['DOB']).apply(lambda x: x.days // 365)
mean_age = df_demograph['age'].mean()
df_demograph['age'].fillna(mean_age, inplace=True)
df_demograph['age'] = df_demograph['age'].astype('int')

In [30]:
# delete default and DOB columns
df_demograph = df_demograph.drop(columns = 'default')
df_demograph = df_demograph.drop(columns = 'DOB')

In [31]:
# change columns deceased_indicator and owns_car to data 0 and 1
df_demograph['deceased_indicator'] = df_demograph['deceased_indicator'].apply(lambda x: 0 if x == 'N' else 1)
# df_demograph['deceased_indicator'] = df_demograph['deceased_indicator'].astype(bool)
df_demograph['owns_car'] = df_demograph['owns_car'].apply(lambda x: 0 if x == 'No' else 1)

In [32]:
df_demograph.head()

Unnamed: 0_level_0,first_name,last_name,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,age
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Laraine,Medendorp,Female,93,Executive Secretary,Health,Mass Customer,0,1,11,69
2,Eli,Bockman,Male,81,Administrative Officer,Financial Services,Mass Customer,0,1,16,42
3,Arlin,Dearle,Male,61,Recruiting Manager,Property,Mass Customer,0,1,15,69
4,Talbot,,Male,33,,IT,Mass Customer,0,0,7,61
5,Sheila-kathryn,Calton,Female,56,Senior Editor,,Affluent Customer,0,1,8,46


In [33]:
df_demograph.isnull().sum()

0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
job_title                              506
job_industry_category                  656
wealth_segment                           0
deceased_indicator                       0
owns_car                                 0
tenure                                  87
age                                      0
dtype: int64

In [34]:
# fill Nan
df_demograph['last_name'] = df_demograph.groupby('first_name').last_name.bfill().ffill()
df_demograph['job_title'] = df_demograph['job_title'].fillna(method='bfill')
df_demograph['job_industry_category'] = df_demograph['job_industry_category'].fillna(method='ffill')
mean_tenure = df_demograph['tenure'].mean()
df_demograph['tenure'] = df_demograph['tenure'].fillna(mean_tenure)

In [35]:
df_demograph.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 1 to 4000
Data columns (total 11 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           4000 non-null   object 
 1   last_name                            4000 non-null   object 
 2   gender                               4000 non-null   object 
 3   past_3_years_bike_related_purchases  4000 non-null   int32  
 4   job_title                            4000 non-null   object 
 5   job_industry_category                4000 non-null   object 
 6   wealth_segment                       4000 non-null   object 
 7   deceased_indicator                   4000 non-null   int64  
 8   owns_car                             4000 non-null   int64  
 9   tenure                               4000 non-null   float64
 10  age                                  4000 non-null   int32  
dtypes: float64(1), int32(2), int64(2), 

In [36]:
df_demograph.to_csv('Cust_Demograph_cleaned.csv')