In [1]:
import pandas as pd
import numpy as np
import snakecase

In [2]:
# 1. Load the data (same file as the previous lab:
# we_fn_use_c_marketing_customer_value_analysis.csv) and tidy its schema in one chain.
customer_df = (
    pd.read_csv('we_fn_use_c_marketing_customer_value_analysis.csv')
    # Column labels: spaces -> underscores, then snakecase.convert, then every '__'
    # replaced by '_'. The resulting names are listed in the cell output.
    .rename(columns=lambda label: snakecase.convert(label.replace(' ', '_')).replace('__', '_'))
    # errors='coerce' turns any unparsable date into NaT instead of raising.
    .assign(effective_to_date=lambda frame: pd.to_datetime(frame['effective_to_date'], errors='coerce'))
)
customer_df.columns

Index(['customer', 'state', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'effective_to_date', 'employment_status', 'gender',
       'income', 'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',
       'vehicle_class', 'vehicle_size'],
      dtype='object')

In [3]:
# 2. Find all of the categorical data. It is kept in the `categoricals` variable,
# which every later cell in this notebook uses.
# select_dtypes('object') keeps only the string-like columns. The explicit .copy()
# makes `categoricals` an independent frame, so the column assignments and drops
# performed in later cells do not raise SettingWithCopyWarning and can never write
# back into customer_df.
categoricals = customer_df.select_dtypes(include='object').copy()
categoricals.head()

Unnamed: 0,customer,state,response,coverage,education,employment_status,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,BU79786,Washington,No,Basic,Bachelor,Employed,F,Suburban,Married,Corporate Auto,Corporate L3,Offer1,Agent,Two-Door Car,Medsize
1,QZ44356,Arizona,No,Extended,Bachelor,Unemployed,F,Suburban,Single,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,Medsize
2,AI49188,Nevada,No,Premium,Bachelor,Employed,F,Suburban,Married,Personal Auto,Personal L3,Offer1,Agent,Two-Door Car,Medsize
3,WW63253,California,No,Basic,Bachelor,Unemployed,M,Suburban,Married,Corporate Auto,Corporate L2,Offer1,Call Center,SUV,Medsize
4,HB64268,Washington,No,Basic,Bachelor,Employed,M,Rural,Single,Personal Auto,Personal L1,Offer1,Agent,Four-Door Car,Medsize


In [4]:
# 3. Check for NaN values and decide what to do with them.
# Count the missing entries per column; the displayed counts are all zero,
# so no imputation or row dropping is needed for the categorical columns.
missing_per_column = categoricals.isna().sum()
missing_per_column

customer             0
state                0
response             0
coverage             0
education            0
employment_status    0
gender               0
location_code        0
marital_status       0
policy_type          0
policy               0
renew_offer_type     0
sales_channel        0
vehicle_class        0
vehicle_size         0
dtype: int64

In [5]:
# 4. Check all unique values of columns.
# Walk the frame as (column name, Series) pairs and print each column's distinct values.
for name, values in categoricals.items():
    print(name, ': ', values.unique())

customer :  ['BU79786' 'QZ44356' 'AI49188' ... 'TD14365' 'UP19263' 'Y167826']
state :  ['Washington' 'Arizona' 'Nevada' 'California' 'Oregon']
response :  ['No' 'Yes']
coverage :  ['Basic' 'Extended' 'Premium']
education :  ['Bachelor' 'College' 'Master' 'High School or Below' 'Doctor']
employment_status :  ['Employed' 'Unemployed' 'Medical Leave' 'Disabled' 'Retired']
gender :  ['F' 'M']
location_code :  ['Suburban' 'Rural' 'Urban']
marital_status :  ['Married' 'Single' 'Divorced']
policy_type :  ['Corporate Auto' 'Personal Auto' 'Special Auto']
policy :  ['Corporate L3' 'Personal L3' 'Corporate L2' 'Personal L1' 'Special L2'
 'Corporate L1' 'Personal L2' 'Special L1' 'Special L3']
renew_offer_type :  ['Offer1' 'Offer3' 'Offer2' 'Offer4']
sales_channel :  ['Agent' 'Call Center' 'Web' 'Branch']
vehicle_class :  ['Two-Door Car' 'Four-Door Car' 'SUV' 'Luxury SUV' 'Sports Car'
 'Luxury Car']
vehicle_size :  ['Medsize' 'Small' 'Large']


In [6]:
# 5. Check dtypes. Do they all make sense as categorical data?
# Every column here was selected for being object-typed, so this is a sanity check.
column_dtypes = categoricals.dtypes
column_dtypes

customer             object
state                object
response             object
coverage             object
education            object
employment_status    object
gender               object
location_code        object
marital_status       object
policy_type          object
policy               object
renew_offer_type     object
sales_channel        object
vehicle_class        object
vehicle_size         object
dtype: object

In [7]:
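# All data types make sense: every column in `categoricals` is object (string) typed. effective_to_date is not listed here because it was converted to datetime when the data was loaded, so it is no longer an object column.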

In [8]:
# 6. Does any column contain alpha and numeric data? Decide how to clean it and do it now.
# renew_offer_type mixes a fixed word with a digit ('Offer1' ... 'Offer4');
# look at how often each value occurs before cleaning it.
offer_counts = categoricals.loc[:, 'renew_offer_type'].value_counts()
offer_counts

Offer1    3752
Offer2    2926
Offer3    1432
Offer4    1024
Name: renew_offer_type, dtype: int64

In [9]:
import re  # imported here because the later policy-cleaning cell also calls re.findall

# Keep only the numeric part of the offer label ('Offer3' -> '3').
# The pattern is a raw string: in a plain string literal '\d' is an invalid escape
# sequence, which Python reports as a DeprecationWarning / SyntaxWarning.
pattern = r'\d'
# re.findall returns every digit in the label; [0] takes the first one (still a string).
categoricals['renew_offer_type'] = categoricals['renew_offer_type'].map(lambda x: re.findall(pattern, x)[0])

In [10]:
# 7. Would you choose to do anything else to clean or wrangle the categorical data? Comment your decisions and do it now.

In [11]:
# Dropping two columns that seem irrelevant. Further adjustments will be made ahead.
# The result is rebound instead of using inplace=True, and errors='ignore' skips
# columns that are already gone, so re-running this cell no longer raises KeyError.
# `columns=` already names the axis, so the redundant axis=1 argument is omitted.
categoricals = categoricals.drop(columns=['customer', 'renew_offer_type'], errors='ignore')

In [12]:
# 8. Compare policy_type and policy. What information is contained in these columns?
# Join the two columns row by row and count each combination. In the output every
# policy value starts with the same word as its policy_type, so the only extra
# information carried by `policy` is the level suffix (L1 / L2 / L3).
policy_label = categoricals['policy']
policy_family = categoricals['policy_type']
combined = policy_label + ' ' + policy_family
combined.value_counts()

Personal L3 Personal Auto      3426
Personal L2 Personal Auto      2122
Personal L1 Personal Auto      1240
Corporate L3 Corporate Auto    1014
Corporate L2 Corporate Auto     595
Corporate L1 Corporate Auto     359
Special L2 Special Auto         164
Special L3 Special Auto         148
Special L1 Special Auto          66
dtype: int64

In [13]:
# Reduce `policy` to its level digit ('Corporate L3' -> '3'); the word before it
# duplicates policy_type, as the comparison in the previous cell shows.
# The pattern is a raw string: in a plain string literal '\d' is an invalid escape
# sequence, which Python reports as a DeprecationWarning / SyntaxWarning.
pattern = r'\d'
# re.findall returns every digit in the value; [0] takes the first one (still a string).
categoricals['policy'] = categoricals['policy'].map(lambda x: re.findall(pattern, x)[0])
categoricals['policy'].value_counts()

3    4588
2    2881
1    1665
Name: policy, dtype: int64

In [14]:
# 9. Check number of unique values in each column, can they be combined in any way to ease encoding?
# Comment your thoughts and make those changes.
# Number of distinct values per remaining column; low counts are candidates for
# simple numeric encoding.
unique_counts = categoricals.nunique()
unique_counts

state                5
response             2
coverage             3
education            5
employment_status    5
gender               2
location_code        3
marital_status       3
policy_type          3
policy               3
sales_channel        4
vehicle_class        6
vehicle_size         3
dtype: int64

In [15]:
# Numeric encodings for the binary and ordered columns, keyed by column name.
# NOTE(review): 'Bachelor' and 'College' both map to 2, presumably an intentional
# merge of the two levels; confirm.
encodings = {
    'response': {'Yes': 1, 'No': 0},
    'vehicle_size': {'Small': 1, 'Medsize': 2, 'Large': 3},
    'education': {'High School or Below': 1, 'Bachelor': 2, 'College': 2, 'Master': 3, 'Doctor': 4},
    'coverage': {'Basic': 1, 'Extended': 2, 'Premium': 3},
    'gender': {'M': 1, 'F': 0},
}

# Assign the replaced column back explicitly. Calling
# categoricals[col].replace(..., inplace=True) mutates a temporary Series: pandas
# warns about this chained in-place pattern, and under Copy-on-Write it leaves the
# DataFrame unchanged. Values that are not in a mapping are left as they are, so
# re-running the cell is harmless.
for column, mapping in encodings.items():
    categoricals[column] = categoricals[column].replace(mapping)

In [16]:
# 10. Save the cleaned categorical dataframe to CSV. You will use this file again this week.
# NOTE(review): the exercise text asks for 'categorical.csv', but the file is written
# as 'categoricals_df.csv'; the name is kept in case later notebooks already read it.
# With the default settings the row index is written as the first, unnamed column.
output_path = 'categoricals_df.csv'
categoricals.to_csv(output_path)