# Contents
1. Imports
2. Checks
3. Wrangling
4. Cleaning
5. Exports

# 1. Imports

In [2]:
#Libraries
import pandas as pd
import numpy as np
import os

In [3]:
#Path
#Project root folder that holds the '02 - Data' directory used below.
#The location can be overridden per machine by setting the OGS_PROJECT_PATH
#environment variable; when it is unset or empty the original default is used,
#so existing runs behave exactly as before.
path = os.environ.get('OGS_PROJECT_PATH') or r'/Users/davidgriesel/Documents/GitHub/202409_OGS'

In [4]:
#Dataset
#Assemble the location of the raw customers file, then read it into a dataframe
customers_csv = os.path.join(path, '02 - Data', 'Original Data', 'customers.csv')
df_customers = pd.read_csv(customers_csv)

# 2. Checks

In [5]:
#Dimensions: (rows, columns) of the freshly loaded customers dataframe
df_customers.shape

(206209, 10)

In [6]:
#Preview: first rows of the raw data, to eyeball column names and typical values
#NOTE(review): the rendered output shows First Name / Surnam values (PII) - consider clearing it before sharing
df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [7]:
#Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_customers.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


##### Observations:
- Dataset contains 206,209 records

# 3. Wrangling

## 3.1. Drop Columns

### 3.1.1. Identify Redundant Columns

In [8]:
#Frequency distribution of user_id; dropna = False keeps NaN as its own category
#Used to judge whether the identifier column is unique and worth retaining
df_customers['user_id'].value_counts(dropna = False)

user_id
26711     1
67322     1
173044    1
61044     1
98344     1
         ..
146847    1
154991    1
172193    1
184326    1
80148     1
Name: count, Length: 206209, dtype: int64

##### Observations:
- 206,209 distinct values ranging from 1 to 206,209
- Every value occurs exactly once across the 206,209 records, so user_id is unique and covers 1 to 206,209 without gaps, suggesting the dataset is complete
- Retain column

In [9]:
#Frequency distribution of First Name; dropna = False makes missing names show up as NaN
df_customers['First Name'].value_counts(dropna = False)

First Name
NaN        11259
Marilyn     2213
Barbara     2154
Todd        2113
Jeremy      2104
           ...  
Merry        197
Eugene       197
Garry        191
Ned          186
David        186
Name: count, Length: 208, dtype: int64

##### Observations:
- 208 unique values including 11,259 NaN values
- Variable contains Personally Identifiable Information (PII)
- Remove column

In [10]:
#Frequency distribution of the surname column (header is spelled 'Surnam' in the source file)
#dropna = False keeps NaN as its own category
df_customers['Surnam'].value_counts(dropna = False)

Surnam
Hamilton      252
Randall       248
Lamb          243
Pennington    243
Barnett       242
             ... 
Poole         172
Bauer         166
Pearson       164
Payne         163
Jordan        162
Name: count, Length: 1000, dtype: int64

##### Observations:
- 1000 Surnames
- Variable contains Personally Identifiable Information (PII)
- Remove column

In [11]:
#Frequency distribution of Gender; dropna = False keeps NaN as its own category
df_customers['Gender'].value_counts(dropna = False)

Gender
Male      104067
Female    102142
Name: count, dtype: int64

##### Observations:
- 2 values for Male and Female
- Contains information that could aid identification but does not require removal
- Retain column

In [12]:
#Frequency distribution of STATE; dropna = False keeps NaN as its own category
df_customers['STATE'].value_counts(dropna = False)

STATE
Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts           4043
Michigan                4043
New Jersey              4043
Kansas                  4043
South Dakota            4043
Minnesota               4043
Tennessee               4043
New York                4043
Washington              4043
Louisiana               4043
Montana 

In [13]:
#Number of distinct STATE values, counting NaN as a value if it occurs
df_customers['STATE'].nunique(dropna=False)

51

##### Observations:
- 51 values, one for each state plus the District of Columbia
- Retain column

In [14]:
#Frequency distribution of Age; dropna = False keeps NaN as its own category
df_customers['Age'].value_counts(dropna = False)

Age
19    3329
55    3317
51    3317
56    3306
32    3305
      ... 
65    3145
25    3127
66    3114
50    3102
36    3101
Name: count, Length: 64, dtype: int64

##### Observations:
- 64 values ranging between 18 and 81
- Retain column

In [15]:
#Frequency distribution of date_joined as loaded from the CSV (before any type conversion)
#dropna = False keeps NaN as its own category
df_customers['date_joined'].value_counts(dropna = False)

date_joined
9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
             ... 
9/1/2018      141
1/22/2018     140
11/24/2017    139
7/18/2019     138
8/6/2018      128
Name: count, Length: 1187, dtype: int64

##### Observations: 
- 1,187 dates
- Dates appear as m/d/yyyy text, which suggests the column is stored with an inappropriate data type (string rather than datetime)
- Retain column

In [16]:
#Frequency distribution of n_dependants; dropna = False keeps NaN as its own category
df_customers['n_dependants'].value_counts(dropna = False)

n_dependants
0    51602
3    51594
1    51531
2    51482
Name: count, dtype: int64

##### Observations: 
- 4 values indicating customers have between 0 and 3 dependants
- Retain column

In [17]:
#Frequency distribution of fam_status; dropna = False keeps NaN as its own category
df_customers['fam_status'].value_counts(dropna = False)

fam_status
married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: count, dtype: int64

##### Observations: 
- 4 values indicating different family statuses
- Retain column

In [18]:
#Frequency distribution of income; dropna = False keeps NaN as its own category
df_customers['income'].value_counts(dropna = False)

income
57192     10
95891     10
95710     10
97532      9
98675      9
          ..
73141      1
71524      1
74408      1
44780      1
148828     1
Name: count, Length: 108012, dtype: int64

##### Observations: 
- 108,012 values ranging between 25,903 and 593,901
- Retain column

### 3.1.2. Address Redundant Columns

In [19]:
#Drop the PII columns (First Name, Surnam) and update dataframe
df_customers = df_customers.drop(columns = ['First Name', 'Surnam'])

In [20]:
#Confirm results: the column count should be two lower than before the drop, with the row count unchanged
df_customers.shape

(206209, 8)

In [21]:
#Preview table to confirm the two name columns no longer appear
df_customers.head()

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


##### Observations:
- Columns dropped successfully

## 3.2. Rename Columns

### 3.2.1. Identify Unclear Descriptions

In [22]:
#Check column headers for inconsistent casing or unclear abbreviations
df_customers.columns

Index(['user_id', 'Gender', 'STATE', 'Age', 'date_joined', 'n_dependants',
       'fam_status', 'income'],
      dtype='object')

##### Observations:
- Column headers are inconsistent: casing varies (Gender, STATE, Age) and some names are abbreviated (n_dependants, fam_status)

### 3.2.2. Address Unclear Descriptions

In [23]:
#Rename inconsistently cased or abbreviated column headers to lowercase, descriptive names.
#The renamed frame is assigned back instead of using inplace = True, which
#matches how the column drop updates df_customers and keeps the step chainable.
df_customers = df_customers.rename(columns = {'Gender' : 'gender',
                                              'STATE' : 'state',
                                              'Age' : 'age',
                                              'n_dependants' : 'number_of_dependants',
                                              'fam_status' : 'marital_status'})

In [24]:
#Confirm results: list the headers to verify the new names are in place
df_customers.columns

Index(['user_id', 'gender', 'state', 'age', 'date_joined',
       'number_of_dependants', 'marital_status', 'income'],
      dtype='object')

##### Observations:
- Variables successfully renamed

## 3.3. Data Types

### 3.3.1. Identify Inconsistent Data Types

In [25]:
#Check the data type pandas inferred for each column
df_customers.dtypes

user_id                  int64
gender                  object
state                   object
age                      int64
date_joined             object
number_of_dependants     int64
marital_status          object
income                   int64
dtype: object

##### Observations:
- Data type of date_joined (object) is inconsistent with its contents (dates)

### 3.3.2. Address Inconsistent Data Types

In [26]:
#Parse the month/day/year text dates in date_joined into datetime values
date_joined_text = df_customers['date_joined']
df_customers['date_joined'] = pd.to_datetime(date_joined_text, format = '%m/%d/%Y')

In [27]:
#Confirm results: date_joined should now report a datetime dtype
df_customers.dtypes

user_id                          int64
gender                          object
state                           object
age                              int64
date_joined             datetime64[ns]
number_of_dependants             int64
marital_status                  object
income                           int64
dtype: object

In [28]:
#Frequency distribution of date_joined after conversion; dropna = False would expose any missing dates as their own category
df_customers['date_joined'].value_counts(dropna = False)

date_joined
2018-09-17    213
2018-02-10    212
2019-04-01    211
2019-09-21    211
2017-12-19    210
             ... 
2018-09-01    141
2018-01-22    140
2017-11-24    139
2019-07-18    138
2018-08-06    128
Name: count, Length: 1187, dtype: int64

##### Observations:
- Data type successfully changed

# 4. Cleaning

## 4.1. Accuracy

### 4.1.1. Identify Inaccurate Values

In [29]:
#Review descriptive statistics of the wrangled dataframe to spot implausible values
df_customers.describe()

Unnamed: 0,user_id,age,date_joined,number_of_dependants,income
count,206209.0,206209.0,206209,206209.0,206209.0
mean,103105.0,49.501646,2018-08-17 03:06:30.029532928,1.499823,94632.852548
min,1.0,18.0,2017-01-01 00:00:00,0.0,25903.0
25%,51553.0,33.0,2017-10-23 00:00:00,0.0,59874.0
50%,103105.0,49.0,2018-08-16 00:00:00,1.0,93547.0
75%,154657.0,66.0,2019-06-10 00:00:00,3.0,124244.0
max,206209.0,81.0,2020-04-01 00:00:00,3.0,593901.0
std,59527.555167,18.480962,,1.118433,42473.786988


In [30]:
#Number of distinct values in each column
df_customers.nunique()

user_id                 206209
gender                       2
state                       51
age                         64
date_joined               1187
number_of_dependants         4
marital_status               4
income                  108012
dtype: int64

##### Observations:
- The dataset contains 206,209 unique user_id's
- There are 2 genders
- There are 51 states including District of Columbia
- Customers are between 18 and 81 years old, with the middle 50% between 33 and 66 years old, and about 49.5 years old on average
- Customers joined between 2017-01-01 and 2020-04-01
- The number_of_dependants per customer range between 0 and 3 and are spread evenly across the population
- There are 4 distinct values for marital_status
- income ranges between 25,903 and 593,901, with the middle 50% of customers earning between 59,874 and 124,244, and approximately 94,633 on average

### 4.1.2. Address Inaccurate Values

##### Observations
- No inaccuracies noted

## 4.2. Missing Values

### 4.2.1. Identify missing values

In [31]:
#Count missing (null) values in each column
df_customers.isnull().sum()

user_id                 0
gender                  0
state                   0
age                     0
date_joined             0
number_of_dependants    0
marital_status          0
income                  0
dtype: int64

##### Observations:
- None of the variables had NaN values

### 4.2.2. Address Missing Values

##### Observations:
- No missing values were identified

## 4.3. Mixed Type Variables

### 4.3.1. Find Mixed Type Variables

In [32]:
#Finding mixed type data
#A column is mixed when its values are of more than one Python type, so each
#column is mapped to the type of every value and the distinct types are counted.
#This also copes with an empty dataframe, where using the first row as the
#reference type would raise an IndexError, and it avoids building a filtered
#copy of the whole dataframe for every column just to measure its length.
for col in df_customers.columns:
    value_types = df_customers[col].map(type)
    if value_types.nunique() > 1:
        print(col)

##### Observations:
- No variables returned

### 4.3.2. Address Mixed Type Variables

##### Observations:
- No variables identified with mixed type data

## 4.4. Duplicates

### 4.4.1. Find Duplicates

In [33]:
#Flag rows that repeat an earlier row in full, keep them in a subset and display it
duplicate_mask = df_customers.duplicated()
df_customers_duplicates = df_customers.loc[duplicate_mask]
df_customers_duplicates

Unnamed: 0,user_id,gender,state,age,date_joined,number_of_dependants,marital_status,income


In [34]:
#Confirm dimensions of the duplicates subset; zero rows means no duplicates were found
df_customers_duplicates.shape

(0, 8)

##### Observations:
- No records returned

### 4.4.2. Address Duplicates

##### Observations
- No duplicate records identified

# 5. Exports

In [35]:
#Confirm dimensions of the dataframe that is about to be exported
df_customers.shape

(206209, 8)

In [36]:
#Export cleaned dataset
#Assemble the destination inside 'Prepared Data', then write the dataframe as a pickle file
cleaned_customers_pkl = os.path.join(path, '02 - Data', 'Prepared Data', '05_cleaned_customers.pkl')
df_customers.to_pickle(cleaned_customers_pkl)