In [18]:
import pandas as pd

In [20]:
# Load the customer dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("ecommercedata.csv")

## Understanding the size of the dataset

In [76]:
# Display the DataFrame; pandas truncates long frames and shows only the first and last rows.
df

Unnamed: 0,customer_id,name,email,age,gender,annual_income,total_purchases,avg_purchase_value,days_since_last_purchase,customer_satisfaction,churn
0,a3071abd-8922-4608-95af-676364cb411d,Erica Powers,houseariana@example.net,56,Male,41672,11,170.75,14,5,0
1,114a0236-61b3-4447-964c-dbe6ac3d0773,Sarah Jones,ufranklin@example.org,46,Female,78217,9,63.17,8,4,0
2,c2fcf312-df91-400a-ab0e-822c1fab051a,Jaime Peterson,james10@example.com,32,Female,28279,10,55.09,8,2,0
3,7885497e-eb68-4c32-b4a3-b110d6af8aae,Rebecca Wolfe,pricedustin@example.org,60,Female,17017,15,110.88,18,4,0
4,b212448f-04ae-4943-a90e-0c8500b486fc,Kenneth Peterson,qwyatt@example.net,25,Male,56600,10,158.37,42,3,1
...,...,...,...,...,...,...,...,...,...,...,...
995,993b714d-401b-49b7-acb5-012405812705,David Garcia,ronaldparker@example.net,22,Male,36209,10,117.94,30,3,0
996,28baa956-01c4-4a35-963e-cd3dc47ba6d2,Robert Nelson,mary75@example.com,40,Male,12045,10,119.31,24,2,0
997,75c38180-3083-4e5d-a0bb-2617d2414f31,William Walker,kevin12@example.net,27,Female,45709,11,84.18,40,2,0
998,4c978c03-0bd6-44b9-b3ea-ecd3b3e8fc5f,Hannah Mccoy,snyderdavid@example.net,61,Male,66514,10,100.51,12,5,1


In [25]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

(1000, 11)

# Cleaning the data

## For this I will be:
#### - Checking missing values
#### - Correct datatypes (if needed)
#### - Inconsistencies, ex: Male, male....
#### - Remove duplicates
#### - Extra - Standardization of data

## -------------------------------------------------------------------------------------------

## Checking missing values
#### Make sure to count any missing value in the whole dataset

In [31]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

customer_id                 0
name                        0
email                       0
age                         0
gender                      0
annual_income               0
total_purchases             0
avg_purchase_value          0
days_since_last_purchase    0
customer_satisfaction       0
churn                       0
dtype: int64

#### No nulls means we don't need to drop any row

## Correct datatypes
#### Need to see the data type of each column and make adjustments if needed, e.g. a numeric column stored as a string or vice versa

In [35]:
# Data type pandas assigned to each column when the CSV was read.
df.dtypes

customer_id                  object
name                         object
email                        object
age                           int64
gender                       object
annual_income                 int64
total_purchases               int64
avg_purchase_value          float64
days_since_last_purchase      int64
customer_satisfaction         int64
churn                         int64
dtype: object

#### Looking at the types, the numeric columns have numeric dtypes (`int64` / `float64`) and the text columns are stored as `object`, the dtype pandas uses for strings. `object` can hold values that mix letters and digits, which suits columns like 'customer_id' and 'email'. No changes are needed.

## Inconsistencies

#### Checking to see if there is any other value in gender other than Male or Female 

In [71]:
# Frequency of each distinct gender label; case or spelling variants would show up as extra rows.
df["gender"].value_counts()

gender
Male      526
Female    474
Name: count, dtype: int64

#### There are no inconsistencies: only `Male` and `Female` appear, with no upper/lower-case variants or any other gender value

## Duplicates

#### Need to check the columns where duplicates would indicate wrong data. For this dataset we will check 'customer_id' and 'email', since they are supposed to be unique.

In [90]:
# Show every row whose email appears more than once.
# keep=False flags all occurrences, not just the repeats after the first one.
df.loc[df.duplicated(subset="email", keep=False)]

Unnamed: 0,customer_id,name,email,age,gender,annual_income,total_purchases,avg_purchase_value,days_since_last_purchase,customer_satisfaction,churn
419,90be434f-c518-4098-88ef-0b1b064f0905,Ryan Morris,lrussell@example.net,47,Male,46628,10,29.21,2,4,1
633,cd5acafd-8c7a-423c-b78b-b5f43cb1179a,Monica Rojas,lrussell@example.net,44,Female,63260,4,74.29,16,3,0
721,97f14705-fca1-401b-9fac-1492984173b3,Billy Smith,wgonzalez@example.com,18,Male,54886,11,166.87,18,2,1
916,232c3e46-cf3b-43e7-a77a-439f3b67deba,Mrs. Nicole Weaver,wgonzalez@example.com,35,Male,68985,9,107.35,12,4,0
