In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Load the raw shopping data (the file is read as ISO-8859-1) and preview it
shopping_df = pd.read_csv(
    filepath_or_buffer='../resources/shopping_data.csv',
    encoding='ISO-8859-1',
)
shopping_df.head(5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [3]:
# Column names and dtypes
# Only the last expression of a cell is displayed, so a bare
# `shopping_df.columns` line before it would be evaluated and discarded.
# The dtypes listing already shows every column name.
shopping_df.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [4]:
# Report how many missing values each column contains
# (the per-column counts are computed once, then printed one line per column)
for column, null_count in shopping_df.isnull().sum().items():
    print(f'Column {column} has {null_count} null values')

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [5]:
# Discard every row that has a missing value in at least one column
shopping_df = shopping_df.dropna(axis=0, how='any')

In [6]:
# Count fully duplicated rows (rows are only counted here, not removed)
print('Duplicate entries:', shopping_df.duplicated().sum())

Duplicate entries: 0


In [7]:
# Drop unnecessary data
# Remove the CustomerID column. Reassigning instead of using inplace=True,
# together with errors='ignore', makes this cell safe to re-run: a second
# execution is a no-op rather than a KeyError for the already-removed column.
shopping_df = shopping_df.drop(columns=['CustomerID'], errors='ignore')
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [8]:
# Transform string column: encode Card Member as integers (Yes -> 1, No -> 0)
yes_no = {'Yes':1,'No':0}
# Also accept values that are already encoded, so re-running this cell is a
# no-op instead of turning the whole column into NaN.
card_member_codes = shopping_df['Card Member'].map({**yes_no, 1: 1, 0: 0})
# Series.map silently yields NaN for any value missing from the mapping.
# Surface unexpected labels here rather than letting NaN flow into later cells.
unmapped = card_member_codes.isna() & shopping_df['Card Member'].notna()
if unmapped.any():
    unexpected = shopping_df.loc[unmapped, 'Card Member'].unique().tolist()
    raise ValueError(f'Unexpected Card Member values: {unexpected}')
shopping_df['Card Member'] = card_member_codes

In [10]:
# Rescale Annual Income by dividing every value by 1000
# NOTE: this overwrites the column, so running the cell again divides the
# already-scaled values by 1000 a second time.
shopping_df['Annual Income'] = shopping_df['Annual Income'].div(1000)
shopping_df.head(5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [11]:
# Replace the spaces in every column name with underscores
shopping_df.columns = [name.replace(' ', '_') for name in shopping_df.columns]
shopping_df.head(5)

Unnamed: 0,Card_Member,Age,Annual_Income,Spending_Score_(1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [13]:
# Shorten the spending score column name by dropping the "(1-100)" suffix
shopping_df.rename(
    mapper={'Spending_Score_(1-100)': 'Spending_Score'},
    axis='columns',
    inplace=True,
)
shopping_df.head(5)

Unnamed: 0,Card_Member,Age,Annual_Income,Spending_Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [14]:
# Write the cleaned frame to CSV, leaving the row index out of the file
shopping_df.to_csv(
    path_or_buf='../resources/clean_shopping_data.csv',
    index=False,
)