In [2]:
import pandas as pd

In [3]:
# Data Loading
# Read the shopping CSV (decoded as ISO-8859-1) into a DataFrame and preview
# its first five rows.
file_path = "Resources/shopping_data.csv"
df_shopping = pd.read_csv(filepath_or_buffer=file_path, encoding="ISO-8859-1")
df_shopping.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


# Data Selection

In [4]:
# What data is available?
# Show the column labels of df_shopping to see which fields were loaded.
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [5]:
# What type of data is available?
# Show the dtype pandas assigned to each column of df_shopping; non-numeric
# columns appear as `object`.
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [7]:
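# What data is missing?
# Error recorded for reference (presumably raised when this data is passed to a
# model while it still contains nulls -- TODO confirm the source):
# ValueError: Input contains NaN, infinity or a value too large for dtype('float64').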

In [8]:
# Find null values
# Print, for every column, how many of its entries are null. The message has
# spaces around each placeholder so the column name and count are readable.
for column in df_shopping.columns:
    null_count = df_shopping[column].isnull().sum()
    print(f"Column {column} has {null_count} null values")

ColumnCustomerIDhas0null values
ColumnCard Memberhas2null values
ColumnAgehas2null values
ColumnAnnual Incomehas0null values
ColumnSpending Score (1-100)has1null values


In [10]:
# What data can be removed?
# Drop null rows: discard every row that has a null in any column
# (axis=0 and how="any" are the dropna defaults, spelled out here).
df_shopping = df_shopping.dropna(axis=0, how="any")

In [11]:
# Find duplicate entries
# duplicated() flags rows that repeat an earlier row; summing the flags
# gives the number of duplicate rows.
duplicate_count = df_shopping.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [14]:
# Remove the CustomerID Column
# Reassign the result instead of mutating with inplace=True, and tolerate the
# label already being absent, so re-running this cell after the column is gone
# does not raise KeyError.
df_shopping = df_shopping.drop(columns=["CustomerID"], errors="ignore")
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


# Data Processing

In [15]:
# Is the data in a format that can be passed into an unsupervised learning model?
# Transform String column
def change_string(member):
    """Encode a card-membership value as an integer flag.

    Returns 1 when ``member`` equals the string "Yes" and 0 for any other
    value.
    """
    return 1 if member == "Yes" else 0
    
# Replace the "Card Member" strings with the integer flags produced by
# change_string, then preview the result.
# NOTE: re-running this cell on the already-encoded column turns every value
# into 0, because the integers no longer equal "Yes".
card_member_encoded = df_shopping["Card Member"].apply(change_string)
df_shopping["Card Member"] = card_member_encoded
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [19]:
# Transform annual income
# Divide the "Annual Income" column by 1000 and store it back in place.
# NOTE: this cell is not idempotent -- each execution divides by 1000 again,
# so run it exactly once after loading the data.
df_shopping["Annual Income"] = df_shopping["Annual Income"].div(1000)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,0.015,39.0
1,1,21.0,0.015,81.0
2,0,20.0,0.016,6.0
3,0,23.0,0.016,77.0
4,0,31.0,0.017,40.0


# Data Transformation

In [22]:
# Can I quickly hand off this data for others to use?
# Saving cleaned data: write df_shopping to a CSV file without the index
# column. The path is relative, so the file is created in the current working
# directory.
file_path = "shopping_data_cleaned.csv"
df_shopping.to_csv(path_or_buf=file_path, index=False)