In [26]:
import pandas as pd

In [27]:
# Read the iris CSV from the Resources folder and preview the first five rows.
file_path = "Resources/iris.csv"
iris_df = pd.read_csv(filepath_or_buffer=file_path)
iris_df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [28]:
# Build a copy of the iris frame without the "class" column and preview it.
new_iris_df = iris_df.drop(columns=["class"])
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [29]:
# Reorder the columns: both length measurements first, then both widths.
reordered_columns = [
    "petal_length",
    "sepal_length",
    "petal_width",
    "sepal_width",
]
new_iris_df = new_iris_df[reordered_columns]
new_iris_df

Unnamed: 0,petal_length,sepal_length,petal_width,sepal_width
0,1.4,5.1,0.2,3.5
1,1.4,4.9,0.2,3.0
2,1.3,4.7,0.2,3.2
3,1.5,4.6,0.2,3.1
4,1.4,5.0,0.2,3.6
...,...,...,...,...
145,5.2,6.7,2.3,3.0
146,5.0,6.3,1.9,2.5
147,5.2,6.5,2.0,3.0
148,5.4,6.2,2.3,3.4


In [30]:
# Write the reordered iris data to a new CSV, leaving out the row index.
output_file_path = "Resources/new_iris_data.csv"
new_iris_df.to_csv(path_or_buf=output_file_path, index=False)

## 18.2.3 Preprocessing Data with Pandas

In [31]:
# Load the shopping data (the file is read as ISO-8859-1) and preview five rows.
file_path = "Resources/shopping_data.csv"
df_shopping = pd.read_csv(filepath_or_buffer=file_path, encoding="ISO-8859-1")
df_shopping.head(n=5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


## 18.2.4 Data Selection

In [32]:
# What data is available?
# Display the column labels of the shopping DataFrame (a pandas Index).
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [33]:
# List the data type pandas assigned to each column of the shopping DataFrame.
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [34]:
# Report how many null values each column contains.
# The per-column counts are computed once, then looked up by column label.
null_counts = df_shopping.isnull().sum()
for column in df_shopping.columns:
    print(f"Column {column} has {null_counts[column]} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [35]:
# Discard every row (axis=0) that has a null in any column.
df_shopping = df_shopping.dropna(axis=0, how="any")

In [36]:
# Count the rows that pandas flags as duplicates and report the total.
duplicate_count = df_shopping.duplicated().sum()
print(f" Duplicate entries: {duplicate_count}")

 Duplicate entries: 0


In [37]:
# Remove the CustomerID column.
# The result is reassigned instead of using inplace=True, and errors="ignore"
# tolerates a missing column, so this cell can be re-run after the column is
# already gone without raising KeyError.
df_shopping = df_shopping.drop(columns=["CustomerID"], errors="ignore")
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [39]:
# Transform String Column 
def change_string(member):
    """Encode a "Card Member" value as an integer flag.

    Args:
        member: A raw cell value such as "Yes" or "No", or a flag (1 or 0)
            produced by an earlier call.

    Returns:
        1 if ``member`` is "Yes" or already 1, otherwise 0.
    """
    # Accepting 1 as well as "Yes" keeps the encoding stable when the cell
    # that applies this function runs more than once; comparing against
    # "Yes" alone would turn every previously encoded 1 into 0.
    if member == "Yes" or member == 1:
        return 1
    return 0

# Run every "Card Member" value through change_string, store the result back
# in the same column, and preview the frame.
card_member_flags = df_shopping["Card Member"].apply(change_string)
df_shopping["Card Member"] = card_member_flags
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [40]:
# Persist the cleaned shopping data as CSV, leaving out the row index.
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping.to_csv(path_or_buf=file_path, index=False)