## How to Prepare Data for Machine Learning

In [1]:
import pandas as pd

In [2]:
file_path = './iris.csv'
iris_df = pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
iris_df = iris_df.drop(columns=['class'])
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
output_file_path = "./new_iris_data.csv"
iris_df.to_csv(output_file_path, index=False)

In [5]:
file_path = "./shopping_data.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [6]:
df.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [7]:
df.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [8]:
df.isnull().sum()

CustomerID                0
Card Member               2
Age                       2
Annual Income             0
Spending Score (1-100)    1
dtype: int64

In [9]:
df = df.dropna()
df.isnull().sum()

CustomerID                0
Card Member               0
Age                       0
Annual Income             0
Spending Score (1-100)    0
dtype: int64

In [10]:
df.duplicated().sum()

0

In [11]:
df.drop(columns=['CustomerID'], inplace=True)

In [12]:
df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0



Values are scaled. In other words, data has been manipulated to ensure that the variance between the numbers won't skew results.

In [16]:
# Machine Learning only takes in numbers -- convert all strings to numbers 
def change_string(cell):
    if cell == "Yes":
        return 1
    else:
        return 0
    

In [17]:
df["Card Member"] = df['Card Member'].apply(change_string)

In [18]:
df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,0,19.0,15000,39.0
1,0,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


## Scale the Income Score

In [19]:
# Scale any columns so nothing is skewed
df['Annual Income'] = df['Annual Income'] / 1000

In [20]:
# Saving cleaned data
file_path = "./shopping_data_cleaned.csv"
df.to_csv(file_path, index=False)

## How to Run A K-Means Cluster

In [23]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [24]:
file_path = './new_iris_data.csv'
iris_df = pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [28]:
# Use KMeans Cluster, in this example we just choose 3 clusters
model = KMeans(n_clusters=3,random_state=5)
model.fit(iris_df)
predictions = model.predict(iris_df)
iris_df['class'] = model.labels_
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [29]:
iris_df.hvplot.scatter(x='sepal_length',y='sepal_width', by='class')

In [30]:
# Plotting the clusters with three features
fig = px.scatter_3d(iris_df, x="petal_width",
                    y="sepal_length", 
                    z="petal_length", 
                    color="class", 
                    symbol="class", 
                    size="sepal_width",
                    width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [32]:
iris_scaled = StandardScaler().fit_transform(iris_df)
print(iris_scaled)

[[-9.00681170e-01  1.03205722e+00 -1.34127240e+00 -1.31297673e+00
   1.99833541e-01]
 [-1.14301691e+00 -1.24957601e-01 -1.34127240e+00 -1.31297673e+00
   1.99833541e-01]
 [-1.38535265e+00  3.37848329e-01 -1.39813811e+00 -1.31297673e+00
   1.99833541e-01]
 [-1.50652052e+00  1.06445364e-01 -1.28440670e+00 -1.31297673e+00
   1.99833541e-01]
 [-1.02184904e+00  1.26346019e+00 -1.34127240e+00 -1.31297673e+00
   1.99833541e-01]
 [-5.37177559e-01  1.95766909e+00 -1.17067529e+00 -1.05003079e+00
   1.99833541e-01]
 [-1.50652052e+00  8.00654259e-01 -1.34127240e+00 -1.18150376e+00
   1.99833541e-01]
 [-1.02184904e+00  8.00654259e-01 -1.28440670e+00 -1.31297673e+00
   1.99833541e-01]
 [-1.74885626e+00 -3.56360566e-01 -1.34127240e+00 -1.31297673e+00
   1.99833541e-01]
 [-1.14301691e+00  1.06445364e-01 -1.28440670e+00 -1.44444970e+00
   1.99833541e-01]
 [-5.37177559e-01  1.49486315e+00 -1.28440670e+00 -1.31297673e+00
   1.99833541e-01]
 [-1.26418478e+00  8.00654259e-01 -1.22754100e+00 -1.31297673e+00

In [33]:
# Use PCA to reduce the number of features
pca = PCA(n_components=2)

In [34]:
iris_pca = pca.fit_transform(iris_scaled)

In [35]:
df_iris_pca = pd.DataFrame(
data=iris_pca,
columns = ['principal component 1', 'principal component 2'])
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.182256,-0.812917
1,-2.051688,0.016021
2,-2.316445,-0.255681
3,-2.263657,-0.06775
4,-2.298328,-0.946665


In [36]:
pca.explained_variance_ratio_

array([0.59174783, 0.28586907])

In [37]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris_pca)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [38]:
# Initialize the K-means model, 3 was the better option
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(df_iris_pca)

# Predict clusters
predictions = model.predict(df_iris_pca)

# Add the predicted class columns
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.182256,-0.812917,0
1,-2.051688,0.016021,0
2,-2.316445,-0.255681,0
3,-2.263657,-0.06775,0
4,-2.298328,-0.946665,0


In [39]:
df_iris_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)