This notebook standardises selected features with StandardScaler and clusters them with K-means (KMeans).

In [12]:
# Import the modules
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
%matplotlib inline

---

In [13]:
# Load the preprocessed dataset into a Pandas DataFrame. The path is relative
# to the notebook's working directory. Alternative inputs the author left
# commented out: "CTG3.csv" and "data_normal_path.csv".
file_path = Path("data_preprocessed.csv")
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to confirm the load
df.head(n=5)

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,MLTV,NSP
0,120,0,0,0,0,0,0,73,0.5,43,2.4,2
1,132,4,0,4,2,0,0,17,2.1,0,10.4,1
2,133,2,0,5,2,0,0,16,2.1,0,13.4,1
3,134,2,0,6,2,0,0,16,2.4,0,23.0,1
4,132,4,0,5,0,0,0,16,2.4,0,19.9,1


In [14]:
# Standardise the four columns used for clustering; the column list is named
# once so the selection and the rebuilt DataFrame cannot drift apart
feature_columns = ["LB", "ASTV", "MSTV", "ALTV"]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[feature_columns])

# fit_transform returns a bare array, so wrap it back into a labelled
# DataFrame and carry the NSP column over unscaled
scaled_df = pd.DataFrame(scaled_data, columns=feature_columns)
scaled_df["NSP"] = df["NSP"]
scaled_df

Unnamed: 0,LB,ASTV,MSTV,ALTV,NSP
0,-1.351659,1.512264,-0.942778,1.801810,2
1,-0.131892,-1.745392,0.868481,-0.535233,1
2,-0.030245,-1.803564,0.868481,-0.535233,1
3,0.071402,-1.803564,1.208092,-0.535233,1
4,-0.131892,-1.803564,1.208092,-0.535233,1
...,...,...,...,...,...
2119,0.681285,1.861298,-1.282389,0.823513,2
2120,0.681285,1.803126,-1.055982,0.660463,2
2121,0.681285,1.861298,-1.055982,0.551764,2
2122,0.681285,1.803126,-1.055982,0.932213,2


In [15]:
# Create and initialise the K-means model instance for 3 clusters.
# NOTE(review): 3 presumably matches the number of NSP classes - confirm.
# random_state fixes the centroid initialisation so reruns give the same
# clusters. n_init is pinned explicitly because scikit-learn changed its
# default from 10 to "auto" in version 1.4; leaving it unset makes results
# depend on the installed version (and emits a FutureWarning on 1.2-1.3).
model = KMeans(n_clusters=3, n_init=10, random_state=1)

# Print the model
model

In [16]:
#model.fit(df)
model.fit(scaled_df)



In [17]:
#predictions = model.predict(df)
predictions = model.predict(scaled_df)

In [18]:
# Build a new DataFrame holding the scaled data plus the cluster assigned to
# each row; assign() returns a new frame, so scaled_df is left untouched
predictions_df = scaled_df.assign(prediction=predictions)

# Review the first five rows
predictions_df.head()

Unnamed: 0,LB,ASTV,MSTV,ALTV,NSP,prediction
0,-1.351659,1.512264,-0.942778,1.80181,2,2
1,-0.131892,-1.745392,0.868481,-0.535233,1,1
2,-0.030245,-1.803564,0.868481,-0.535233,1,1
3,0.071402,-1.803564,1.208092,-0.535233,1,1
4,-0.131892,-1.803564,1.208092,-0.535233,1,1


In [19]:
# Plot the data points coloured by the NSP column from the dataset (not by the
# K-means predictions), as the reference to compare the cluster plot against.
# Both axes show StandardScaler output (z-scores), not raw measurement units,
# so the labels say "standardised" rather than quoting the original units.
predictions_df.hvplot.scatter(
    x="ASTV",
    y="MSTV",
    by="NSP",
    xlabel="Abnormal Short Term Variability (standardised)",
    ylabel="Average Short Term Variability (standardised)",
    title="Short-term variability by NSP class"
)

In [20]:
# Plot the data points coloured by the cluster K-means assigned to each row.
# Both axes show StandardScaler output (z-scores), not raw measurement units,
# so the labels say "standardised" rather than quoting the original units.
predictions_df.hvplot.scatter(
    x="ASTV",
    y="MSTV",
    by="prediction",
    xlabel="Abnormal Short Term Variability (standardised)",
    ylabel="Average Short Term Variability (standardised)",
    title="Short-term variability by K-means cluster"
)