## Part B
### Preprocessing of the beacons dataset

In [17]:
import pandas as pd
import numpy as np

df = pd.read_csv('beacons_dataset.csv', delimiter=';')
print(df.shape)

(58633, 4)


1. Correcting the Room labels

In [18]:
from fuzzywuzzy import fuzz

rooms = ['Bedroom','Bathroom', 'Livingroom' , 'Kitchen', 'Box', 'Office', 'DinerRoom']
#print("Unique values of rooms BEFORE fuzzywuzzy\n", df['room'].value_counts())
def fuzz_ratio(data):
        return fuzz.token_sort_ratio(data['room'], rooms[i])

for i in range (0, len(rooms)):
    changes = df[df.apply(fuzz_ratio, axis=1) > 75].room
    df.loc[df['room'].isin(changes), 'room'] = rooms[i]

#print("Unique values of rooms AFTER fuzzywuzzy\n", df['room'].value_counts())

2. Removing wrong part_id of users

In [19]:
# Find the wrong IDs if they are not numeric or they are not 4 digit numbers and drop them
wrong_id = df[df['part_id'].apply(lambda x: not x.isnumeric() or not int(x) > 999 or not int(x) < 10000)]
df.drop(wrong_id.index, axis=0, inplace=True)
print(df.shape)

(46782, 4)


3. Generating Features

- Fixing the ts_date column and merging it with ts_time

In [20]:
df['ts_date'] = pd.to_datetime(df['ts_date'], format = "%Y%m%d").dt.strftime('%Y-%m-%d')
# Generate new column with merged date and time
df['fulldate'] = pd.to_datetime(df['ts_date'] + " " + df['ts_time'])

- Generate new column with the difference between row time

In [21]:
df['diff_dt_seconds'] = df.sort_values(['part_id','fulldate']).groupby('part_id')['fulldate'].diff().dt.total_seconds()

- Remove NaN values, difference that is zero and higher than 3600 (1 hour)

In [22]:
# Filter out difference that is NaN, equal to zero, and higher than 3600
df = df[df['diff_dt_seconds'].notna()]
df = df[df['diff_dt_seconds'] != 0]
df = df[df['diff_dt_seconds'] < 3600]

- Calculate percentage of each user time in a room

In [23]:
total_sec_id = df.groupby(['part_id'])['diff_dt_seconds'].agg('sum')
total_sec_room = df.groupby(['part_id', 'room'])['diff_dt_seconds'].agg('sum')
percent_time_id = total_sec_room/total_sec_id*100

- Create new dataset with part_id and percentage for each room

In [24]:
# Break the percentage_time_id into separate columns
result_column = percent_time_id.unstack().fillna(0)
pre_beacons = pd.DataFrame(result_column)
# Keep only the [Bedroom, Bathroom, Livingroom and Kitchen] columns
pre_beacons = pre_beacons.loc[:, ['Bedroom', 'Bathroom', 'Livingroom', 'Kitchen']]
pre_beacons = pre_beacons.round(1) # round to 1 decimal
pre_beacons = pre_beacons.reset_index()

pre_beacons.to_csv('preprocessed_beacons.csv', index=False)

2. Merge the preprocessed clinical and beacons datasets by id

In [25]:
clinical_df = pd.read_csv('preprocessed_clinical.csv')
beacons_df = pd.read_csv('preprocessed_beacons.csv')

clinical_df['part_id'] = clinical_df['part_id'].astype(int)
beacons_df['part_id'] = beacons_df['part_id'].astype(int)
merged_df = pd.merge(clinical_df, beacons_df, on='part_id', how = 'inner')

merged_df.to_csv('merged.csv', index=False)

3. Clustering

In [26]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, homogeneity_score, v_measure_score, completeness_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

3.1 Initialize K-means, standardize and fit data

In [27]:
kmeans = KMeans(n_clusters=3)

scaler = StandardScaler()
merged_df_std = scaler.fit_transform(merged_df) # Standardize features

kmeans.fit(merged_df_std)
kmeans_labels = kmeans.fit_predict(merged_df_std)

3.2 Evaluation of K-means through metrics

- Silhouette Score

In [28]:
silhouette = silhouette_score(merged_df_std, kmeans_labels)
print("Silhouette Score :", silhouette.round(3))

Silhouette Score : 0.043


- Homogeneity Score

In [29]:
homogeneity = homogeneity_score(merged_df['fried'], kmeans_labels)
print ("Homogeneity  Score :",homogeneity.round(3))

Homogeneity  Score : 0.237


- V_Measure Score

In [30]:
v_measure = v_measure_score(merged_df['fried'], kmeans_labels)
print ("V-measure Score :",v_measure.round(3))

V-measure Score : 0.223


- Completeness Score

In [31]:
com_score = completeness_score(merged_df['fried'], kmeans_labels)
print ("Completeness Score :",com_score.round(3))

Completeness Score : 0.211


3.3 Evaluation of K-means metrics after Principal Component Analysis

In [32]:
for i in range(2, 11):
    pca = PCA(n_components=i)
    pca.fit(merged_df_std)
    scores_pca = pca.transform(merged_df_std)

    print('number of components', i)

    pca_kmeans = KMeans(n_clusters=3)
    pca_kmeans.fit(scores_pca)
    pca_kmeans_labels = pca_kmeans.predict(scores_pca)

    silhouette_pca = silhouette_score(scores_pca, pca_kmeans_labels)
    print("Silhouette Score after PCA :", silhouette_pca.round(3))

    homogeneity_pca = homogeneity_score(merged_df['fried'], pca_kmeans_labels)
    print ("Homogeneity Score after PCA :", homogeneity_pca.round(3))

    v_measure_pca = v_measure_score(merged_df['fried'], pca_kmeans_labels)
    print ("V-Measure Score after PCA :", v_measure_pca.round(3))

    com_score_pca = completeness_score(merged_df['fried'], pca_kmeans_labels)
    print ("Completeness Score after PCA :", com_score_pca.round(3))
    print('\n')

number of components 2
Silhouette Score after PCA : 0.369
Homogeneity Score after PCA : 0.281
V-Measure Score after PCA : 0.266
Completeness Score after PCA : 0.253


number of components 3
Silhouette Score after PCA : 0.282
Homogeneity Score after PCA : 0.246
V-Measure Score after PCA : 0.231
Completeness Score after PCA : 0.218


number of components 4
Silhouette Score after PCA : 0.229
Homogeneity Score after PCA : 0.224
V-Measure Score after PCA : 0.211
Completeness Score after PCA : 0.2


number of components 5
Silhouette Score after PCA : 0.201
Homogeneity Score after PCA : 0.237
V-Measure Score after PCA : 0.223
Completeness Score after PCA : 0.211


number of components 6
Silhouette Score after PCA : 0.181
Homogeneity Score after PCA : 0.255
V-Measure Score after PCA : 0.24
Completeness Score after PCA : 0.227


number of components 7
Silhouette Score after PCA : 0.161
Homogeneity Score after PCA : 0.243
V-Measure Score after PCA : 0.229
Completeness Score after PCA : 0.217


n