In [22]:
import sys
import time
import pandas as pd
sys.path.append('../')
from sklearn.cluster import KMeans
from functions.clustering import perform_clustering, elbow_method, silhouette_method

In [23]:
# read data: load the daily-categories clustering input from its pickle file.
# Later cells read the 'date' and 'id' columns of this frame and cluster on the rest.
data = pd.read_pickle('../data/clustering_input/clustering_df_daily_categories.pkl')

In [24]:
# prepare data for clustering: set the date column aside and remove it from the frame
# (pop returns the column and deletes it from `data` in one step)
dates = data.pop('date')

In [25]:
from scipy import stats
import numpy as np

def replace_outliers(group, threshold=3):
    """Replace outliers in every numeric column of ``group`` with the column median.

    A value counts as an outlier when the absolute value of its z-score
    (computed within ``group``, population standard deviation) exceeds
    ``threshold``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to clean; non-numeric columns are passed through untouched.
    threshold : float, default 3
        Z-score magnitude above which a value is replaced.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy with the same index and columns as ``group``.
        The input frame itself is not modified.
    """
    # Work on a copy: mutating the object handed in by groupby.apply is
    # discouraged by pandas and would also alter the caller's frame.
    cleaned = group.copy()
    for column in cleaned.select_dtypes(include=[np.number]).columns:
        values = cleaned[column]
        median_value = values.median()  # pandas skips NaN here by default
        # nan_policy='omit' keeps one missing value from turning every z-score
        # into NaN (the default 'propagate' would disable outlier detection
        # for the whole column).
        z_scores = stats.zscore(
            values.to_numpy(dtype=float, na_value=np.nan), nan_policy='omit'
        )
        # NaN z-scores (missing values, constant columns) compare as False
        # and are therefore left as they are.
        condition = np.abs(z_scores) > threshold
        cleaned.loc[condition, column] = median_value
    return cleaned

# Clean outliers separately for every id.
# group_keys=False is passed explicitly so the result keeps the rows in their
# original order on every pandas version. Without it, newer pandas prepends the
# group key and returns the rows grouped by id, so reset_index would renumber
# them in a different order and the index-based join with the `dates` Series
# stored earlier could pair rows with the wrong date.
# NOTE(review): the join only lines up if the loaded frame has a default
# 0..n-1 index - confirm against the pickle.
data = data.groupby('id', group_keys=False).apply(replace_outliers).reset_index(drop=True)

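FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)

To adopt the future behavior and silence this warning, use 

	>>> .groupby(..., group_keys=True)
  data = data.groupby('id').apply(replace_outliers).reset_index(drop=True)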


In [26]:
# show the frame after per-id outlier replacement (the date column has already been removed)
data

Unnamed: 0,id,exertion_points,step_goal,minutes_below_zone_1,minutes_in_zone_1,steps,very_active_minutes,minutes_in_zone_2,minutes_in_zone_3,altitude,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,exercise,exercise_duration,sleep_points,sleep_duration,calories
0,621e2e8e67b776a24055b564,0.623032,0.00000,0.936806,0.081134,0.204885,0.080685,0.000000,0.000000,0.065476,0.254701,0.083045,0.495139,0.222222,0.005592,0.809989,0.0,0.280325
1,621e2e8e67b776a24055b564,0.681027,0.59996,0.954167,0.054741,0.225622,0.075795,0.012780,0.000000,0.074405,0.225641,0.086505,0.488889,0.222222,0.005428,0.809989,0.0,0.277999
2,621e2e8e67b776a24055b564,0.681027,0.59996,0.954167,0.054741,0.225622,0.075795,0.012780,0.000000,0.074405,0.225641,0.086505,0.488889,0.222222,0.005428,0.809989,0.0,0.277999
3,621e2e8e67b776a24055b564,0.681027,0.00000,0.937500,0.083089,0.191432,0.075795,0.000000,0.000000,0.089286,0.191453,0.093426,0.493056,0.222222,0.005345,0.961998,0.0,0.269678
4,621e2e8e67b776a24055b564,0.681027,0.59996,0.890278,0.087977,0.209107,0.090465,0.000000,0.000000,0.071429,0.227350,0.072664,0.431944,0.222222,0.005592,0.923996,0.0,0.277166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8287,621e375b67b776a240290cdc,0.724307,0.00000,0.736265,0.270419,0.264675,0.000000,0.025248,0.000112,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.681236,0.0,0.154455
8288,621e375b67b776a240290cdc,0.724307,0.00000,0.736265,0.270419,0.264675,0.000000,0.025248,0.000112,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.681236,0.0,0.154455
8289,621e375b67b776a240290cdc,0.724307,0.00000,0.736265,0.270419,0.264675,0.000000,0.025248,0.000112,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.681236,0.0,0.154455
8290,621e375b67b776a240290cdc,0.724307,0.00000,0.736265,0.270419,0.264675,0.000000,0.025248,0.000112,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.681236,0.0,0.154455


In [27]:
# prepare data for clustering: set the id column aside and remove it from the frame,
# so that only the feature columns are passed to the clustering step
user_id = data.pop('id')

In [28]:
# perform k-means clustering
# perf_counter is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() can be too coarse (or jump) for a run lasting a few hundredths of a second
start = time.perf_counter()
print("Clustering with K-means ... ")
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto")
# id and date travel alongside the features; concat aligns the two Series on their index
metadata = pd.concat([user_id, dates], axis=1)
results = perform_clustering(kmeans, data, metadata)
# drop rows that contain nan, and report it when that happens instead of
# discarding them silently
rows_before = len(results)
results = results.dropna()
rows_dropped = rows_before - len(results)
if rows_dropped:
    print("Dropped", rows_dropped, "rows containing NaN")
print("K-means finished after", time.perf_counter() - start)
results.to_csv('../data/clustering_results/kmeans_2_results_daily_categories_outliers.csv', index=False)

Clustering with K-means ... 
K-means finished after 0.020145893096923828


In [29]:
# show the clustering output that was just written to CSV
results

Unnamed: 0,id,date,exertion_points,step_goal,minutes_below_zone_1,minutes_in_zone_1,steps,very_active_minutes,minutes_in_zone_2,minutes_in_zone_3,altitude,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,exercise,exercise_duration,sleep_points,sleep_duration,calories,cluster
0,621e2e8e67b776a24055b564,2021-05-24,0.623032,0.00000,0.936806,0.081134,0.204885,0.080685,0.000000,0.000000,0.065476,0.254701,0.083045,0.495139,0.222222,0.005592,0.809989,0.0,0.280325,1.0
1,621e2e8e67b776a24055b564,2021-05-25,0.681027,0.59996,0.954167,0.054741,0.225622,0.075795,0.012780,0.000000,0.074405,0.225641,0.086505,0.488889,0.222222,0.005428,0.809989,0.0,0.277999,1.0
2,621e2e8e67b776a24055b564,2021-05-25,0.681027,0.59996,0.954167,0.054741,0.225622,0.075795,0.012780,0.000000,0.074405,0.225641,0.086505,0.488889,0.222222,0.005428,0.809989,0.0,0.277999,1.0
3,621e2e8e67b776a24055b564,2021-05-26,0.681027,0.00000,0.937500,0.083089,0.191432,0.075795,0.000000,0.000000,0.089286,0.191453,0.093426,0.493056,0.222222,0.005345,0.961998,0.0,0.269678,1.0
4,621e2e8e67b776a24055b564,2021-05-27,0.681027,0.59996,0.890278,0.087977,0.209107,0.090465,0.000000,0.000000,0.071429,0.227350,0.072664,0.431944,0.222222,0.005592,0.923996,0.0,0.277166,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8287,621e375b67b776a240290cdc,2022-01-01,0.724307,0.00000,0.736265,0.270419,0.264675,0.000000,0.025248,0.000112,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.681236,0.0,0.154455,0.0
8288,621e375b67b776a240290cdc,2022-01-02,0.724307,0.00000,0.736265,0.270419,0.264675,0.000000,0.025248,0.000112,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.681236,0.0,0.154455,0.0
8289,621e375b67b776a240290cdc,2022-01-03,0.724307,0.00000,0.736265,0.270419,0.264675,0.000000,0.025248,0.000112,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.681236,0.0,0.154455,0.0
8290,621e375b67b776a240290cdc,2022-01-04,0.724307,0.00000,0.736265,0.270419,0.264675,0.000000,0.025248,0.000112,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.681236,0.0,0.154455,0.0
