# 피트니스 데이터 KMeans-Clustering

## 빅데이터

### 이영석, 문현수

#### munhyunsu@cs-cnu.org

#### 데이터 경로 찾기

In [None]:
import os

In [None]:
# Collect the path of every file under the data root whose path ends in `ext`.
file_list = []
ext = '.json'

# Explicit stack of directories still to be scanned (no recursion needed).
pending = [os.path.abspath(os.path.expanduser('./sokulee'))]

counter = 0  # number of directories scanned so far
while pending:
    current = pending.pop()
    counter += 1
    with os.scandir(current) as listing:
        for item in listing:
            if item.is_dir():
                # Sub-directory: scan it on a later pass.
                pending.append(item.path)
                continue
            if item.is_file() and item.path.endswith(ext):
                file_list.append(item.path)

print(f'{counter}개의 디렉터리에서 {len(file_list)}개 {ext} 파일 발견')
# Show a short sample of what was found.
for path in file_list[:10]:
    print(path)

#### 데이터 살펴보기

In [None]:
import datetime
import json

import numpy as np
import pandas as pd

In [None]:
# Load one raw sleep log to inspect its structure.
# The file is resolved under the same ./sokulee data root that the directory
# scan uses, rather than through an absolute path that exists on only one
# machine.
path = os.path.abspath(os.path.expanduser(
    os.path.join('.', 'sokulee', 'A07', 'A07_20160405_sleep.json')))
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
data

In [None]:
# Load one raw steps log to inspect its structure.
# The file is resolved under the same ./sokulee data root that the directory
# scan uses, rather than through an absolute path that exists on only one
# machine.
path = os.path.abspath(os.path.expanduser(
    os.path.join('.', 'sokulee', 'A07', 'A07_20160513_steps.json')))
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
data

#### JSON raw data to DataFrame

In [None]:
# Build df_steps: one row per steps file with the user id, the date and the
# total step count for that date.
counter1 = 0  # files recognised as steps logs by their file name
counter2 = 0  # files that actually contributed a row
steps = []
for path in file_list:
    # Match on the file name only, so a directory whose name happens to
    # contain 'steps' cannot pull unrelated files into this loop.
    name = os.path.basename(path)
    if 'steps' not in name:
        continue
    counter1 += 1
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Skip files that lack either section, or whose summary list is empty;
    # they stay counted in counter1 but not in counter2.
    if ('activities-steps' not in data
            or 'activities-steps-intraday' not in data
            or not data['activities-steps']):
        continue
    # The user id is the part of the file name before the first underscore.
    user = name.split('_')[0]
    date = datetime.datetime.fromisoformat(data['activities-steps'][0]['dateTime'])
    # Daily total = sum of every intraday sample.
    value = sum(row['value'] for row in data['activities-steps-intraday']['dataset'])
    steps.append({'user': user,
                  'date': date,
                  'steps': value})
    counter2 += 1
# Naming the columns keeps the frame well-formed even when no file qualified.
df_steps = pd.DataFrame(steps, columns=['user', 'date', 'steps'])
print(f'{counter1}개 파일 중 {counter2}개 입력됨')
df_steps

In [None]:
# Build df_sleeps: one row per sleep record with the user id, the date of
# sleep and the computed wake-up timestamp.
counter1 = 0  # files recognised as sleep logs by their file name
counter2 = 0  # files that contributed at least one row
sleeps = []
for path in file_list:
    # Match on the file name only, so a directory whose name happens to
    # contain 'sleep' cannot pull unrelated files into this loop.
    name = os.path.basename(path)
    if 'sleep' not in name:
        continue
    counter1 += 1
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Skip files with no 'sleep' key or an empty record list, mirroring the
    # key check the steps loader performs.
    if 'sleep' not in data or not data['sleep']:
        continue
    # The user id is the part of the file name before the first underscore.
    user = name.split('_')[0]
    # A single file may hold several sleep records; each becomes its own row.
    for record in data['sleep']:
        date = datetime.datetime.fromisoformat(record['dateOfSleep'])
        # startTime is given a fixed +09:00 offset before parsing
        # (presumably KST - confirm against the data source).
        sleep_start = datetime.datetime.fromisoformat(f'{record["startTime"]}+09:00')
        # duration is interpreted as milliseconds.
        sleep_duration = datetime.timedelta(milliseconds=record['duration'])
        wakeup = sleep_start + sleep_duration
        sleeps.append({'user': user,
                       'date': date,
                       'wakeup': wakeup})
    counter2 += 1
# Naming the columns keeps the frame well-formed even when no file qualified.
df_sleeps = pd.DataFrame(sleeps, columns=['user', 'date', 'wakeup'])
print(f'{counter1}개 파일 중 {counter2}개 입력됨')
df_sleeps

#### 사용자, 날짜 기준 DataFrame 합치기

In [None]:
# Join steps and sleep rows that share the same user and date; only pairs
# present in both frames survive (inner join is the merge default).
df = df_steps.merge(df_sleeps, how='inner', on=['user', 'date'])
df

In [None]:
# Drop every row that has a NaN in any column, then renumber the index.
# reset_index returns a new DataFrame rather than modifying `df`, so its
# result has to be assigned back for the renumbering to take effect.
df = df[df.notna().all(axis=1)].reset_index(drop=True)
df

#### 클러스터링 데이터 준비

In [None]:
# Derive the two clustering features:
#   hour - hour-of-day component of the wake-up timestamp
#   sk   - step count divided by 1000, rounded, stored as an integer
# assign() builds a new frame instead of writing columns into `df`, which at
# this point can be a filtered selection of the merged frame; writing columns
# into such a selection is what pandas flags with SettingWithCopyWarning.
df = df.assign(
    hour=df['wakeup'].dt.hour,
    sk=np.round(df['steps'] / 10**3).astype('int'),
)
df

In [None]:
# Feature matrix for clustering: every row, restricted to the wake-up hour
# and steps-in-thousands columns.
X = df[['hour', 'sk']]
X

#### 데이터 살펴보기: 시각화

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Scatter of wake-up hour against steps (in thousands) before any clustering.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot()
ax.scatter(df['hour'], df['sk'])
ax.set_xlabel('Wakeup hour (H)', fontsize='large')
ax.set_ylabel('Steps (K)', fontsize='large')
ax.tick_params(labelsize='large')
# Binding the return value to `_` keeps the cell from echoing the Text object.
_ = ax.set_title('Wakeup hour and steps scatter chart', fontsize='x-large')

#### 클러스터링

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Configure (but do not yet fit) a K-means estimator with four clusters.
# random_state is left as None, i.e. no fixed seed is supplied.
n_clusters, random_state = 4, None
kmeans = KMeans(random_state=random_state, n_clusters=n_clusters)
kmeans

In [None]:
# Fit the K-means estimator on the feature matrix X (columns 'hour' and 'sk').
kmeans.fit(X)

In [None]:
# Display the cluster label the fitted model assigned to each row of X.
kmeans.labels_

In [None]:
# Display the coordinates of the fitted cluster centers.
kmeans.cluster_centers_

In [None]:
# Same scatter as before, now coloured by the cluster label of each row.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot()
ax.scatter(df['hour'], df['sk'], c=kmeans.labels_)
ax.set_xlabel('Wakeup hour (H)', fontsize='large')
ax.set_ylabel('Steps (K)', fontsize='large')
ax.tick_params(labelsize='large')
# Binding the return value to `_` keeps the cell from echoing the Text object.
_ = ax.set_title('Wakeup hour and steps scatter chart', fontsize='x-large')


#### 관성(inertia)을 통한 최적의 K 찾기

In [None]:
# Side-by-side comparison: fit K-means once per cluster count, starting at
# n_clusters = 2, and draw each result in its own cell of a rows x columns grid.
rows, columns = 2, 2
n_clusters = 2
random_state = None

fig = plt.figure(figsize=(8, 8))
fig.set_facecolor('white')
counter = 1  # 1-based subplot slot, advanced together with n_clusters
for _slot in range(rows * columns):
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X)
    ax = fig.add_subplot(rows, columns, counter)
    ax.scatter(df['hour'], df['sk'], c=kmeans.labels_)
    # Hide ticks and tick labels on both axes; only the colouring matters here.
    ax.tick_params(axis='y', which='both', left=False, labelleft=False)
    ax.tick_params(axis='x', which='both', bottom=False, labelbottom=False)
    ax.set_title(f'{n_clusters=}', fontsize='x-large')
    counter += 1
    n_clusters += 1

In [None]:
# Elbow plot: fit K-means for a run of cluster counts and chart each fitted
# model's inertia_ against its cluster count.
rows = 2
columns = 2
random_state = None

# Evaluate rows * columns consecutive cluster counts starting at 2.
x_data = list(range(2, 2 + rows * columns))
y_data = []
for n_clusters in x_data:
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(X)
    y_data.append(kmeans.inertia_)

fig = plt.figure(figsize=(8*1, 6*1))
fig.set_facecolor('white')
ax = fig.add_subplot()
ax.plot(x_data, y_data)
# Cluster counts are integers; pin the ticks to the evaluated values so the
# axis does not show fractional positions between them.
ax.set_xticks(x_data)
ax.tick_params(labelsize='large')
ax.set_xlabel('n_clusters', fontsize='large')
ax.set_ylabel('inertia', fontsize='large')
# Binding the return value to `_` keeps the cell from echoing the Text object.
_ = ax.set_title('K-means inertia by n_clusters', fontsize='x-large')