# 피트니스 데이터 선형회귀

## 빅데이터

### 이영석, 문현수

#### munhyunsu@cs-cnu.org

#### 데이터 경로 찾기

In [None]:
import os

In [None]:
# Collect the path of every *.json file below ./sokulee.
# The tree is walked iteratively with an explicit stack of directories that
# still have to be scanned (last pushed, first scanned).
ext = '.json'
file_list = []

root = os.path.abspath(os.path.expanduser('./sokulee'))
pending = [root]

counter = 0  # number of directories scanned so far
while pending:
    current_dir = pending.pop()
    counter += 1
    with os.scandir(current_dir) as entries:
        for entry in entries:
            if entry.is_dir():
                # Descend into this sub-directory later.
                pending.append(entry.path)
                continue
            if entry.is_file() and entry.path.endswith(ext):
                file_list.append(entry.path)

print(f'{counter}개의 디렉터리에서 {len(file_list)}개 {ext} 파일 발견')
# Show a small sample of what was found.
for path in file_list[:3]:
    print(path)

#### json raw data to DataFrame

In [None]:
import datetime
import json

import numpy as np
import pandas as pd

In [None]:
# Build one long-format DataFrame with columns (user, datetime, steps) from
# every JSON file in file_list whose path contains 'steps'.
#
# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end: DataFrame.append() was removed in pandas 2.0, and calling it once per
# file re-copied the whole frame on every iteration.
rows = []
counter1 = 0  # files whose path contains 'steps'
counter2 = 0  # files that were actually loaded
for path in file_list:
    if 'steps' not in path:
        continue
    counter1 = counter1 + 1
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # The daily summary supplies the date and the intraday series supplies the
    # per-time values; a file lacking either cannot be used.
    if 'activities-steps' not in data or 'activities-steps-intraday' not in data:
        print(f'처리 불가: {path}')
        continue
    # The user id is taken as the file-name prefix before the first '_'.
    user = os.path.basename(path).split('_')[0]
    day = data['activities-steps'][0]['dateTime']
    for row in data['activities-steps-intraday']['dataset']:
        # Combine date and time into a timezone-aware timestamp at UTC+09:00.
        datetime_str = f'{day}T{row["time"]}+09:00'
        rows.append({'user': user,
                     'datetime': datetime.datetime.fromisoformat(datetime_str),
                     'steps': row['value']})
    counter2 = counter2 + 1
df_steps = pd.DataFrame(rows, columns=['user', 'datetime', 'steps'])
print(f'{counter1}개 파일 중 {counter2}개 입력됨')
df_steps

#### 기초 정보: 기간, 참여 인원

In [None]:
# Report the time span covered by the data and the number of distinct users.
timestamps = df_steps['datetime']
first_seen, last_seen = min(timestamps), max(timestamps)
print(f'기간: {first_seen} ~ {last_seen}')
participants = df_steps['user'].unique()
print(f'참여 인원: {len(participants)}')

#### 사용자별 1일 발걸음 평균

In [None]:
# Total steps per (user, calendar day).  The day key is the 'YYYY-MM-DD'
# string formatted from each timestamp; it keeps the Series name 'datetime',
# so the second index level is called 'datetime'.
#
# Only the 'steps' column is aggregated.  Without the explicit selection the
# 'datetime' column would also be passed to sum(), which raises TypeError on
# pandas >= 2.0 (older versions silently dropped it with a warning).
day_key = df_steps['datetime'].dt.strftime('%Y-%m-%d')
daily_steps = df_steps.groupby(['user', day_key])[['steps']].sum()
daily_steps

In [None]:
# Mean daily step total per user.  The `level=` keyword of DataFrame.mean()
# was removed in pandas 2.0; grouping on the index level is the replacement.
daily_steps.groupby(level='user').mean()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Histogram of each user's mean daily step total.
# DataFrame.mean(level=...) was removed in pandas 2.0, so the per-user mean is
# computed by grouping on the 'user' index level instead.
mean_steps_by_user = daily_steps.groupby(level='user').mean()

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot()
ax.hist(mean_steps_by_user)
ax.tick_params(labelsize='large')
ax.set_xlabel('Daily steps', fontsize='large')
ax.set_ylabel('Count', fontsize='large')
_ = ax.set_title('Daily steps by user histogram', fontsize='x-large')

#### 사용자, 일일 발걸음 분석

In [None]:
# Empirical CDF of the daily step totals over all (user, day) pairs.
# The red dashed line marks the 10,000-step goal.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot()

# Use (largest daily total + 1) equal-width bins so the step curve is fine
# grained.
bin_count = daily_steps.values.max() + 1
cdf = ax.hist(daily_steps,
              bins=bin_count,
              cumulative=True,
              histtype='step',
              density=True)
ax.axvline(x=10000, color='red', linestyle='--')

ax.set_xlim((-200, 20200))
ax.set_ylim((-0.01, 1.01))
ax.tick_params(labelsize='large')
ax.set_ylabel('CDF', fontsize='large')
ax.set_xlabel('Daily steps', fontsize='large')
_ = ax.set_title('Daily steps CDF', fontsize='x-large')

#### 만보 달성 사용자

In [None]:
# Number of users who reached 10,000 steps on each day, as a one-column
# DataFrame ('count') indexed by the 'datetime' (day) level.
#
# DataFrame.count(level=...) was removed in pandas 2.0.  Summing the boolean
# mask per day, rather than filtering first and counting afterwards, keeps
# days on which nobody reached the goal as explicit zeros, so consecutive rows
# of `goal` correspond to consecutive days present in the data.
reached_goal = daily_steps['steps'] >= 10000
goal = (reached_goal.groupby(level='datetime')
                    .sum()
                    .astype(int)
                    .to_frame(name='count'))
goal.head()

In [None]:
# Daily number of users reaching the goal; the red dashed line marks half of
# all participants (integer division).
half_of_users = len(df_steps["user"].unique()) // 2

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot()
ax.plot(goal, color='green')
ax.axhline(y=half_of_users, color='red', linestyle='--')

# Day labels are long, so rotate them on the x axis.
ax.tick_params(axis='x', labelrotation=90, labelsize='medium')
ax.tick_params(axis='y', labelsize='large')
ax.set_ylabel('Count', fontsize='large')
ax.set_xlabel('Day', fontsize='large')
_ = ax.set_title('Daily goal achievement count', fontsize='x-large')

#### 선형 회귀

In [None]:
# Feature matrix for the regression: the row position of each day in `goal`
# (0, 1, 2, ...) as a single column, i.e. shape (len(goal), 1).
day_index = np.arange(len(goal))
X = day_index[:, np.newaxis]
X

In [None]:
# Regression target: the per-day count of users who reached the goal.
y = goal.loc[:, 'count']
y.head()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a straight line to the daily count against the day index (fit() returns
# the estimator itself) and score it on the same data to obtain R^2.
lr = LinearRegression().fit(X, y)
r_square = lr.score(X, y)

In [None]:
# Print R^2, slope and intercept of the fitted line, one per line.
print(f'결정 계수: {r_square}',
      f'기울기: {lr.coef_}',
      f'y절편: {lr.intercept_}',
      sep='\n')

In [None]:
# Fitted values of the linear model on the training inputs (used for plotting).
y_predict = lr.predict(X=X)

In [None]:
# Scatter of the observed daily counts with the fitted regression line on top.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot()

actual_style = dict(color='green', marker='o', linestyle='None', label='Actual')
ax.plot(X, y, **actual_style)
ax.plot(X, y_predict, color='blue', label='Predict')

for axis_name, label_size in (('x', 'medium'), ('y', 'large')):
    ax.tick_params(axis=axis_name, labelsize=label_size)
ax.set_xlabel('Day', fontsize='large')
ax.set_ylabel('Count', fontsize='large')
ax.set_title('Daily goal achievement count', fontsize='x-large')
_ = ax.legend()

#### (오버 피팅 주의) 다항 회귀

In [None]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [None]:
# Polynomial regression: expand the day index into polynomial features up to
# `degree`, then fit an ordinary linear regression on them.
degree = 3
pr = make_pipeline(PolynomialFeatures(degree=degree),
                   LinearRegression()).fit(X, y)
r_square = pr.score(X, y)

In [None]:
# Print R^2, the generated polynomial feature names, and the coefficients and
# intercept of the regression step of the pipeline.
#
# PolynomialFeatures.get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) replaces it.  The old method
# is kept as a fallback for older installations, and the new result is turned
# into a plain list so the printed form stays the same.
poly = pr[0]
if hasattr(poly, 'get_feature_names_out'):
    feature_names = poly.get_feature_names_out().tolist()
else:
    feature_names = poly.get_feature_names()

print(f'결정 계수: {r_square}')
print(f'차수: {feature_names}')
print(f'기울기: {pr[1].coef_}')
print(f'y절편: {pr[1].intercept_}')

In [None]:
# Fitted values of the polynomial model on the training inputs (used for
# plotting).
y_predict = pr.predict(X=X)

In [None]:
# Scatter of the observed daily counts with the fitted polynomial curve on top.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot()
ax.plot(X, y, color='green', marker='o', linestyle='None', label='Actual')
ax.plot(X, y_predict, color='blue', label='Predict')
ax.tick_params(axis='x', labelsize='medium')
ax.tick_params(axis='y', labelsize='large')
ax.set_xlabel('Day', fontsize='large')
ax.set_ylabel('Count', fontsize='large')
# y_predict here comes from the polynomial pipeline `pr`, so the plot is
# labelled as polynomial regression (it previously said 'Linear regression').
ax.set_title('Polynomial regression', fontsize='x-large')
_ = ax.legend()

In [None]:
# Fit polynomial regressions of increasing degree on the same data, print the
# statistics of each fit and overlay all fitted curves on the observations.
# R^2 is computed on the training data, so it is expected to grow with the
# degree; that alone does not indicate a better model (over-fitting).
fig = plt.figure(figsize=(8, 6))
degree_and_color = [(1, 'blue'),
                    (2, 'purple'),
                    (3, 'orange'),
                    (4, 'pink'),
                    (5, 'brown'),
                    (6, 'cyan'),
                    (7, 'olive'),]
ax = fig.add_subplot()
ax.plot(X, y, color='green', marker='o', linestyle='None', label='Actual')
for degree, color in degree_and_color:
    pr = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    pr.fit(X, y)
    y_predict = pr.predict(X)
    r_square = pr.score(X, y)

    # get_feature_names() was removed in scikit-learn 1.2; prefer
    # get_feature_names_out() and fall back for older installations.  The new
    # result is converted to a plain list so the printed form stays the same.
    poly = pr[0]
    if hasattr(poly, 'get_feature_names_out'):
        feature_names = poly.get_feature_names_out().tolist()
    else:
        feature_names = poly.get_feature_names()

    print(f'결정 계수: {r_square}')
    print(f'차수: {feature_names}')
    print(f'기울기: {pr[1].coef_}')
    print(f'y절편: {pr[1].intercept_}')
    ax.plot(X, y_predict, color=color, label=f'Degree {degree}')
ax.tick_params(axis='x', labelsize='medium')
ax.tick_params(axis='y', labelsize='large')
ax.set_xlabel('Day', fontsize='large')
ax.set_ylabel('Count', fontsize='large')
ax.set_title('Polynomial regression', fontsize='x-large')
_ = ax.legend()