In [1]:
import pandas as pd
import dask.dataframe as dd

# CSV extracts to load; presumably one file per month named YYMM — TODO confirm.
file_list = ['2210.csv', '2211.csv', '2212.csv', '2301.csv', '2302.csv', '2303.csv', '2304.csv', '2305.csv', '2306.csv']

# Dask: build one lazy frame per file and concatenate them (still lazy).
df = dd.concat([dd.read_csv(file) for file in file_list])

# Dask evaluates lazily, so compute() is the step that actually reads the files
# and materialises the result as an in-memory pandas DataFrame.
# Every partition comes back with its own 0-based index, which leaves repeated
# row labels in the combined frame; reset to a unique RangeIndex so that any
# label-aligned operation (join, assignment from another frame) behaves.
df = df.compute().reset_index(drop=True)

In [4]:
# Keep only the station id, the hour/holiday columns and the rental count,
# then show how many rows each station contributes.
model_columns = ['stn_id', 'borrowed_hour', 'borrowed_num', 'is_holiday']
df = df.loc[:, model_columns]
df.loc[:, 'stn_id'].value_counts()

ST-702     6552
ST-1440    6552
ST-1714    6552
ST-1061    6552
ST-455     6552
           ... 
ST-383      411
ST-2038     400
ST-264      297
ST-2494       1
ST-1029       1
Name: stn_id, Length: 2758, dtype: int64

In [6]:
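# One-time setup, pinned to the version this notebook was run with
# (%pip installs into the active kernel's environment):
# %pip install -q tensorflow==2.15.0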

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/85/15/cf99a373812d37f8ae99752a34a9f5f690d820ceb5b302e922705bc18944/tensorflow-2.15.0-cp311-cp311-macosx_12_0_arm64.whl.metadata
  Downloading tensorflow-2.15.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.6 kB)
Collecting tensorflow-macos==2.15.0 (from tensorflow)
  Obtaining dependency information for tensorflow-macos==2.15.0 from https://files.pythonhosted.org/packages/eb/9f/0759e2fea4a3c48f070b64811c2c57036b46353ba87263afc810b8f4188a/tensorflow_macos-2.15.0-cp311-cp311-macosx_12_0_arm64.whl.metadata
  Downloading tensorflow_macos-2.15.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.2 kB)
Collecting absl-py>=1.0.0 (from tensorflow-macos==2.15.0->tensorflow)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0-py3-none-any.whl.metadata
  Downlo

  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)
Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow-macos==2.15.0->tensorflow)
  Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow-macos==2.15.0->tensorflow)
  Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.7/151.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow-2.15.0-cp311-cp311-macosx_12_0_arm64.whl (2.1 kB)
Downloading tensorflow_macos-2.15.0-cp311-cp311-macosx_12_0_arm64.whl (208.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.8/208.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hDownloading absl_py-2.0.0-py3-none-any.whl (130 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Encode the station-ID strings as integer codes so they can later be used
# as embedding indices.
le = LabelEncoder().fit(df['stn_id'])
df['stn_id'] = le.transform(df['stn_id'])

# Separate the target from the features, then hold out 20% of the rows
# as a test set (fixed random_state for a repeatable split).
y = df['borrowed_num']
X = df.drop(columns='borrowed_num')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
import tensorflow as tf


# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling (and therefore the reported metrics) are repeatable.
tf.keras.utils.set_random_seed(42)

# Size of each learned embedding vector.
embedding_dim = 50

# Largest encoded station ID; the embedding table needs max_id + 1 rows.
# Cast to a plain int so the layer config does not carry a NumPy scalar.
max_id = int(df['stn_id'].max())

# --- Model definition ---
# The intermediate tensors deliberately avoid the names `x` / `y`: `y` is the
# target Series created in the train/test split cell, and rebinding it to a
# Keras tensor here would silently break any later cell that reuses it.
stn_id_input = Input(shape=(1,), name='stn_id')
stn_features = Embedding(input_dim=max_id + 1, output_dim=embedding_dim)(stn_id_input)

borrowed_hour_input = Input(shape=(1,), name='borrowed_hour')
# 24 embedding rows: assumes borrowed_hour is an integer in 0..23 — TODO confirm.
hour_features = Embedding(input_dim=24, output_dim=embedding_dim)(borrowed_hour_input)

# is_holiday is fed to the final layer directly, without an embedding.
is_holiday_input = Input(shape=(1,), name='is_holiday')

# Each embedded input is a length-1 sequence (the inputs have shape (1,));
# the LSTM layers reduce each one to a 50-unit vector.
stn_features = LSTM(50)(stn_features)
hour_features = LSTM(50)(hour_features)

combined = tf.keras.layers.concatenate([stn_features, hour_features, is_holiday_input])

# Single linear unit: the model regresses one numeric target.
output = Dense(1)(combined)

model = Model(inputs=[stn_id_input, borrowed_hour_input, is_holiday_input], outputs=output)

# Compile the model.
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model; the inputs are passed in the same order as `inputs=` above.
# The History object is kept so the per-epoch losses can be inspected later.
history = model.fit(
    [X_train['stn_id'], X_train['borrowed_hour'], X_train['is_holiday']],
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x141d59490>

In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Predict on the held-out rows, feeding the three inputs in the order the
# model was built with.
test_inputs = [X_test[name] for name in ('stn_id', 'borrowed_hour', 'is_holiday')]
y_pred = model.predict(test_inputs)

# Regression metrics on the test set; RMSE is derived from the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics, one per line.
for label, value in (
    ('Mean Squared Error:', mse),
    ('Mean Absolute Error:', mae),
    ('Root Mean Squared Error:', rmse),
    ('R^2 Score:', r2),
):
    print(label, value)

Mean Squared Error: 8.471384592020955
Mean Absolute Error: 1.6644945659407109
Root Mean Squared Error: 2.9105643081747834
R^2 Score: 0.31372730327621856


In [None]:
# Disabled experiment (left commented out): an SGDRegressor with an L1 penalty,
# trained incrementally file by file on one-hot encoded station IDs.
# NOTE(review): a new OneHotEncoder is fitted inside the loop for every file, so
# the encoded column layout depends on which stations appear in that file —
# confirm it is identical across files before re-enabling partial_fit.
# from sklearn.linear_model import SGDRegressor
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.utils import shuffle

# regressor = SGDRegressor(penalty='l1')

# for file in file_list:
#     df = pd.read_csv(file)
#     df = df.loc[:, ['stn_id', 'borrowed_hour', 'borrowed_num', 'is_holiday']]
    
#     print(f"{file} one-hot encoding 시작")
    
#     enc = OneHotEncoder()
#     enc_df = pd.DataFrame(enc.fit_transform(df[['stn_id']]).toarray())
#     df = df.join(enc_df)
#     df = df.drop('stn_id', axis=1)

#     X = df.drop('borrowed_num', axis=1)
#     y = df['borrowed_num']
    
#     print(f"{file} 데이터 섞기 시작")

#     # Shuffle the data.
#     X, y = shuffle(X, y, random_state=42)

#     # Split: 80% training data, 20% validation data.
#     train_size = int(0.8 * len(X))
#     X_train, X_val = X[:train_size], X[train_size:]
#     y_train, y_val = y[:train_size], y[train_size:]
    
#     print(f"{file} 데이터 학습 시작")

#     # Train incrementally, one file per batch, via partial_fit.
#     regressor.partial_fit(X_train, y_train)

#     # Compute predictions.
#     y_pred = regressor.predict(X_val)

#     # Compute and print the evaluation metrics.
#     mse = mean_squared_error(y_val, y_pred)
#     mae = mean_absolute_error(y_val, y_pred)
#     r2 = r2_score(y_val, y_pred)

#     print(f'File: {file}')
#     print(f'MSE: {mse}')
#     print(f'MAE: {mae}')
#     print(f'R^2: {r2}\n')


In [None]:
# Disabled experiment (left commented out): Lasso regression on one-hot encoded
# station IDs, with alpha tuned by RandomizedSearchCV (3-fold, 100 draws).
# NOTE(review): df.join(enc_df) aligns on index labels and enc_df gets a default
# 0..n-1 index — confirm df has a matching unique index before re-enabling.
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import Lasso
# from sklearn import preprocessing
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform

# enc = OneHotEncoder()
# enc_df = pd.DataFrame(enc.fit_transform(df[['stn_id']]).toarray())
# df = df.join(enc_df)
# df = df.drop('stn_id', axis=1)

# X = df.drop('borrowed_num', axis=1)
# y = df['borrowed_num']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# parameters = {'alpha': uniform()}
# lasso = Lasso()
# lasso_regressor = RandomizedSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv=3, n_iter=100)
# lasso_regressor.fit(X_train, y_train)

# print("Best parameters: ", lasso_regressor.best_params_)
# print("Best score: ", lasso_regressor.best_score_)

In [None]:
# Disabled experiment (left commented out): train/validation/test split that
# uses only 'borrowed_hour' and 'is_holiday' as features.
# from sklearn.model_selection import train_test_split

# # Select the features, excluding 'stn_id'.
# features = ['borrowed_hour', 'is_holiday']
# target = 'borrowed_num'

# X = df[features]
# y = df[target]

# # Split into train and test sets.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Split the train set again into train and validation sets.
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Disabled experiment (left commented out): grid search over the Lasso alpha.
# NOTE(review): if re-enabled, `model = Lasso()` rebinds the name `model`, which
# the Keras cells also use — rename one of them before running both.
# from sklearn.linear_model import Lasso
# from sklearn.model_selection import GridSearchCV

# # Define the Lasso model.
# model = Lasso()

# # Define the hyperparameter range to search.
# param_grid = {'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]}

# # Find the best hyperparameters with GridSearchCV.
# grid_search = GridSearchCV(model, param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters.
# print('Best parameters: ', grid_search.best_params_)

# # Evaluate the model trained with the best hyperparameters on the validation set.
# print('Validation Score: ', grid_search.score(X_val, y_val))

In [None]:
# Disabled experiment (left commented out): test-set evaluation of the
# grid-searched model; it uses `grid_search`, which only the disabled
# grid-search cell defines.
# from sklearn.metrics import mean_absolute_error, mean_squared_error

# # Predict the test set with the model trained with the best hyperparameters.
# y_pred = grid_search.predict(X_test)

# # Compute the MAE.
# mae = mean_absolute_error(y_test, y_pred)
# print('Mean Absolute Error: ', mae)

# # Compute the MSE.
# mse = mean_squared_error(y_test, y_pred)
# print('Mean Squared Error: ', mse)