In [None]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

In [None]:
# !pip install pmdarima
# !pip install prophet

import os
import pickle

import pandas as pd
import pandas_datareader.data as pdr

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever name this installation provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
%matplotlib inline

import pmdarima as pm
from pmdarima.arima import auto_arima

from prophet import Prophet

import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
from datetime import datetime
from matplotlib.dates import MonthLocator, DateFormatter

# Mount Google Drive; the CSV read below lives under MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

# Render plot text with the NanumBarunGothic font.
plt.rc('font', family='NanumBarunGothic')

# Read the CSV, combine the year/month columns into a first-of-month
# timestamp, keep only city/date/people, and index the frame by date.
data = (
    pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/bigdata/remove_city_under_5000_2.csv")
    .assign(date=lambda frame: pd.to_datetime(frame[['year', 'month']].assign(day=1)))
    [['city', 'date', 'people']]
    .set_index('date')
)

# Frame used for model fitting: one row per (date, city) with the people count.
train_data = data.loc[:, ['city', 'people']]

# Distinct city names, in order of first appearance.
cities = pd.unique(data['city'])

# Maps each city name to its fitted model.
city_models = dict()

# Empty frame with the column layout of the forecast results.
result_df = pd.DataFrame(columns=['City', 'Date', 'People'])

In [None]:
# Forecast window: the first day of the current month plus the following six
# months (seven month-start stamps). It is the same for every city, so it is
# built once here instead of inside the loop.
current_date = datetime.now()
year = current_date.year
month = current_date.month
start_dt = datetime(year, month, 1)
date_range = pd.date_range(start=start_dt, periods=7, freq='MS')
end_dt = date_range[-1].to_pydatetime()

# Per-city forecast frames; concatenated once after the loop rather than
# growing a DataFrame row-block by row-block (DataFrame.append was removed in
# pandas 2.0).
city_results = []

for city in tqdm(cities):
  city_data = train_data[train_data['city'] == city]['people']

  # Skip cities with fewer than 60 observations.
  if len(city_data) < 60:
    continue

  city_data_diff = city_data.diff().dropna()

  # Plot the raw series and its first difference for visual inspection.
  city_data.plot()
  city_data_diff.plot()

  # Fit a seasonal ARIMA(0,1,0)x(2,1,1,12) model on this city's series.
  # (No lookup in city_models here: the dict is empty on a fresh run, so
  # reading city_models[city] before fitting would raise KeyError.)
  res = sm.tsa.statespace.SARIMAX(city_data, order=(0, 1, 0), seasonal_order=(2, 1, 1, 12), enforce_stationarity=True, enforce_invertibility=True).fit()

  # get_forecast continues from the last observation in city_data; the values
  # are attached to date_range by position below.
  # NOTE(review): the dates are only correct if the series ends in the month
  # right before start_dt - confirm against the input data.
  prediction = res.get_forecast(steps=len(date_range))

  # Keep the fitted model for this city.
  city_models[city] = res

  predicted_value = prediction.predicted_mean

  # Mean of the last 12 observations of this city's series.
  recent_12_months_avg = city_data.tail(12).mean()

  # Clip the forecast to between 0.1x and 5x of that recent mean.
  predicted_value_clipped = np.clip(predicted_value, recent_12_months_avg * 0.1, recent_12_months_avg * 5)

  # Collect this city's forecast rows.
  city_results.append(pd.DataFrame({
      'City': city,
      'Date': date_range,
      'People': np.asarray(predicted_value_clipped),
  }))

# Build the result table in one go; fall back to an empty frame with the
# expected columns when no city had enough data.
if city_results:
  result_df = pd.concat(city_results, ignore_index=True)
else:
  result_df = pd.DataFrame(columns=['City', 'Date', 'People'])

result_df['People'] = result_df['People'].astype(int)

# Output locations on the mounted Google Drive.
models_path = '/content/gdrive/MyDrive/Colab Notebooks/bigdata/models/city_models.pkl'
predictions_path = '/content/gdrive/MyDrive/Colab Notebooks/bigdata/predicted_travelers.csv'

# Create the models folder if it is missing so the pickle write cannot fail
# with FileNotFoundError after all models have already been fitted.
os.makedirs(os.path.dirname(models_path), exist_ok=True)

# Save the city -> fitted model dictionary as a pickle file.
with open(models_path, 'wb') as models_file:
    pickle.dump(city_models, models_file)

# Save the forecast table as CSV (UTF-8 with BOM, no index column).
result_df.to_csv(predictions_path, index=False, encoding="utf-8-sig")