In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

# Google Drive 연동
from google.colab import drive
drive.mount('/content/gdrive')

# 데이터 불러오기
data = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/bigdata/city_people_sum.csv")

# 날짜 정보를 활용하여 연도와 월을 하나의 날짜 형식으로 변환
data['date'] = pd.to_datetime(data[['year', 'month']].assign(day=1))

# 도시별로 모델 학습 및 예측 수행
cities = data['city'].unique()
predictions = {}
accuracies = []

for city in cities:
    city_data = data[data['city'] == city]

    # 데이터가 충분하지 않은 경우, 학습 및 예측을 수행하지 않음
    if len(city_data) < 100:
        print(f"City: {city} - Not enough data for training (Less than 100 samples)")
        continue

    # 2009년 12월부터 2022년 12월까지의 데이터로 학습
    train_data = city_data[(city_data['date'] >= '2009-12-01') & (city_data['date'] <= '2019-12-01')]

    # X (날짜)와 y (여행객 수) 데이터 설정
    X_train = train_data[['year', 'month']]
    y_train = train_data['people']

    # Linear Regression 모델 생성 및 학습
    model = LinearRegression()
    model.fit(X_train, y_train)

    # 2023년 1월부터 7월까지의 여행객 수 예측을 위한 데이터 준비
    forecast_dates = pd.date_range(start='2023-01-01', end='2023-07-01', freq='MS')
    forecast_data = pd.DataFrame({'year': forecast_dates.year, 'month': forecast_dates.month})

    # 2023년 1월부터 7월까지의 여행객 수 예측
    forecast = model.predict(forecast_data)
    forecast = np.where(forecast < 0, 0, forecast)

    # 각 도시와 월 간의 상관관계 계산
correlations = data.groupby(['city', 'month'])['people'].corr(data['people']).reset_index()

for _, row in correlations.iterrows():
    city = row['city']
    month = row['month']
    correlation_value = row['people']

    city_data = data[data['city'] == city]

    # 데이터가 충분하지 않은 경우, 학습 및 예측을 수행하지 않음
    if len(city_data) < 12:
        print(f"City: {city} - Not enough data for training (Less than 12 samples)")
        continue

    # 2009년 12월부터 2022년 12월까지의 데이터로 학습
    train_data = city_data[(city_data['date'] >= '2009-12-01') & (city_data['date'] <= '2022-12-01')]

    # X (날짜)와 y (여행객 수) 데이터 설정
    X_train = train_data[['year', 'month']]
    y_train = train_data['people']

    # Linear Regression 모델 생성 및 학습
    model = LinearRegression()
    model.fit(X_train, y_train)

    # 2023년 1월부터 7월까지의 여행객 수 예측을 위한 데이터 준비
    forecast_dates = pd.date_range(start='2023-01-01', end='2023-07-01', freq='MS')
    forecast_data = pd.DataFrame({'year': forecast_dates.year, 'month': forecast_dates.month})

    # 2023년 1월부터 7월까지의 여행객 수 예측
    forecast = model.predict(forecast_data)
    forecast = np.where(forecast < 0, 0, forecast)

    # 상관관계 값에 따라 예측값 조정
    if correlation_value > 0:
        forecast += correlation_value  # 양의 상관관계인 경우 예측값 더하기
    elif correlation_value < 0:
        forecast -= correlation_value  # 음의 상관관계인 경우 예측값 빼기

    predictions[city] = forecast

    # 해당 도시의 실제 데이터 가져오기
    actual_data = city_data[(city_data['date'] >= '2023-01-01') & (city_data['date'] <= '2023-07-01')]

    # 실제 데이터가 없는 경우를 확인
    if actual_data.empty:
        print(f"City: {city} - No actual data available for this period.")
        continue

    # 예측값과 실제값 출력
    print(f"City: {city}")

    # 예측된 여행객 수와 실제 여행객 수를 저장할 리스트 초기화
    predicted_values = []
    actual_values = []

    for date, forecast_value in zip(forecast_dates, forecast):
        # 실제 데이터에서 해당 날짜의 데이터 찾기
        actual_data_row = actual_data[actual_data['date'] == date]

        if not actual_data_row.empty:
            actual_value = actual_data_row['people'].values[0]
            if actual_value == 0:
              continue
            print(f"Date: {date.strftime('%Y-%m')}, Predicted People: {forecast_value:.2f}, Actual People: {actual_value:.2f}")

            # 예측된 여행객 수와 실제 여행객 수를 리스트에 추가
            predicted_values.append(forecast_value)
            actual_values.append(actual_value)

    if len(predicted_values) == 0:
        continue

    # RMSE 계산 및 정확도 계산
    if np.mean(actual_values) != 0:
        rmse = sqrt(mean_squared_error(actual_values, predicted_values))
        accuracy = 100 * (1 - rmse / np.mean(actual_values))
        accuracies.append(accuracy)

mean_accuracy = np.mean(accuracies)
print(f"Mean Accuracy for All Cities: {mean_accuracy:.2f}%")


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
City: ISA 공군기지 - Not enough data for training (Less than 100 samples)
City: 갈릴레오갈릴레이 - Not enough data for training (Less than 100 samples)
City: 고베 - Not enough data for training (Less than 100 samples)
City: 고찌 - Not enough data for training (Less than 100 samples)
City: 고테버그 - Not enough data for training (Less than 100 samples)
City: 골드코스트 - Not enough data for training (Less than 100 samples)
City: 과달라하라 - Not enough data for training (Less than 100 samples)
City: 구시로 - Not enough data for training (Less than 100 samples)
City: 구이양 - Not enough data for training (Less than 100 samples)
City: 구즈베이 - Not enough data for training (Less than 100 samples)
City: 그린빌-스파르탄버그 - Not enough data for training (Less than 100 samples)
City: 김포 - Not enough data for training (Less than 100 samples)
City: 김해 - Not enough data for training (Less than 100 samples)
City:

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.t

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Date: 2023-01, Predicted People: 14145.51, Actual People: 13447.00
Date: 2023-02, Predicted People: 14221.51, Actual People: 9292.00
Date: 2023-03, Predicted People: 14297.50, Actual People: 11533.00
Date: 2023-04, Predicted People: 14373.50, Actual People: 14304.00
Date: 2023-05, Predicted People: 14449.50, Actual People: 15031.00
Date: 2023-06, Predicted People: 14525.50, Actual People: 14717.00
Date: 2023-07, Predicted People: 14601.50, Actual People: 15526.00
City: 워싱턴 둘리스
Date: 2023-01, Predicted People: 14145.51, Actual People: 13447.00
Date: 2023-02, Predicted People: 14221.51, Actual People: 9292.00
Date: 2023-03, Predicted People: 14297.50, Actual People: 11533.00
Date: 2023-04, Predicted People: 14373.50, Actual People: 14304.00
Date: 2023-05, Predicted People: 14449.50, Actual People: 15031.00
Date: 2023-06, Predicted People: 14525.50, Actual People: 14717.00
Date: 2023-07, Predicted People: 14601.50, Actual People: 15526.00


In [5]:
# !apt-get install -y build-essential python-dev python3-dev
# !pip install --upgrade pystan
!pip install fbprophet

Collecting fbprophet
  Using cached fbprophet-0.7.1.tar.gz (64 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cmdstanpy==0.9.5 (from fbprophet)
  Using cached cmdstanpy-0.9.5-py3-none-any.whl (37 kB)
Collecting setuptools-git>=1.2 (from fbprophet)
  Using cached setuptools_git-1.2-py2.py3-none-any.whl (10 kB)
Building wheels for collected packages: fbprophet
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for fbprophet (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for fbprophet[0m[31m
[0m[?25h  Running setup.py clean for fbprophet
Failed to build fbprophet
[31mERROR: Could not build wheels for fbprophet, which is required to install pyproject.toml-based projects[0m[3