In [1]:
!pip install xgboost



In [3]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

In [5]:
# Traffic CSV (EUC-KR encoded); its columns are previewed in the next cells.
traffic_csv = '/content/drive/MyDrive/새만금/새만금개발청_새만금 방조제 교통량_20230831.csv'
df1 = pd.read_csv(traffic_csv, encoding='euc-kr')

In [6]:
# Performance-event CSV (EUC-KR encoded); supplies the '행사시작일'/'행사종료일' columns used later.
events_csv = '/content/drive/MyDrive/새만금/새만금개발청_새만금지역 공연행사_20230830.csv'
df2 = pd.read_csv(events_csv, encoding='euc-kr')

In [7]:
# Festival CSV (EUC-KR encoded); supplies the '축제시작일'/'축제종료일' columns used later.
festivals_csv = '/content/drive/MyDrive/새만금/새만금개발청_새만금지역 축제현황_20230830.csv'
df3 = pd.read_csv(festivals_csv, encoding='euc-kr')

In [8]:
# Preview the first five rows of the traffic table.
df1.head(5)

Unnamed: 0,조사일 년,조사월,출발,도착지,대형 차량,소형 차량
0,2022,1,부안,군산,1096,32534
1,2022,2,부안,군산,984,29186
2,2022,3,부안,군산,963,27774
3,2022,4,부안,군산,1787,42658
4,2022,5,부안,군산,2210,44554


In [9]:
# Preview the last five rows of the traffic table.
df1.tail(5)

Unnamed: 0,조사일 년,조사월,출발,도착지,대형 차량,소형 차량
31,2023,2,군산,부안,29376,72107
32,2023,3,군산,부안,4953,114962
33,2023,4,군산,부안,8160,139575
34,2023,5,군산,부안,6790,131158
35,2023,6,군산,부안,6062,127995


In [10]:
# Parse the start/end date columns of the event (df2) and festival (df3)
# tables into datetimes so they can be fed to pd.date_range later.
for frame, date_columns in (
    (df2, ['행사시작일', '행사종료일']),
    (df3, ['축제시작일', '축제종료일']),
):
    for column in date_columns:
        frame[column] = pd.to_datetime(frame[column])

In [11]:
def expand_dates(df, start_col, end_col, label):
    """Expand each [start, end] date span into one row per calendar day.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per span (e.g. one event or festival).
    start_col, end_col : str
        Names of the columns holding the first and last day of each span.
        Both ends are inclusive, as in ``pd.date_range(start, end)``.
    label : str
        Name of the indicator column; every emitted row carries ``1`` in it.

    Returns
    -------
    pandas.DataFrame
        Columns ``['year', 'month', label]`` with one row per day covered by
        each span. The columns are present even when no row is produced, so a
        downstream ``groupby(['year', 'month'])`` does not fail with
        ``KeyError`` on empty input.

    Notes
    -----
    Rows whose start or end is missing (NaN/NaT/None) are skipped, because
    ``pd.date_range`` cannot build a range from a missing endpoint.
    """
    rows = []
    for _, row in df.iterrows():
        start, end = row[start_col], row[end_col]
        # A span without both endpoints cannot be assigned to any month.
        if pd.isna(start) or pd.isna(end):
            continue
        for date in pd.date_range(start, end):
            rows.append({'year': date.year, 'month': date.month, label: 1})
    # Passing ``columns`` fixes the schema even when ``rows`` is empty.
    return pd.DataFrame(rows, columns=['year', 'month', label])

In [12]:
# One row per calendar day covered by each event (df2) and each festival (df3).
event_dates = expand_dates(df2, start_col='행사시작일', end_col='행사종료일', label='행사')
festival_dates = expand_dates(df3, start_col='축제시작일', end_col='축제종료일', label='축제')

In [13]:
# Count the expanded rows per (year, month). Because expand_dates emits one row
# per covered day, each count is the number of event-days / festival-days
# falling in that month.
month_keys = ['year', 'month']
monthly_event = event_dates.groupby(month_keys).count().reset_index()
monthly_festival = festival_dates.groupby(month_keys).count().reset_index()

In [14]:
# Rename the key columns to the traffic table's year/month column names so the
# frames can be merged on them, and give each count column its final name.
for frame, count_name in ((monthly_event, '행사수'), (monthly_festival, '축제수')):
    frame.columns = ['조사일 년', '조사월', count_name]

In [15]:
# Attach the monthly event/festival counts to the traffic table. A left join
# keeps every traffic row; months without a match get NaN counts.
# Count columns left over from a previous run of this cell are dropped first:
# otherwise a re-run would merge them a second time and pandas would rename the
# clashing columns with _x/_y suffixes, breaking later '행사수'/'축제수' lookups.
merge_keys = ['조사일 년', '조사월']
df1 = (
    df1.drop(columns=['행사수', '축제수'], errors='ignore')
    .merge(monthly_event, on=merge_keys, how='left')
    .merge(monthly_festival, on=merge_keys, how='left')
)

In [16]:
# Months with no matching row in the monthly tables come out of the left
# merges as NaN; treat them as zero.
count_columns = ['행사수', '축제수']
df1[count_columns] = df1[count_columns].fillna(0)

In [17]:
# Preview the first five rows now that the count columns have been added.
df1.head(5)

Unnamed: 0,조사일 년,조사월,출발,도착지,대형 차량,소형 차량,행사수,축제수
0,2022,1,부안,군산,1096,32534,0.0,0.0
1,2022,2,부안,군산,984,29186,0.0,0.0
2,2022,3,부안,군산,963,27774,0.0,0.0
3,2022,4,부안,군산,1787,42658,0.0,0.0
4,2022,5,부안,군산,2210,44554,0.0,27.0


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

# 1. Prepare the data: one-hot encode the origin ('출발') and destination ('도착지') columns
df_model = pd.get_dummies(df1, columns=['출발', '도착지'])

numeric_features = ['조사일 년', '조사월', '행사수', '축제수']
dummy_features = [col for col in df_model.columns if col.startswith(('출발_', '도착지_'))]
X = df_model[numeric_features + dummy_features]
y = df_model['소형 차량']  # target: small-vehicle count

# 2. Split the data and scale the features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training split only, then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Define the candidate models
# NOTE(review): SVR is fit on the unscaled target with default hyper-parameters;
# this may explain its poor score — consider scaling y. TODO confirm.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("RandomForest", RandomForestRegressor(random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(random_state=42)),
    ("XGBoost", XGBRegressor(random_state=42)),
    ("SVR", SVR()),
])

# 4. Evaluation helper
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    name : str
        Label stored under the "Model" key of the result.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, X_test, y_train, y_test
        Feature/target splits used for fitting and scoring.

    Returns
    -------
    dict
        Keys "Model", "MAE", "MSE", "RMSE" and "R2 Score", in that order.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    report = {"Model": name}
    report["MAE"] = mean_absolute_error(y_test, predicted)
    squared_error = mean_squared_error(y_test, predicted)
    report["MSE"] = squared_error
    # RMSE is derived from the MSE rather than computed separately.
    report["RMSE"] = squared_error ** 0.5
    report["R2 Score"] = r2_score(y_test, predicted)
    return report

# 5. Evaluate every model
results = []
for name, model in models.items():
    # SVR and linear regression are given the scaled features;
    # the tree-based models use the raw features.
    use_scaled = name in ['SVR', 'LinearRegression']
    train_features = X_train_scaled if use_scaled else X_train
    test_features = X_test_scaled if use_scaled else X_test
    res = evaluate_model(name, model, train_features, test_features, y_train, y_test)
    results.append(res)

# 6. Show the results, best (lowest) RMSE first
results_df = pd.DataFrame(results)
results_df.sort_values(by='RMSE')


Unnamed: 0,Model,MAE,MSE,RMSE,R2 Score
1,RandomForest,19775.4425,694471900.0,26352.834939,0.522894
0,LinearRegression,24211.666386,832438700.0,28852.014494,0.42811
2,GradientBoosting,21857.469282,904101800.0,30068.28493,0.378878
3,XGBoost,23943.011719,988486100.0,31440.198473,0.320905
4,SVR,50362.957289,3568795000.0,59739.391836,-1.45178


In [19]:
# 1. Prepare the data: one-hot encode the origin ('출발') and destination ('도착지') columns
df_model = pd.get_dummies(df1, columns=['출발', '도착지'])

numeric_features = ['조사일 년', '조사월', '행사수', '축제수']
dummy_features = [col for col in df_model.columns if col.startswith(('출발_', '도착지_'))]
X = df_model[numeric_features + dummy_features]
y = df_model['대형 차량']  # target: large-vehicle count

# 2. Split the data and scale the features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training split only, then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Define the candidate models (fresh, unfitted instances)
# NOTE(review): SVR is fit on the unscaled target with default hyper-parameters;
# this may explain its poor score — consider scaling y. TODO confirm.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("RandomForest", RandomForestRegressor(random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(random_state=42)),
    ("XGBoost", XGBRegressor(random_state=42)),
    ("SVR", SVR()),
])

# 4. Evaluation helper
# NOTE(review): this re-defines evaluate_model with the same behavior as the
# definition in the previous modelling cell; the duplicate could be removed.
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    name : str
        Label stored under the "Model" key of the result.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, X_test, y_train, y_test
        Feature/target splits used for fitting and scoring.

    Returns
    -------
    dict
        Keys "Model", "MAE", "MSE", "RMSE" and "R2 Score", in that order.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    report = {"Model": name}
    report["MAE"] = mean_absolute_error(y_test, predicted)
    squared_error = mean_squared_error(y_test, predicted)
    report["MSE"] = squared_error
    # RMSE is derived from the MSE rather than computed separately.
    report["RMSE"] = squared_error ** 0.5
    report["R2 Score"] = r2_score(y_test, predicted)
    return report

# 5. Evaluate every model
results = []
for name, model in models.items():
    # SVR and linear regression are given the scaled features;
    # the tree-based models use the raw features.
    use_scaled = name in ['SVR', 'LinearRegression']
    train_features = X_train_scaled if use_scaled else X_train
    test_features = X_test_scaled if use_scaled else X_test
    res = evaluate_model(name, model, train_features, test_features, y_train, y_test)
    results.append(res)

# 6. Show the results, best (lowest) RMSE first
results_df = pd.DataFrame(results)
results_df.sort_values(by='RMSE')


Unnamed: 0,Model,MAE,MSE,RMSE,R2 Score
1,RandomForest,3844.88125,73850550.0,8593.634552,0.003195
0,LinearRegression,4033.900545,74096840.0,8607.952232,-0.000129
2,GradientBoosting,3749.608816,75525830.0,8690.559903,-0.019417
3,XGBoost,3707.303467,75676100.0,8699.200883,-0.021445
4,SVR,4938.63432,96214640.0,9808.906388,-0.298666
