# train

## 1. 패키지, 데이터 불러오기

In [53]:
import pandas as pd
import numpy as np
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet
)
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures


In [54]:
df = pd.read_csv('C:/Users/eunseok/Desktop/vscode/data/train.csv')

## 2. EDA

### 1. 전반적으로 확인

In [55]:
print(df.head())


         Date  Rented Bike Count  Hour  Temperature(째C)  Humidity(%)  \
0  01/12/2017                254     0             -5.2           37   
1  01/12/2017                204     1             -5.5           38   
2  01/12/2017                173     2             -6.0           39   
3  01/12/2017                107     3             -6.2           40   
4  01/12/2017                 78     4             -6.0           36   

   Wind speed (m/s)  Visibility (10m)  Dew point temperature(째C)  \
0               2.2              2000                      -17.6   
1               0.8              2000                      -17.6   
2               1.0              2000                      -17.7   
3               0.9              2000                      -17.6   
4               NaN              2000                      -18.6   

   Solar Radiation (MJ/m2)  Rainfall(mm)  Snowfall (cm) Seasons     Holiday  \
0                      0.0           0.0            0.0     NaN  No Holiday   


In [56]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8040 entries, 0 to 8039
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       8040 non-null   object 
 1   Rented Bike Count          8040 non-null   int64  
 2   Hour                       8040 non-null   int64  
 3   Temperature(째C)            7240 non-null   float64
 4   Humidity(%)                8040 non-null   int64  
 5   Wind speed (m/s)           7238 non-null   float64
 6   Visibility (10m)           8040 non-null   int64  
 7   Dew point temperature(째C)  8040 non-null   float64
 8   Solar Radiation (MJ/m2)    8040 non-null   float64
 9   Rainfall(mm)               8040 non-null   float64
 10  Snowfall (cm)              8040 non-null   float64
 11  Seasons                    7238 non-null   object 
 12  Holiday                    8040 non-null   object 
 13  Functioning Day            8040 non-null   objec

In [57]:
print(df.describe(include="all"))

              Date  Rented Bike Count         Hour  Temperature(째C)  \
count         8040        8040.000000  8040.000000      7240.000000   
unique         335                NaN          NaN              NaN   
top     01/12/2017                NaN          NaN              NaN   
freq            24                NaN          NaN              NaN   
mean           NaN         709.775995    11.500000        13.498564   
std            NaN         657.320605     6.922617        12.270471   
min            NaN           0.000000     0.000000       -17.800000   
25%            NaN         191.000000     5.750000         3.500000   
50%            NaN         485.000000    11.500000        15.600000   
75%            NaN        1080.500000    17.250000        23.300000   
max            NaN        3556.000000    23.000000        39.400000   

        Humidity(%)  Wind speed (m/s)  Visibility (10m)  \
count   8040.000000       7238.000000       8040.000000   
unique          NaN          

### 2. 결측치 확인

- column별 결측치 합

In [58]:
print(df.isna().sum())

Date                           0
Rented Bike Count              0
Hour                           0
Temperature(째C)              800
Humidity(%)                    0
Wind speed (m/s)             802
Visibility (10m)               0
Dew point temperature(째C)      0
Solar Radiation (MJ/m2)        0
Rainfall(mm)                   0
Snowfall (cm)                  0
Seasons                      802
Holiday                        0
Functioning Day                0
dtype: int64


- row별 결측치 개수

In [59]:
print(df.isna().sum(axis=1).value_counts())

0    5851
1    1983
2     197
3       9
Name: count, dtype: int64


#### 1. Wind speed (m/s) 결측치 처리

- 다른 요인과 독립적이라고 생각하고 그냥 전체평균으로 대체

In [60]:
global_wind_mean = df["Wind speed (m/s)"].mean(skipna=True)
df["Wind speed (m/s)"].fillna(global_wind_mean, inplace=True)

- 확인

In [61]:
print(df["Wind speed (m/s)"].isna().sum())

0


#### 2. Seasons 결측치 처리

- 계절 매핑 함수(date 이용)

In [62]:
def month_to_season(month: int) -> str:
    if month in (3, 4, 5):
        return "Spring"
    elif month in (6, 7, 8):
        return "Summer"
    elif month in (9, 10, 11):
        return "Autumn"
    else:
        return "Winter"

- 계절로 변경

In [63]:
df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y")

In [64]:
df["Seasons"] = df["Date"].dt.month.apply(month_to_season)

- 바뀌었는지 확인

In [65]:
print(df["Seasons"].isna().sum())

0


#### 3. Temperature(째C) 결측치 처리

- 계절별로 온도가 다른거 확인

In [66]:
season_temp_means = df.groupby("Seasons")["Temperature(째C)"].mean()
season_temp_means

Seasons
Autumn    17.274578
Spring    13.147726
Summer    26.610258
Winter    -2.485670
Name: Temperature(째C), dtype: float64

- 계절별 온도로 대체

In [67]:
df["Temperature(째C)"] = df.apply(
    lambda row: season_temp_means[row["Seasons"]]
                if pd.isna(row["Temperature(째C)"])
                else row["Temperature(째C)"],
    axis=1
)

- 확인

In [68]:
print(df["Temperature(째C)"].isna().sum())

0


#### 4. 확인

In [69]:
print(df.isna().sum())

Date                         0
Rented Bike Count            0
Hour                         0
Temperature(째C)              0
Humidity(%)                  0
Wind speed (m/s)             0
Visibility (10m)             0
Dew point temperature(째C)    0
Solar Radiation (MJ/m2)      0
Rainfall(mm)                 0
Snowfall (cm)                0
Seasons                      0
Holiday                      0
Functioning Day              0
dtype: int64


In [70]:
print(df.isna().sum(axis=1).value_counts())

0    8040
Name: count, dtype: int64


- 결측치 없는걸 확인

### 3. 범주형 변수 변환

In [71]:
for col in ["Seasons", "Holiday", "Functioning Day"]:
    df[col] = df[col].astype("category")


In [72]:
print(df[["Seasons","Holiday","Functioning Day"]].describe())

       Seasons     Holiday Functioning Day
count     8040        8040            8040
unique       4           2               2
top     Spring  No Holiday             Yes
freq      2208        7608            7817


- 더미변수화

In [73]:
df = pd.get_dummies(df,
                        columns=["Seasons","Holiday","Functioning Day"],
                        drop_first=True)

In [74]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8040 entries, 0 to 8039
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       8040 non-null   datetime64[ns]
 1   Rented Bike Count          8040 non-null   int64         
 2   Hour                       8040 non-null   int64         
 3   Temperature(째C)            8040 non-null   float64       
 4   Humidity(%)                8040 non-null   int64         
 5   Wind speed (m/s)           8040 non-null   float64       
 6   Visibility (10m)           8040 non-null   int64         
 7   Dew point temperature(째C)  8040 non-null   float64       
 8   Solar Radiation (MJ/m2)    8040 non-null   float64       
 9   Rainfall(mm)               8040 non-null   float64       
 10  Snowfall (cm)              8040 non-null   float64       
 11  Seasons_Spring             8040 non-null   bool          
 12  Season

## 3. 회귀분석

In [75]:
X = df.drop(columns=["Date", "Rented Bike Count"])
y = df["Rented Bike Count"]

#### multilinear

In [76]:
selector_lin = SelectKBest(score_func=f_regression, k="all")
selector_lin.fit(X, y)
f_scores_lin = pd.Series(selector_lin.scores_, index=X.columns) \
                 .sort_values(ascending=False)

In [77]:
n = len(y)           
p = X.shape[1]

In [78]:
lin_results = []
for K in range(1, len(X.columns) + 1):
    topK = f_scores_lin.index[:K]
    Xk   = X[topK]
    model = LinearRegression().fit(Xk, y)
    r2    = r2_score(y, model.predict(Xk))
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    lin_results.append({
        "K":        K,
        "Features": list(topK),
        "R2":       r2,
        "Adjusted R2":      adj_r2
    })

In [79]:
lin_df = pd.DataFrame(lin_results).sort_values("R2", ascending=False)
print(lin_df)

     K                                           Features        R2  \
13  14  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.556177   
12  13  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.550138   
11  12  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.548785   
10  11  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.548594   
9   10  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.539226   
8    9  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.537525   
7    8  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.484895   
6    7  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.463053   
5    6  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.459688   
4    5  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.457697   
3    4  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.457531   
2    3            [Temperature(째C), Seasons_Winter, Hour]  0.427190   
1    2                  [Temperature(째C), Seasons_Winter]  0.296948   
0    1

In [80]:
lin_df2 = pd.DataFrame(lin_results).sort_values("Adjusted R2", ascending=False)
print(lin_df2)

     K                                           Features        R2  \
13  14  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.556177   
12  13  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.550138   
11  12  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.548785   
10  11  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.548594   
9   10  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.539226   
8    9  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.537525   
7    8  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.484895   
6    7  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.463053   
5    6  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.459688   
4    5  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.457697   
3    4  [Temperature(째C), Seasons_Winter, Hour, Dew po...  0.457531   
2    3            [Temperature(째C), Seasons_Winter, Hour]  0.427190   
1    2                  [Temperature(째C), Seasons_Winter]  0.296948   
0    1

- 다 넣은게 가장 좋음

#### polynominal

In [81]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
poly_features = poly.get_feature_names_out(X.columns)
X_poly_df = pd.DataFrame(X_poly, columns=poly_features)

In [82]:
selector_poly = SelectKBest(score_func=f_regression, k="all")
selector_poly.fit(X_poly_df, y)
f_scores_poly = pd.Series(selector_poly.scores_, index=poly_features) \
                   .sort_values(ascending=False)

In [83]:
poly_results = []
for K in range(1, len(poly_features) + 1):
    topK = f_scores_poly.index[:K]
    Xpk  = X_poly_df[topK]
    model = LinearRegression().fit(Xpk, y)
    r2    = r2_score(y, model.predict(Xpk))
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    poly_results.append({
        "K":        K,
        "Features": list(topK),
        "R2":       r2,
        "Adjusted R2":      adj_r2
    })

In [84]:
poly_df  = pd.DataFrame(poly_results).sort_values("R2", ascending=False)
print(poly_df)

       K                                           Features        R2  \
118  119  [Hour Temperature(째C), Temperature(째C) Functio...  0.715074   
117  118  [Hour Temperature(째C), Temperature(째C) Functio...  0.715074   
116  117  [Hour Temperature(째C), Temperature(째C) Functio...  0.715074   
115  116  [Hour Temperature(째C), Temperature(째C) Functio...  0.715074   
114  115  [Hour Temperature(째C), Temperature(째C) Functio...  0.715074   
..   ...                                                ...       ...   
4      5  [Hour Temperature(째C), Temperature(째C) Functio...  0.525345   
3      4  [Hour Temperature(째C), Temperature(째C) Functio...  0.524266   
2      3  [Hour Temperature(째C), Temperature(째C) Functio...  0.472557   
1      2  [Hour Temperature(째C), Temperature(째C) Functio...  0.472106   
0      1                             [Hour Temperature(째C)]  0.464044   

     Adjusted R2  
118     0.714577  
117     0.714577  
116     0.714577  
115     0.714577  
114     0.714577  
..       

- 다 넣은게 가장 좋음

#### 여러 종류끼리 비교

In [85]:
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0, random_state=42),
    "Lasso": Lasso(alpha=0.1, random_state=42),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42),
}

In [86]:
results = []
for name, mdl in models.items():
    mdl.fit(X, y)
    preds = mdl.predict(X)
    results.append({
        "Model": name,
        "Validation MSE": mean_squared_error(y, preds),
        "Validation R2":   r2_score(y, preds)
    })

In [87]:
results_df = pd.DataFrame(results).sort_values("Validation R2", ascending=False)
print(results_df)

              Model  Validation MSE  Validation R2
0  LinearRegression   191739.071693       0.556177
1             Ridge   191739.902296       0.556175
2             Lasso   191740.952979       0.556172
3        ElasticNet   205390.618963       0.524577


- 선형회귀가 가장 R^2가 높음

# test

In [88]:
test_df =  pd.read_csv('C:/Users/eunseok/Desktop/vscode/data/test.csv')

## 1. eda

In [89]:
print(test_df.head())

         Date  Hour  Temperature(째C)  Humidity(%)  Wind speed (m/s)  \
0  01/11/2018     0              5.7           62               2.3   
1  01/11/2018     1              5.1           65               1.6   
2  01/11/2018     2              4.4           64               NaN   
3  01/11/2018     3              4.2           65               0.7   
4  01/11/2018     4              3.6           70               0.8   

   Visibility (10m)  Dew point temperature(째C)  Solar Radiation (MJ/m2)  \
0              1909                       -1.0                      0.0   
1              1932                       -0.9                      0.0   
2              2000                       -1.8                      0.0   
3              1962                       -1.8                      0.0   
4              1934                       -1.3                      0.0   

   Rainfall(mm)  Snowfall (cm) Seasons     Holiday Functioning Day  
0           0.0            0.0  Autumn  No Holiday   

In [90]:
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       720 non-null    object 
 1   Hour                       720 non-null    int64  
 2   Temperature(째C)            644 non-null    float64
 3   Humidity(%)                720 non-null    int64  
 4   Wind speed (m/s)           646 non-null    float64
 5   Visibility (10m)           720 non-null    int64  
 6   Dew point temperature(째C)  720 non-null    float64
 7   Solar Radiation (MJ/m2)    720 non-null    float64
 8   Rainfall(mm)               720 non-null    float64
 9   Snowfall (cm)              720 non-null    float64
 10  Seasons                    646 non-null    object 
 11  Holiday                    720 non-null    object 
 12  Functioning Day            720 non-null    object 
dtypes: float64(6), int64(3), object(4)
memory usage: 7

In [91]:
print(test_df.describe(include="all"))

              Date        Hour  Temperature(째C)  Humidity(%)  \
count          720  720.000000       644.000000   720.000000   
unique          30         NaN              NaN          NaN   
top     01/11/2018         NaN              NaN          NaN   
freq            24         NaN              NaN          NaN   
mean           NaN   11.500000         7.717857    57.726389   
std            NaN    6.926999         4.561766    20.613305   
min            NaN    0.000000        -3.000000    13.000000   
25%            NaN    5.750000         4.200000    41.000000   
50%            NaN   11.500000         7.400000    57.000000   
75%            NaN   17.250000        11.100000    75.000000   
max            NaN   23.000000        19.300000    97.000000   

        Wind speed (m/s)  Visibility (10m)  Dew point temperature(째C)  \
count         646.000000        720.000000                 720.000000   
unique               NaN               NaN                        NaN   
top         

## 2. 결측치 확인

In [92]:
print(test_df.isna().sum())

Date                          0
Hour                          0
Temperature(째C)              76
Humidity(%)                   0
Wind speed (m/s)             74
Visibility (10m)              0
Dew point temperature(째C)     0
Solar Radiation (MJ/m2)       0
Rainfall(mm)                  0
Snowfall (cm)                 0
Seasons                      74
Holiday                       0
Functioning Day               0
dtype: int64


In [93]:
print(test_df.isna().sum(axis=1).value_counts())

0    516
1    185
2     18
3      1
Name: count, dtype: int64


- 결측치가 있는걸 확인

In [94]:
global_wind_mean2 = test_df["Wind speed (m/s)"].mean(skipna=True)
test_df["Wind speed (m/s)"].fillna(global_wind_mean2, inplace=True)

In [95]:
test_df["Date"] = pd.to_datetime(test_df["Date"], format="%d/%m/%Y")
test_df["Seasons"] = test_df["Date"].dt.month.apply(month_to_season)

- 계절하나니까 그냥 전체 온도평균으로 대체

In [96]:
global_temp = test_df["Temperature(째C)"].mean(skipna=True)
test_df["Temperature(째C)"].fillna(global_temp, inplace=True)

In [97]:
print(test_df.isna().sum())

Date                         0
Hour                         0
Temperature(째C)              0
Humidity(%)                  0
Wind speed (m/s)             0
Visibility (10m)             0
Dew point temperature(째C)    0
Solar Radiation (MJ/m2)      0
Rainfall(mm)                 0
Snowfall (cm)                0
Seasons                      0
Holiday                      0
Functioning Day              0
dtype: int64


## 4. 회귀모델

In [98]:
final_model = LinearRegression()
final_model.fit(X, y)

In [99]:
test_dummies = pd.get_dummies(
    test_df,
    columns=["Seasons","Holiday","Functioning Day"],
    drop_first=False
)

In [100]:
feature_cols = df.columns

In [101]:
X_test_final = test_dummies.reindex(columns=X.columns, fill_value=0)

In [102]:
preds = final_model.predict(X_test_final)

- 예측값을 test에 넣기

In [103]:
test_df['Rented Bike Count'] = preds
print(test_df[['Rented Bike Count']].head())

   Rented Bike Count
0         473.095140
1         448.243521
2         465.348063
3         468.804114
4         437.244165
