In [1]:
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import sklearn
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer ship the plain "seaborn" name, so use whichever
# spelling this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set(font_scale=1.4)

# Load the raw train / test tables from the working directory.
train, test_ = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Work on a copy so the untouched test table stays available as `test_`.
test = test_.copy()

### 데이터 전처리

In [2]:
# Dtypes that are treated as numeric columns.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Forward-fill missing values in the numeric columns: each gap takes the value
# of the row above it. `DataFrame.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling and produces the same result.
# NOTE: a missing value in the very first row has nothing above it and stays NaN.
for frame in (train, test):
    numeric_cols = frame.select_dtypes(include=numerics).columns
    frame[numeric_cols] = frame[numeric_cols].ffill()

In [3]:
# Build modelling tables without the row identifier (`index`) and the `day`
# column; the source frames `train` / `test` themselves are not modified.
unused_cols = ['index', 'day']
train_not_index = train.drop(columns=unused_cols)
test_not_index = test.drop(columns=unused_cols)

In [4]:
# Convert month and hour to the categorical dtype (currently disabled)
# train['month'] = train['month'].astype('category')
# train['hour'] = train['hour'].astype('category')

# Same categorical conversion of month and hour for the test set (currently disabled)
# test['month'] = test['month'].astype('category')
# test['hour'] = test['hour'].astype('category')

In [5]:
# `weather_condition` is a categorical feature stored as numbers, so it is
# one-hot encoded.
train_not_index = pd.get_dummies(train_not_index, columns=['weather_condition'])
test_not_index = pd.get_dummies(test_not_index, columns=['weather_condition'])

# The two tables are encoded independently, so a weather code that occurs in
# only one of them would leave train and test with different dummy columns.
# Give the test table exactly the dummy columns the training table has:
# codes missing from test become all-zero columns, and codes never seen in
# training are dropped. All other test columns are kept in their own order.
dummy_prefix = 'weather_condition_'
train_dummy_cols = [col for col in train_not_index.columns if col.startswith(dummy_prefix)]
test_other_cols = [col for col in test_not_index.columns if not col.startswith(dummy_prefix)]
test_not_index = test_not_index.reindex(columns=test_other_cols + train_dummy_cols, fill_value=0)

In [6]:
# Discard the `skin_temp` column from both modelling tables.
train_not_index = train_not_index.drop(columns=['skin_temp'])
test_not_index = test_not_index.drop(columns=['skin_temp'])

## 검증 데이터로 분리

베이스 코드에서는 모델의 성능을 비교하기 위해 train 데이터의 일부를 validation 데이터로 분리하여 진행합니다.

### Train, Validation data 분할하기

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set. The rows are shuffled
# with a fixed seed so the split is the same on every run.
train_data, val_data = train_test_split(
    train_not_index,
    test_size=0.2,
    shuffle=True,
    random_state=2023,
)

In [8]:
# Report the (rows, columns) of the training and validation splits, one per line.
print(train_data.shape, val_data.shape, sep='\n')

(3910, 8)
(978, 8)


In [9]:
# Separate the target column from the feature columns for each split.
target_col = 'count'
X_train, y_train = train_data.drop(columns=[target_col]), train_data[target_col]
X_val, y_val = val_data.drop(columns=[target_col]), val_data[target_col]
X_test = test_not_index  # used as-is (same object, not a copy)


# Print the shape of every feature matrix, a separator line, then the targets.
for label, obj in (
    ("X_train의 shape: ", X_train),
    ("X_val의 shape: ", X_val),
    ("X_test의 shape: ", X_test),
):
    print(label, obj.shape)
print("\t")
for label, obj in (
    ("y_train의 shape: ", y_train),
    ("y_val의 shape: ", y_val),
):
    print(label, obj.shape)

X_train의 shape:  (3910, 7)
X_val의 shape:  (978, 7)
X_test의 shape:  (576, 7)
	
y_train의 shape:  (3910,)
y_val의 shape:  (978,)


In [10]:
# Display the test feature matrix to check its columns against the training features.
X_test

Unnamed: 0,month,hour,real_temp,humidity,windspeed,weather_condition_1,weather_condition_2
0,1,0,18.86,77.0,22.0028,0,1
1,1,1,18.86,77.0,22.0028,0,1
2,1,2,18.04,51.0,32.9975,0,1
3,1,3,18.04,51.0,32.9975,0,1
4,1,4,13.94,46.0,35.0008,1,0
...,...,...,...,...,...,...,...
571,12,19,15.58,50.0,26.0027,1,0
572,12,20,14.76,50.0,15.0013,1,0
573,12,21,13.94,61.0,15.0013,1,0
574,12,22,13.94,61.0,6.0032,1,0


### 모델링

In [11]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error

In [12]:
# This competition predicts e-scooter demand, so a regression model is used.
# Submissions are scored with R2 (R-squared, the coefficient of determination), one of the regression metrics.

from sklearn.metrics import r2_score
# How to use the R2 function:
# r2 = r2_score(y_test, y_pred)

In [13]:
# Train the random-forest baseline. This cell must run: the submission step
# further down calls `rnd_clf.predict(X_test)`, which fails with a NameError
# on a fresh kernel if the model is never fitted.
# `random_state` is fixed (same seed as the train/validation split) so the
# scores and the submission are reproducible between runs.
rnd_clf = RandomForestRegressor(
    n_estimators=500,
    max_leaf_nodes=13,
    n_jobs=-1,
    random_state=2023,
)
rnd_clf.fit(X_train, y_train)

# Predictions on the held-out validation split.
y_rnd_pred = rnd_clf.predict(X_val)

# R2 on the training split, R2 on the validation split (via the estimator's
# own `score`), and the same validation R2 computed with `r2_score`.
print(rnd_clf.score(X_train, y_train))
print(rnd_clf.score(X_val, y_val))
print(r2_score(y_val, y_rnd_pred))
# print(r2_score(y_val, y_rnd_pred))

0.70411101167062
0.7037316317492395
0.7037316317492395


In [5]:
# Import only the PyCaret helpers this notebook calls; a star import hides
# where names come from and can silently shadow existing variables.
# NOTE: this requires the `pycaret` package to be installed in the kernel's
# environment (the recorded run failed with ModuleNotFoundError).
from pycaret.regression import (
    setup,
    compare_models,
    tune_model,
    blend_models,
    finalize_model,
    evaluate_model,
    predict_model,
)

# `setup` expects one table that still contains the target, plus the target
# column's name. `X_train` has had the target removed and 'charges' is not a
# column of this dataset, so pass `train_data` with target 'count'.
s = setup(data=train_data, target='count', session_id=123)

ModuleNotFoundError: No module named 'pycaret'

In [None]:
from pycaret.regression import RegressionExperiment, setup

# NOTE(review): this experiment object is not used by the functional-API calls
# in this notebook (`setup`, `compare_models`, ...); it is kept only so the
# name `s` stays defined as before.
s = RegressionExperiment()

# 1. Prepare the data
# The target column 'count' was dropped from `X_train`, so `setup` must be
# given `train_data`, which still contains it. The separate validation split
# is not passed in, so it stays unseen by PyCaret.
setup_clf = setup(data=train_data, target='count', train_size=0.7,
                  normalize=True, normalize_method='minmax',
                  session_id=777)

In [None]:
# 2. Compare models
# Rank the candidates by R2 - 'Accuracy' is a classification metric and is not
# available for regression, while R2 is the metric this competition is scored
# on. Keep the five best models from 5-fold cross-validation.
model = compare_models(sort='R2', fold=5, n_select=5)

In [None]:
# 3. Tune and ensemble
# Run `tune_model` on every selected candidate, keeping the results in order.
tuned_model = list(map(tune_model, model))

# Combine the tuned candidates into one blended estimator.
blended_model = blend_models(estimator_list=tuned_model)

In [None]:
# Evaluate the model: finalize the blended ensemble, then open PyCaret's
# evaluation view for the finalized estimator.
final_model = finalize_model(estimator=blended_model)
evaluate_model(estimator=final_model)

In [None]:
# The competition test set has no labels and no 'diagnosis' column (that name
# belongs to a classification example), and Accuracy is not a regression
# metric. Score the final model on the validation split produced by
# `train_test_split`, using R2 - the metric the competition is judged on.
prediction = predict_model(final_model, data=X_val)

# PyCaret 3.x names the prediction column 'prediction_label'; 2.x used 'Label'.
pred_col = 'prediction_label' if 'prediction_label' in prediction.columns else 'Label'
print("검증셋 R2: " + str(r2_score(y_val, prediction[pred_col])))

### 모델 선택

In [14]:
# Read the submission file from the working directory and display it.
submission = pd.read_csv(filepath_or_buffer='submission.csv')

submission

Unnamed: 0,index,count
0,1,39.085695
1,2,39.085695
2,3,39.085695
3,4,39.085695
4,5,39.085695
...,...,...
571,572,341.705866
572,573,295.495745
573,574,129.151196
574,575,127.600107


# 결과 제출

예측한 결과값을 submission 파일로 생성 

In [15]:
# Predict the target for every row of the test feature matrix with the fitted
# `rnd_clf` model.
test_predicted = rnd_clf.predict(X=X_test)

In [16]:
# Put the `index` identifier column from `test` back in front of the test features.
X_result = pd.concat(objs=[test['index'], X_test], axis='columns')
X_result

Unnamed: 0,index,month,hour,real_temp,humidity,windspeed,weather_condition_1,weather_condition_2
0,1,1,0,18.86,77.0,22.0028,0,1
1,2,1,1,18.86,77.0,22.0028,0,1
2,3,1,2,18.04,51.0,32.9975,0,1
3,4,1,3,18.04,51.0,32.9975,0,1
4,5,1,4,13.94,46.0,35.0008,1,0
...,...,...,...,...,...,...,...,...
571,572,12,19,15.58,50.0,26.0027,1,0
572,573,12,20,14.76,50.0,15.0013,1,0
573,574,12,21,13.94,61.0,15.0013,1,0
574,575,12,22,13.94,61.0,6.0032,1,0


In [17]:
# Two-column submission table: the row identifier and its predicted count.
sample_sub = X_result[['index']].assign(count=test_predicted)
sample_sub

Unnamed: 0,index,count
0,1,39.128898
1,2,39.128898
2,3,39.128898
3,4,39.128898
4,5,39.128898
...,...,...
571,572,338.942710
572,573,296.529997
573,574,128.882873
574,575,126.450492


In [18]:
# Write the submission table to disk without the DataFrame's row labels.
sample_sub.to_csv(path_or_buf="submission.csv", index=False)