<a href="https://colab.research.google.com/github/denisshaf/DSPractice/blob/DS-2.1/notebooks/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Practice DS - 2.1] Baseline modeling pt.1

In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from typing import Dict
import gc
from time import time

## MyDrive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!ln -s '/content/drive/MyDrive' '/mydrive'

ln: failed to create symbolic link '/mydrive/MyDrive': Input/output error


## Kaggle

In [33]:
!pip install kaggle



In [40]:
!mkdir -p /root/.kaggle/
!cp /mydrive/kaggle.json /root/.kaggle/

## Read data

In [4]:
dtypes = {'name2': 'object', 'name3': 'object'}
data = dd.read_parquet('/mydrive/DSPractice/data/eda/features.parquet', dtype=dtypes)

In [5]:
data

Unnamed: 0_level_0,date_block_num,shop_id,item_id,avg_price,item_cnt_month,item_revenue,shop_month_revenue,relative_delta_revenue,relative_shop_delta_revenue,shop_month_revenue_lag_1,shop_month_revenue_lag_4,shop_month_revenue_lag_12,relative_shop_delta_revenue_lag_1,relative_shop_delta_revenue_lag_4,relative_shop_delta_revenue_lag_12,month,year,city,type,item_category_id,name2,name3,category,subcategory
npartitions=66,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
,float64,int64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64,int64,string,string,int64,string,string,string,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [6]:
train_data = data[data['date_block_num'].between(0, 31)]
val_data = data[data['date_block_num'] == 32]
test_data = data[data['date_block_num'] == 33]

In [14]:
test_data = pd.read_csv('/mydrive/DSPractice/data/after_dqc/test.csv')

In [15]:
test_data

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268
...,...,...,...
214195,214195,45,18454
214196,214196,45,16188
214197,214197,45,15757
214198,214198,45,19648


## Simple previous months average

In [31]:
class PrevMonthAverage(BaseEstimator):
  "This model calculates the value for the next month previous `month_count` months"

  def __init__(self, *, month_count, int_output=False):
    self.month_count = month_count
    self.int_output = int_output

  def fit(self, X, y):

    if X.shape[0] != y.shape[0]:
      raise ValueError("Invalid shapes: X.shape[0] must be equal to y.shape[0]")

    last_month = X['date_block_num'].iloc[-1].item()

    if last_month + 1 < self.month_count:
      raise ValueError("Invalid value `month_count`: `month_count` must be greater or equal max(X['date_block_num']) + 1")

    start_month = last_month - self.month_count + 1
    self.data_ = X[X['date_block_num'].between(start_month, last_month)][['shop_id', 'item_id']]
    self.data_ = self.data_.merge(y, how='left', left_index=True, right_index=True)
    self.data_ = self.data_.rename(columns={y.name: 'target'})

    return self

  def predict(self, X):
    check_is_fitted(self)

    index_name = X.index.name if X.index.name is not None else 'index'
    X = X.reset_index().merge(self.data_, how='left', on=['shop_id', 'item_id']).set_index(index_name)

    y_pred = X.groupby(['shop_id', 'item_id']).mean('target')
    if self.int_output:
      y_pred = y_pred.round(0)
    return y_pred.reset_index()['target']

In [8]:
class ReplaceTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, replace: Dict[str, Dict]):
    self.replace = replace

  def fit(self, X, y=None):
      return self

  def transform(self, X, y=None):
      return X.replace(self.replace)

In [9]:
replace_shops_id = {'shop_id': {57: 0, 58: 1, 11: 10}}

In [10]:
X_train = train_data[['date_block_num', 'shop_id', 'item_id']].astype('float32').compute()
y_train = train_data['item_cnt_month'].astype('float32').compute()

X_val = val_data[['date_block_num', 'shop_id', 'item_id']].astype('float32').compute()
y_val = val_data['item_cnt_month'].astype('float32').compute()

X_test = test_data[['date_block_num', 'shop_id', 'item_id']].astype('float32').compute()
y_test = test_data['item_cnt_month'].astype('float32').compute()

In [128]:
pipe = Pipeline([('replacer', ReplaceTransformer(replace_shops_id)), ('predictor', PrevMonthAverage(month_count=5, int_output=True))])
pipe.fit(X_train, y_train)
y_val_pred = pipe.predict(X_val)
rmse = mean_squared_error(y_val, y_val_pred)

In [28]:
ind = pd.Index([1, 2])
ind.name is None

True

In [122]:
rmse

9.250426

In [None]:
pipe = Pipeline([('replacer', ReplaceTransformer(replace_shops_id)), ('predictor', PrevMonthAverage(month_count=20, int_output=False))])
pipe.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))
y_test_pred = pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pred)

In [None]:
rmse

In [32]:
pipe = Pipeline([('replacer', ReplaceTransformer(replace_shops_id)), ('predictor', PrevMonthAverage(month_count=5, int_output=False))])
pipe.fit(pd.concat([X_train, X_val, X_test]), pd.concat([y_train, y_val, y_test]))
baseline_submission = pipe.predict(test_data[['shop_id', 'item_id']])

In [36]:
baseline_submission = baseline_submission.reset_index()
baseline_submission.columns = ['ID', 'item_cnt_month']

In [38]:
baseline_submission.to_csv('/content/submission_csv', index=False)

In [41]:
!kaggle competitions submit competitive-data-science-predict-future-sales -f submission_csv -m "PrevMonthAverage baseline"

100% 2.14M/2.14M [00:00<00:00, 8.25MB/s]
Successfully submitted to Predict Future Sales

Baseline score on Kaggle: 1.76526