# Settings

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data_path = '/content/drive/MyDrive/data/train5.parquet'
test_path = '/content/drive/MyDrive/data/predict5.parquet'


In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 768 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


# Load data

In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
#from category_encoders import TargetEncoder

In [None]:
categories = pd.read_csv('item_categories.csv')
items = pd.read_csv('items.csv')
shops = pd.read_csv('shops.csv')
sales = pd.read_csv('sales_train.csv')
# population = pd.read_csv('population_full.csv', delimiter=';')
# dollar = pd.read_csv('rub.csv').reset_index(level=[0])
# dollar.rub = [float(x.split(' ')[0]) for x in dollar.rub]
# dollar.columns=['date_block_num', 'dollar']

In [None]:
PREDICT_MONTH = 34

In [None]:
def merge(a, b, on=None, how = 'inner', check_left=False, check_right=False, fillna=None):
    """Join two frames with ``pd.merge`` and optionally sanity-check the row count.

    Parameters
    ----------
    a, b : DataFrame
        Left and right operands of the join.
    on : label or list of labels, optional
        Join key(s), forwarded to ``pd.merge``.
    how : str
        Join type, forwarded to ``pd.merge``.
    check_left, check_right : bool
        When set, assert that the result has exactly as many rows as ``a``
        (respectively ``b``), i.e. the join neither dropped nor duplicated rows.
    fillna : scalar or dict, optional
        When not ``None``, missing values in the result are replaced with it.

    Returns
    -------
    DataFrame
        The joined (and possibly filled) frame.

    Raises
    ------
    AssertionError
        If a requested row-count check fails.
    """
    joined = pd.merge(a, b, on=on, how=how)
    n_rows = len(joined)
    if check_left:
        assert len(a) == n_rows
    if check_right:
        assert len(b) == n_rows
    return joined if fillna is None else joined.fillna(fillna)

# Change types

In [None]:
shops.shop_id = shops.shop_id.astype(np.int8)
categories.item_category_id = categories.item_category_id.astype(np.int8)
items.item_id = items.item_id.astype(np.int16)
sales.item_id = sales.item_id.astype(np.int16)
sales.shop_id = sales.shop_id.astype(np.int8)
sales.date_block_num = sales.date_block_num.astype(np.int8)
sales.item_cnt_day = sales.item_cnt_day.astype(np.int32)
sales.item_price = sales.item_price.astype(np.float32)
# dollar.dollar = dollar.dollar.astype(np.float32)
# population['pop'] = population['pop'].astype(np.int32)


# Extract cities

In [None]:
# Drop the first character of the column-0 value in rows 0 and 1.
# NOTE(review): presumably column 0 is `shop_name` and these two names carry a stray
# leading character - confirm against shops.csv. This is not idempotent: every re-run
# of the cell removes one more character from both values.
shops.iloc[0,0] = shops.iloc[0,0][1:]
shops.iloc[1,0] = shops.iloc[1,0][1:]
# The city is taken to be the first space-separated token of the shop name.
shops['city'] = [x.split(' ')[0] for x in shops.shop_name]

# Extract categories

In [None]:
# `cat`: the text before the first '-', cut again at the first '(' and trimmed.
categories['cat'] = [x.split('-')[0].strip().split('(')[0].strip() for x in categories['item_category_name'] ]
# `sub_cat`: the second piece when splitting on " - "; names without a second piece
# fall back to the `cat` value.
categories['sub_cat'] = categories["item_category_name"].str.split(" - ").str.get(1).str.strip().fillna(categories['cat'])
# Remove any parenthesised suffix from `sub_cat` as well.
categories['sub_cat'] = categories['sub_cat'].str.split('(').str.get(0).str.strip()

Optional: extra normalization of category names (currently disabled).



In [None]:
# categories['sub_cat'] = categories['sub_cat'].str.replace(' 360', '')
# categories['sub_cat'] = categories['sub_cat'].str.replace(' ONE', '')
# categories['sub_cat'] = categories['sub_cat'].str.replace(' 3D', '')
# categories['sub_cat'] = categories['sub_cat'].str.replace(' 4K', '')
# categories['sub_cat'] = categories['sub_cat'].str.replace(' 1С', '')
# categories['cat'] = ['Игры' if x == 'Игры PC' or x == 'Игры Android' or x == 'Игры MAC' else x for x in categories['cat']]

# Clean

In [None]:
shops = shops.drop(['shop_name'], axis=1)
categories = categories.drop(['item_category_name'], axis=1)
items = items.drop(['item_name'], axis=1)

In [None]:
len(sales)

2935849

In [None]:
2930306 - 2935849

-5543

# Fill 0 count and price

In [None]:
class MissiingDatesFiller():
    """Expand sales into a dense (item, shop, month) grid of monthly counts.

    One row is produced for every combination of ``item_id`` in ``range(len_item)``,
    ``shop_id`` in ``range(len_shop)`` and ``date_block_num`` in ``range(len_date)``.
    Combinations without any recorded sale get a count of 0.

    NOTE(review): this assumes the ids are the contiguous integers 0..len-1 -
    confirm against the source tables.

    Parameters
    ----------
    len_shop, len_item, len_date : int
        Number of distinct shops, items and months spanned by the grid.
    """
    def __init__(self, len_shop, len_item, len_date):
        self.len_shop = len_shop
        self.len_item = len_item
        self.len_date = len_date

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, sales, y=None):
        """Return the dense grid with an ``item_cnt_month`` column.

        Parameters
        ----------
        sales : DataFrame
            Needs ``item_id``, ``shop_id``, ``date_block_num`` and ``item_cnt_day``.
        y : ignored

        Returns
        -------
        DataFrame
            Columns ``item_id``, ``shop_id``, ``date_block_num``, ``item_cnt_month``;
            rows ordered by item, then month, then shop.
        """
        n_item, n_shop, n_date = self.len_item, self.len_shop, self.len_date
        # Vectorised cartesian product. The row order (item -> month -> shop) and the
        # int64 dtype match what the nested Python-list construction produced, without
        # materialising tens of millions of Python ints.
        grid = pd.DataFrame({
            'item_id': np.repeat(np.arange(n_item, dtype=np.int64), n_date * n_shop),
            'shop_id': np.tile(np.arange(n_shop, dtype=np.int64), n_item * n_date),
            'date_block_num': np.tile(
                np.repeat(np.arange(n_date, dtype=np.int64), n_shop), n_item),
        })
        keys = ['item_id', 'shop_id', 'date_block_num']
        monthly = sales.groupby(keys)['item_cnt_day'].sum().reset_index()
        # Left join keeps every grid row; unmatched rows get NaN, which becomes 0.
        data = merge(grid, monthly, how='left', on=keys, fillna=0, check_left=True)
        return data.rename(columns={'item_cnt_day': 'item_cnt_month'})

    def fit_transform(self, X, Y=None):
        """Convenience wrapper: ``fit`` then ``transform`` on the same frame."""
        self.fit(X, Y)
        return self.transform(X, Y)

In [None]:
class MissingPriceFiller():
    """Attach an ``item_price`` to every row of the dense (item, shop, month) grid.

    The price is the mean sale price of the item in that shop and month. Where that
    is missing it falls back to the item's mean price across shops for the month,
    forward-filled and then back-filled along the month axis per item. Items that
    were never sold keep a missing price.

    Parameters
    ----------
    len_shop, len_item, len_date : int
        Grid dimensions. ``len_item`` and ``len_date`` size the per-item monthly
        price table; ``len_shop`` is stored but not used here.
    """
    def __init__(self, len_shop, len_item, len_date):
        self.len_shop = len_shop
        self.len_item = len_item
        self.len_date = len_date

    def fit(self, data, sales):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, data, sales):
        """Return ``data`` with an ``item_price`` column added.

        Parameters
        ----------
        data : DataFrame
            Grid with ``item_id``, ``shop_id`` and ``date_block_num``.
        sales : DataFrame
            Raw sales with the same three keys plus ``item_price``.
        """
        shop_keys = ['item_id', 'shop_id', 'date_block_num']
        item_keys = ['item_id', 'date_block_num']

        # Exact price: mean over the sales of this item in this shop and month.
        shop_price = sales.groupby(shop_keys)['item_price'].mean().reset_index()
        data = merge(data, shop_price, on=shop_keys, how='left', check_left=True)

        # Fallback price: the item's mean over all shops, per month.
        item_price = sales.groupby(item_keys)[['item_price']].mean().reset_index()
        item_price.columns = ['item_id', 'date_block_num', 'mean_item_price']

        # Full item x month grid, sized from the constructor arguments rather than
        # the global `items` table, so the instance is self-contained.
        grid = pd.DataFrame({
            'item_id': np.repeat(
                np.arange(self.len_item, dtype=np.int64), self.len_date),
            'date_block_num': np.tile(
                np.arange(self.len_date, dtype=np.int64), self.len_item),
        })
        # The right join keeps the grid's row order (months ascending within each
        # item), which the directional fills below depend on.
        item_price = merge(item_price, grid, on=item_keys, how='right', check_right=True)

        # GroupBy.ffill()/bfill() replace the deprecated fillna(method=...) spelling.
        item_price['mean_item_price'] = (
            item_price.groupby('item_id')['mean_item_price'].ffill())
        item_price['mean_item_price'] = (
            item_price.groupby('item_id')['mean_item_price'].bfill())

        data = merge(data, item_price, on=item_keys, how='left', check_left=True)
        data['item_price'] = data['item_price'].fillna(data['mean_item_price'])
        return data.drop(['mean_item_price'], axis = 1)

    def fit_transform(self, X, Y=None):
        """Convenience wrapper: ``fit`` then ``transform``; ``Y`` is the sales frame."""
        self.fit(X, Y)
        return self.transform(X, Y)

In [None]:
# Build the dense item x shop x month grid for months 0..PREDICT_MONTH inclusive
# (hence the +1), then attach a price to every row of that grid.
data = MissiingDatesFiller(len(shops), len(items), PREDICT_MONTH + 1).fit_transform(sales)
data = MissingPriceFiller(len(shops), len(items), PREDICT_MONTH + 1).fit_transform(data,sales)

# Predict dollar

In [None]:
class DollarPredictor():
    """Extrapolate a per-month table to one further month with a regression model.

    Parameters
    ----------
    model : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``.
    month : int
        The ``date_block_num`` value to predict for.
    """
    def __init__(self, model, month):
        self.model = model
        self.month = month

    def fit_predict(self, df, y=None):
        """Fit on ``df`` and return the predicted row for ``self.month``.

        The regressor is ``date_block_num`` as a single-column matrix. The target is
        the whole of ``df`` (every column, ``date_block_num`` included), so the model
        predicts one value per column of ``df``. The ``y`` argument is ignored.

        Returns
        -------
        The first (and only) row of the model's prediction.
        """
        features = df['date_block_num'].to_numpy().reshape(-1, 1)
        targets = df.to_numpy()
        self.model.fit(features, targets)
        query = np.array([self.month]).reshape(1, -1)
        prediction = self.model.predict(query)
        return prediction[0]

In [None]:
lr = LinearRegression()
# The statements that load `dollar` are commented out in the "Load data" cell, so on a
# fresh kernel the name does not exist. Only extrapolate the exchange rate when the
# table has actually been loaded, instead of failing with a NameError.
if 'dollar' in globals():
    dollar.loc[PREDICT_MONTH] = DollarPredictor(lr, PREDICT_MONTH).fit_predict(dollar)

# Merge all and fill unknown prices of items 

In [None]:
# Attach shop, item and category attributes to every grid row. Each join asserts
# (check_left) that the row count of `data` is unchanged.
# The population and dollar joins are disabled, matching their commented-out loaders.
#shops = merge(shops, population, on = 'city', check_left=True)
data = merge(data, shops, on = 'shop_id', check_left=True)
data = merge(data, items, on = 'item_id', check_left=True)
data = merge(data, categories, on = 'item_category_id', check_left=True)
#data = merge(data, dollar, on = 'date_block_num', check_left=True)
len(data)

46557000

In [None]:
data['item_price'] = data['item_price'].fillna(data.groupby('sub_cat')['item_price'].transform('mean'))

# Clean data

In [None]:
data.item_id = data.item_id.astype(np.int16)
data.shop_id = data.shop_id.astype(np.int8)
data.date_block_num = data.date_block_num.astype(np.int8)
data.item_cnt_month = data.item_cnt_month.astype(np.int16)
data.item_price = data.item_price.astype(np.float32)
#data.dollar = data.dollar.astype(np.float32)

In [None]:
data = data.drop(['item_category_id'], axis=1)

In [None]:
len(data)     

46557000

# Rolling features

In [None]:
from category_encoders import TargetEncoder

In [None]:
class RollingFeatureCreator():
    """Add rolling-window statistics of past monthly sales totals as new columns.

    `params` is an iterable of `(agg_name, window, func)` tuples: the column to group
    by, the rolling window length in rows, and the aggregation name applied to the
    window.
    """
    def __init__(self, params):
        self.params = params

    def fit(self, X, y=None):
        """Nothing is learned; returns `self`."""
        return self

    def transform(self, X, y=None):
        """Return `X` with one `rol_<func>_<group><window>` column per entry in `params`."""
        for agg_name, window, func in self.params:
            # Build the column name: a trailing '_id' is dropped from the group name and
            # 'mean' is spelled 'avg', e.g. ('item_id', 6, 'mean') -> 'rol_avg_item6'.
            col_name = agg_name
            if agg_name.endswith('_id'):
                col_name = agg_name[:-3]
            func_name = func
            if func == 'mean':
                func_name = 'avg'
            col_name = f'rol_{func_name}_{col_name}{window}'
            # Total item_cnt_month per (group value, date_block_num).
            sample = X.groupby([agg_name, 'date_block_num'])[['item_cnt_month']].sum().reset_index([0,1])
            # Shift by one row within each group so a row's window ends at the preceding
            # row and never includes the row's own total.
            sample[col_name] = sample.groupby(agg_name)['item_cnt_month'].shift(1)
            # Rolling aggregate per group; reset_index(0, drop=True) removes the group
            # level that the grouped rolling adds, so the result lines up with `sample`.
            sample[col_name] = sample.groupby(agg_name)[col_name].rolling(window).agg({col_name: func}).reset_index(0, drop=True)
            # Rows without a complete window come out missing and are set to 0.
            sample[col_name] = sample[col_name].fillna(0)
            X = merge(X, 
                      sample[[agg_name, 'date_block_num', col_name]], 
                      on=[agg_name, 'date_block_num'], 
                      how = 'left',
                      check_left=True)
            # Rebind the name so the intermediate frame is no longer referenced.
            sample = 0
            # Averages-like statistics stay float; everything else is stored as int32.
            if func in ['mean', 'median', 'std']:
                X[col_name] = X[col_name].astype(np.float32)
            else:
                X[col_name] = X[col_name].astype(np.int32)
        return X

In [None]:
class LagFeatureCreator():
    """Add per-(item, shop) lagged copies of a column.

    ``params`` is an iterable of ``(window, target, fillna, new_type)`` tuples: how
    many rows to shift by, the column to shift, the value used where no earlier row
    exists, and the dtype of the resulting column.
    """
    def __init__(self, params):
        self.params = params

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Add a ``lag_item_shop<window>`` column per spec to ``X`` (in place) and return it.

        The shift follows the row order inside each ``(item_id, shop_id)`` group.
        """
        group_keys = ['item_id', 'shop_id']
        for lag, source_col, missing_value, dtype in self.params:
            shifted = X.groupby(group_keys)[source_col].shift(lag)
            X[f'lag_item_shop{lag}'] = shifted.fillna(missing_value).astype(dtype)
        return X

In [None]:
class DifFeatureCreator():
    """Add columns holding the difference of two existing columns.

    ``params`` is an iterable of ``(name, x, y)`` tuples; each one creates column
    ``name`` as ``X[x] - X[y]``.
    """
    def __init__(self, params):
        self.params = params

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Write the difference columns into ``X`` (in place) and return it.

        Specs are applied in order, so a later spec may use a column that an earlier
        one created.
        """
        for new_col, minuend, subtrahend in self.params:
            X[new_col] = X[minuend].sub(X[subtrahend])
        return X

In [None]:
class CatTargetEncoder():
    """Expanding-window target encoding, refitted month by month.

    For every month in ``months[1:]`` a fresh ``TargetEncoder`` is fitted on all rows
    of strictly earlier months and applied to the rows of that month, so an encoded
    value never uses targets from its own or later months. Rows of months that are
    not encoded are left missing in the new column.

    Parameters
    ----------
    months : sequence
        Month numbers (``date_block_num`` values). The first one is skipped because
        there is no earlier data to fit on.
    params : iterable of (col, encoded_name, target)
        ``col`` is a column name or a list/tuple of column names, ``encoded_name`` the
        output column, ``target`` the column being averaged. A list of columns is
        encoded as ONE combined category (values joined with '_'). Previously each
        column was encoded separately and only the first column's encoding was kept,
        so e.g. a ``['city', 'cat']`` spec silently produced a plain city encoding.
    """
    def __init__(self, months, params):
        self.months = months[1:]
        self.params = params

    def fit(self, X, y=None):
        """Nothing is learned up front; returns ``self``."""
        return self

    def transform(self, data, y=None):
        """Add the encoded columns to ``data`` (in place) and return it."""
        for col, encoded_name, target in self.params:
            if isinstance(col, (list, tuple)):
                # Build a single interaction key out of the listed columns.
                pieces = [data[c].astype(str) for c in col]
                key = pieces[0]
                for piece in pieces[1:]:
                    key = key + '_' + piece
                key = key.rename('_'.join(col))
            else:
                key = data[col].astype(str)
            for month in self.months:
                print(month)
                past = data.date_block_num < month
                current = data.date_block_num == month
                encoder = TargetEncoder()
                encoder.fit(key[past], data.loc[past, target])
                res = encoder.transform(key[current])
                # Assign by position: the rows come back in the order they went in.
                data.loc[current, encoded_name] = res.iloc[:, 0].to_numpy()
        return data

In [None]:
class CatLabelEncoder():
    """Replace categorical columns by compact integer codes.

    Parameters
    ----------
    col_names : iterable of str
        Columns to encode. A new ``LabelEncoder`` is fitted for each one.
    """
    def __init__(self, col_names):
        self.col_names = col_names

    def fit(self, X, y=None):
        """Nothing is learned up front; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Overwrite each listed column of ``X`` (in place) with its codes; return ``X``.

        Codes are stored as ``int8`` whenever they fit, which is the dtype this class
        always used. A column with more than 128 distinct values gets the next wider
        signed integer type instead of silently wrapping around.
        """
        for col in self.col_names:
            encoder = LabelEncoder()
            codes = encoder.fit_transform(X[col])
            largest_code = len(encoder.classes_) - 1
            dtype = next(
                candidate
                for candidate in (np.int8, np.int16, np.int32, np.int64)
                if largest_code <= np.iinfo(candidate).max
            )
            X[col] = codes.astype(dtype)
        return X



In [None]:
# Rolling-feature specs for RollingFeatureCreator: (group column, window, aggregation).
# NOTE(review): rol_lst1, rol_lst2 and rol_lst4 are not referenced by any cell visible
# in this notebook; only rol_lst3 is passed to a pipeline - confirm they are still needed.
rol_lst1 = [ 
    ('city', 2, 'mean'),
    ('city', 6, 'mean'),
    ('sub_cat', 2, 'sum'),
    ('sub_cat', 4, 'mean'),
    ('sub_cat', 4, 'std'),
    ('sub_cat', 6, 'mean'),
    ('item_id', 4, 'median'),
    ('cat', 6, 'mean'),
    ('cat', 2, 'sum'),
    ('cat', 4, 'mean'),
    ('cat', 4, 'std'),
    ('shop_id', 2, 'mean'),
    ('shop_id', 4, 'mean'),
    ('shop_id', 6, 'median')
    ]
rol_lst2 = [
    ('item_id', 12, 'mean'),
    ('shop_id', 12, 'mean'),
    ('cat', 12, 'mean'),
    ('sub_cat', 12, 'mean')
]
# Produces rol_avg_item2, rol_max_item4, rol_avg_item6 and rol_std_item6.
rol_lst3 = [
    ('item_id', 2, 'mean'),
    ('item_id', 4, 'max'),
    ('item_id', 6, 'mean'),
    ('item_id', 6, 'std'),
]
rol_lst4 = [
    ('sub_cat', 5, 'min'),
    ('cat', 5, 'min'),
    ('city', 6, 'std'),
    ('shop_id', 5, 'min'),
    ('sub_cat', 5, 'median'),
    ('cat', 5, 'median'),
    ('cat', 1, 'sum'),
    ('sub_cat', 1, 'sum'),
    ('shop_id', 6, 'std'),
    ('item_id', 1, 'sum'),
    ('shop_id', 1, 'sum'),
    ('item_id', 8, 'mean'),
    ('shop_id', 8, 'mean'),
    ('cat', 8, 'mean'),
    ('sub_cat', 8, 'mean')
]
# Lag specs for LagFeatureCreator: (shift, source column, fill value, dtype).
lag_lst = [
    (1, 'item_cnt_month', 0, np.int16),
    (2, 'item_cnt_month', 0, np.int16),
    (3, 'item_cnt_month', 0, np.int16), 
    (4, 'item_cnt_month', 0, np.int16), 
    (12, 'item_cnt_month', 0, np.int16)]
# Difference specs for DifFeatureCreator: (new column, minuend, subtrahend).
# The inputs are columns produced from lag_lst and rol_lst3, so those must run first.
dif_lst = [
    ('dif2_1', 'lag_item_shop2', 'lag_item_shop1'),
    ('dif4_1', 'lag_item_shop4', 'lag_item_shop1'),
    ('dif12_1', 'lag_item_shop12', 'lag_item_shop1'),
    ('difitem6_2', 'rol_avg_item6', 'rol_avg_item2'),
]
# Columns replaced by integer codes in CatLabelEncoder.
cat_label_lst = ['cat', 'sub_cat', 'city']
# Target-encoding specs for CatTargetEncoder: (column(s), new column, target column).
# This first list is intentionally empty as written.
cat_target_lst = [
   
]
cat_target_lst1 = [
    ('city', 'city_target', 'item_cnt_month'),
    ('cat', 'cat_target', 'item_cnt_month'),
    ('sub_cat', 'sub_cat_target', 'item_cnt_month'),
    ('shop_id', 'shop_target', 'item_cnt_month'),
    ('item_id', 'item_target', 'item_cnt_month'),
]
# Specs whose first element is a list of columns rather than a single column name.
cat_target_lst2 = [
    (['city', 'cat'], 'city_cat_target', 'item_cnt_month'),
    (['city', 'sub_cat'], 'city_subcat_target', 'item_cnt_month'),
    (['item_id','shop_id'], 'item_shop_target', 'item_cnt_month'),
]

In [None]:
# Step order matters: the 'dif' step subtracts columns (lag_item_shop*, rol_avg_item*)
# that only exist after the 'lag' and 'rol' steps have run.
feature_creating_pipeline = Pipeline(steps=[
                                            ('lag',LagFeatureCreator(params=lag_lst)),
                                            ('rol', RollingFeatureCreator(params=rol_lst3)),
                                            ('dif',DifFeatureCreator(params=dif_lst))])
feature_creating_pipeline.fit(data)
data = feature_creating_pipeline.transform(data)

In [None]:

# `months` was not defined anywhere in the notebook, so this cell failed with a
# NameError on a fresh kernel. Derive it from the data: the sorted month numbers
# present. CatTargetEncoder skips the first one itself.
# NOTE(review): confirm this matches the month list originally intended.
months = sorted(data['date_block_num'].unique())
feature_creating_pipeline = Pipeline(steps=[
    ('target', CatTargetEncoder(months, cat_target_lst2))
])
feature_creating_pipeline.fit(data)
data = feature_creating_pipeline.transform(data)

In [None]:
# CatTargetEncoder takes (months, params). The previous call passed only the spec
# list, which raises a TypeError for the missing `params` argument.
cat_pipeline = Pipeline(steps=[
    ('target', CatTargetEncoder(sorted(data['date_block_num'].unique()), cat_target_lst)),
    ('label', CatLabelEncoder(cat_label_lst))
])
cat_pipeline.fit(data)
data = cat_pipeline.transform(data)

In [None]:
cols = ['item_id', 'shop_id', 'item_cnt_month']

# Other features

In [None]:
data['month'] = data['date_block_num'] % 12 + 1

In [None]:
# Total item_cnt_month per date_block_num, shifted down one row so that each month
# carries the preceding month's total; the first month has none and gets 0.
temp = data.groupby(['date_block_num'])[['item_cnt_month']].sum().shift(1).fillna(0).reset_index([0])
temp.columns = ['date_block_num', 'lag_month1']
data = merge(data, temp, on='date_block_num', how='left', check_left=True)

In [None]:
# Number of distinct shops per (city, date_block_num) that have at least one row
# with item_cnt_month > 0 in that same month.
# NOTE(review): this is computed from the current month's counts, not lagged ones.
# A (city, month) pair with no positive counts gets no row here, so the following
# merge fills it with 0 - presumably that includes the month being predicted;
# confirm this is intended.
temp = data[data.item_cnt_month > 0].groupby(['city', 'date_block_num'])[['shop_id']].nunique()
temp.columns = ['shop_cnt']
temp = temp.reset_index([0,1])

In [None]:
data = merge(data, temp, on=['city', 'date_block_num'], how='left', check_left=True, fillna=0)

In [None]:
data.lag_month1 = data.lag_month1.astype(np.int32)
data.shop_cnt = data.shop_cnt.astype(np.int8)

In [None]:
# The population table is never loaded or merged (both steps are commented out), so
# `data` has no 'pop' column and the division raised a KeyError. Only build the
# feature when the population column is actually present.
if 'pop' in data.columns:
    data['shop_per_pop'] = (data['shop_cnt'] / data['pop']).astype(np.float32)

In [None]:
temp[temp.city == 'Москва']

Unnamed: 0,city,date_block_num,shop_cnt
410,Москва,0,12
411,Москва,1,12
412,Москва,2,12
413,Москва,3,12
414,Москва,4,11
415,Москва,5,11
416,Москва,6,11
417,Москва,7,10
418,Москва,8,10
419,Москва,9,10


In [None]:
data.sample()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,rol_std_item_shop3,rol_std_item_shop6,month,lag_month1
3933048,11844,48,2,0,0.0,0.0,3,128090.0


# Save to .parquet

In [None]:
# Rows of the month to be predicted; `test` is written again by a later cell.
test = data[data.date_block_num == PREDICT_MONTH]
test.to_parquet('/content/drive/MyDrive/data/short_test.parquet')
#data[(data.date_block_num > 19) & (data.date_block_num < PREDICT_MONTH)].to_parquet('/content/drive/MyDrive/data/train2.parquet')

In [None]:
data[(data.date_block_num > 10)& (data.date_block_num < PREDICT_MONTH)].to_parquet('/content/drive/MyDrive/data/short_train.parquet')

In [None]:
# Training window: months 20 up to (not including) the prediction month. Uses the
# PREDICT_MONTH constant and the `data_path` setting from the top of the notebook
# instead of repeating the literal 34 and the same path string.
data[(data.date_block_num > 19) & (data.date_block_num < PREDICT_MONTH)].to_parquet(data_path)

In [None]:
# Write to the `test_path` setting from the top of the notebook (same file as the
# literal that was repeated here).
test.to_parquet(test_path)