In [1]:
import pandas as pd

# Load the train and test tables from CSV files under data/.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [2]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [3]:
# Total sales of item 1 per store: reshape to a date x store table, then sum each store column.
item1_rows = train.loc[train['item'] == 1]
item1_by_store = item1_rows.pivot(index='date', columns='store', values='sales')
item1_by_store.sum(axis=0).values

array([36468, 51445, 45778, 41886, 30566, 30527, 27681, 49272, 42593,
       45168])

In [None]:
%%time
import numpy as np
import datetime

# Window configuration. max_date is deliberately close to min_date here so that only one
# window is produced; the commented value is presumably the end of the full data range.
min_date, max_date = datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 8)#datetime.datetime(2017, 12, 31)
days_len = 7
items = sorted(train['item'].drop_duplicates())

# item_sales_dict maps a window start ('YYYY-MM-DD') to a dict holding, per item,
#   'input_<item>'  : store x day matrix of sales for the days in [start, end)
#   'output_<item>' : store x 1 column of sales on the end date
# plus 'total': the input sales summed over stores, one row per item.
item_sales_dict = dict()
start_date = min_date
while True:
    end_date = start_date + datetime.timedelta(days=days_len)
    if end_date > max_date:
        break

    part = dict()
    start_str, end_str = start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')
    # Dates are compared as strings; assumes train['date'] holds ISO 'YYYY-MM-DD' text.
    # A single combined mask avoids chaining .loc with masks built on the unfiltered frame.
    in_window = (train['date'] >= start_str) & (train['date'] < end_str)
    input_parts = train.loc[in_window].sort_values(by=['date'])
    output_parts = train.loc[train['date'] == end_str]
    for i, item in enumerate(items):
        input_pivot = input_parts.loc[input_parts['item'] == item].pivot(index='store', columns='date', values='sales')
        output_pivot = output_parts.loc[output_parts['item'] == item].pivot(index='store', columns='date', values='sales')
        part[f'input_{item}'] = input_pivot.values
        part[f'output_{item}'] = output_pivot.values
        if i == 0:
            part['total'] = input_pivot.sum(0).values
            print(part['total'])
        else:
            part['total'] = np.vstack([part['total'], input_pivot.sum(0).values])
    item_sales_dict[start_str] = part
    print(start_str, item_sales_dict[start_str])

    # Slide the window forward by one day (the same stride the per-store export loop in
    # this notebook uses). Without this step the break condition never changes and the
    # loop re-processes the first window forever.
    start_date = start_date + datetime.timedelta(days=1)

[133  99 127 145 149 149  92]
2013-01-01 {'input_1': array([[13, 11, 14, 13, 10, 12, 10],
       [12, 16, 16, 20, 16, 18, 12],
       [19,  8, 10, 15, 22, 22,  7],
       [10, 12,  8, 15, 19, 14, 12],
       [11,  9, 12,  8, 13, 15,  6],
       [20,  6, 11,  7, 12,  9, 11],
       [ 7,  4,  8,  7,  8, 10,  6],
       [16, 10, 12, 21, 15, 27,  9],
       [11,  9, 20, 22, 22, 11,  7],
       [14, 14, 16, 17, 12, 11, 12]]), 'output_1': array([[ 9],
       [11],
       [15],
       [16],
       [11],
       [ 9],
       [ 6],
       [10],
       [13],
       [19]]), 'total': array([[133,  99, 127, 145, 149, 149,  92],
       [327, 279, 304, 359, 377, 421, 253],
       [172, 213, 193, 218, 217, 246, 185],
       [102, 110, 112, 138, 144, 128, 100],
       [ 83,  81,  96, 108, 100, 125,  71],
       [288, 310, 309, 343, 343, 375, 246],
       [278, 291, 324, 335, 377, 342, 260],
       [416, 386, 398, 430, 481, 521, 346],
       [245, 225, 265, 313, 319, 326, 241],
       [378, 397, 388, 462

In [None]:
def _items_of(frame, store):
    """Return the distinct items `frame` lists for `store`, comma-separated in first-seen order."""
    store_items = frame.loc[frame["store"] == store, "item"].drop_duplicates()
    return ", ".join(store_items.astype(str))


# For every store seen in train, print the items it has in train and in test.
for store in train['store'].drop_duplicates():
    print(f'train store: {store}, items: {_items_of(train, store)}')
    print(f'test store: {store}, items: {_items_of(test, store)}')

In [None]:
import numpy as np
# Bucket configuration: values in [min_sales, max_sales) are cut into buckets of `range_`
# consecutive values; everything from max_sales up to 1000000 shares one final bucket.
min_sales, max_sales = 0, 300
range_ = 10
base = ord('a')

# Keys are inclusive (low, high) bounds; values are single characters counted up from 'a'.
# NOTE(review): these settings yield 31 codes, so they run past 'z' into '{', '|', '}', '~'
# and finally chr(127) — confirm that downstream consumers accept those characters.
bucket_starts = np.arange(min_sales, max_sales, range_)
sales_blocks = {
    (low, low + range_ - 1): chr(base + idx)
    for idx, low in enumerate(bucket_starts)
}
sales_blocks[(max_sales, 1000000)] = chr(base + len(sales_blocks))
sales_blocks

In [None]:
def convert_code(sales_blocks, x):
    """Return the code of the first bucket in `sales_blocks` whose bounds contain `x`.

    Args:
        sales_blocks: mapping of inclusive ``(low, high)`` bounds to a code.
        x: value to classify.

    Returns:
        The code of the first matching bucket in iteration order, or ``None``
        when no bucket contains `x`.
    """
    matching_codes = (
        code for (low, high), code in sales_blocks.items() if low <= x <= high
    )
    return next(matching_codes, None)

# Encode every sales value with its bucket code and keep it next to the raw value.
train['sales_processed'] = [convert_code(sales_blocks, sale) for sale in train['sales']]
train.head()


In [None]:
# Write the current `train` table to CSV, omitting the index column.
train.to_csv('data/train_converted.csv', index=False)

In [None]:
%%time

import datetime

# Sliding-window configuration: each window covers `days_len` consecutive days and
# consecutive windows start one day apart.
min_date, max_date = datetime.datetime(2013, 1, 1), datetime.datetime(2017, 12, 31)
days_len = 7
# NOTE: overwrites train['sales'] in place with strings (required by the '>'.join below);
# any cell that compares sales numerically has to run before this one.
train['sales'] = train['sales'].astype(str)

for store in train['store'].drop_duplicates():
    start_time = datetime.datetime.now()
    dates, inputs_converted, inputs_org = [], [], []

    # Filter and sort this store's rows once; the window loop only slices them by date.
    # The sort is stable, so rows inside each window keep the order they would have
    # if the window were filtered from `train` and sorted on its own.
    store_rows = train.loc[train['store'] == store].sort_values(by=['date', 'item'])

    start_date = min_date
    while True:
        end_date = start_date + datetime.timedelta(days=days_len)
        if end_date > max_date:
            break

        start_str, end_str = start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')
        # String comparison; assumes 'date' holds ISO 'YYYY-MM-DD' text — TODO confirm.
        in_window = (store_rows['date'] >= start_str) & (store_rows['date'] < end_str)
        input_parts = store_rows.loc[in_window]
        for item in input_parts['item'].drop_duplicates():
            item_rows = input_parts.loc[input_parts['item'] == item]
            # Shared prefix: month-day of the window start, store and item.
            prefix = f'date{start_str[5:]}:store{store}:item{item}-'
            # Converted sequence concatenates the bucket codes; original joins raw sales with '>'.
            inputC = prefix + ''.join(item_rows['sales_processed'].values)
            inputO = prefix + '>'.join(item_rows['sales'].values)
            dates.append(start_str)
            inputs_converted.append(inputC)
            inputs_org.append(inputO)
        start_date = start_date + datetime.timedelta(days=1)

    # One CSV per store with one row per (window start, item).
    records = pd.DataFrame({'date': dates, 'seq_org': inputs_org, 'seq_cnv': inputs_converted})
    records.to_csv(f'data/store{store}_{days_len}days_converted.csv', index=False)
    print(f'dur: {datetime.datetime.now() - start_time}')

In [None]:
n_store, std_date = 10, '2016-01-01'
SPLIT_SEED = 42  # fixed so the validation sample is the same on every run

# Per store file: rows dated before std_date go to train, the rest to test.
# Dates are compared as strings; assumes ISO 'YYYY-MM-DD' text in the 'date' column.
train_parts, test_parts = [], []
for i in range(n_store):
    whole = pd.read_csv(f'data/store{i + 1}_{days_len}days_converted.csv')
    train_parts.append(whole.loc[whole['date'] < std_date])
    test_parts.append(whole.loc[whole['date'] >= std_date])
train = pd.concat(train_parts).reset_index(drop=True)
test = pd.concat(test_parts).reset_index(drop=True)

# Hold out a random 10% of the training rows (sampled without replacement) for validation.
rng = np.random.default_rng(SPLIT_SEED)
val_idx = rng.choice(len(train), size=round(len(train) * 0.1), replace=False)
is_val = np.zeros(len(train), dtype=bool)
is_val[val_idx] = True
train_idx = np.flatnonzero(~is_val)
# Boolean masks keep both subsets in their original row order.
val = train.iloc[is_val]
train = train.iloc[~is_val]

train.to_csv(f'train_bert_data/train_{days_len}days.csv', index=False)
val.to_csv(f'train_bert_data/val_{days_len}days.csv', index=False)
test.to_csv(f'train_bert_data/test_{days_len}days.csv', index=False)

In [None]:
import os
from transformers import AutoModelForMaskedLM, AutoTokenizer, BertForMaskedLM, pipeline
from skt.vault_utils import get_secrets

# Set the HTTP(S) proxy environment variables from the secret named "proxy".
proxy = get_secrets("proxy")["proxy"]
os.environ['HTTP_PROXY'] = proxy
os.environ['HTTPS_PROXY'] = proxy

# 'distilbert-base-uncased' is a DistilBERT checkpoint, so let the Auto class choose the
# matching architecture. Forcing BertForMaskedLM onto it is an architecture mismatch and
# the pretrained weights are not loaded into the model.
bert = AutoModelForMaskedLM.from_pretrained('distilbert-base-uncased').cuda()
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
fill_mask = pipeline(
    "fill-mask",
    model=bert,
    tokenizer=tokenizer
)

In [None]:
# Show the device the model currently reports.
bert.device

In [None]:
# Move the model to the CPU, then show the device it reports.
bert.cpu()
bert.device

In [None]:
# Show the device the model currently reports.
bert.device

In [None]:
import torch
# Tokenize one masked sentence and run a forward pass without tracking gradients.
# The encoded batch is moved to the model's device so the call works whether `bert`
# currently lives on the GPU or the CPU.
inputs = tokenizer("Hello, my [MASK] is cute", return_tensors="pt").to(bert.device)
print(inputs.input_ids)
with torch.no_grad():
    outputs = bert(**inputs)
# Masked-LM models return their vocabulary scores in `.logits`; `prediction_logits`
# belongs to the pre-training output class and is not available here.
outputs.logits

In [None]:
# `torch` is a module, not a callable, and a Python string has no `.cuda()`:
# text has to be tokenized first, and it is the resulting tensors that move to a device.
text = "The body was recovered and sent to Cox's Bazar [MASK] morgue for autopsy"
encoded_text = tokenizer(text, return_tensors="pt").to(bert.device)
encoded_text.input_ids
# The pipeline takes the raw string directly: fill_mask(text)

In [None]:
# Take the first prediction's filled-in sentence and drop 6 characters from each end.
# NOTE(review): presumably this strips "[CLS] " and " [SEP]" markers from 'sequence' —
# confirm that the installed pipeline version actually includes them.
fill_mask("Hello, my [MASK] is cute")[0]['sequence'][6:-6]

In [None]:
# Preview the first rows of the validation split.
val.head()

In [None]:
# Tokenize every original-value sequence of the validation split in one call.
val_sequences = val['seq_org'].tolist()
input_ids = tokenizer(val_sequences).input_ids

In [None]:
# Number of token ids produced for the first sequence.
len(input_ids[0])

In [11]:
import os
from skt.vault_utils import get_secrets
from transformers import RobertaTokenizer, RobertaModel

# Point both proxy environment variables at the value stored under the "proxy" secret.
proxy = get_secrets("proxy")["proxy"]
for proxy_var in ('HTTP_PROXY', 'HTTPS_PROXY'):
    os.environ[proxy_var] = proxy



In [12]:
from transformers import ElectraTokenizer, ElectraModel, AutoModelForMaskedLM

# Load the bare encoder, the masked-LM variant and the tokenizer from the same checkpoint.
KOELECTRA_GENERATOR = "monologg/koelectra-base-v3-generator"
model = ElectraModel.from_pretrained(KOELECTRA_GENERATOR)
lmmodel = AutoModelForMaskedLM.from_pretrained(KOELECTRA_GENERATOR)
tokenizer = ElectraTokenizer.from_pretrained(KOELECTRA_GENERATOR)

In [17]:
import torch

# Encode one Korean sentence and feed the same single-sentence batch to both models.
input_ids = tokenizer("나는 식당에서 밥을 먹었다.").input_ids
sentence_batch = torch.tensor([input_ids])
md_result = model(sentence_batch)
lmmd_result = lmmodel(sentence_batch)

In [22]:
# Shape of the encoder's last hidden state for the encoded sentence.
md_result.last_hidden_state.shape

torch.Size([1, 13, 256])

In [33]:
# Display the last hidden state returned by `model`.
md_result.last_hidden_state

tensor([[[ 0.0886, -0.1974, -0.0859,  ...,  0.2817,  0.2463, -0.1742],
         [-0.2716, -0.8261,  0.3865,  ...,  0.3734,  0.3926, -0.0354],
         [-0.1466, -0.2909,  0.4556,  ..., -0.0473, -0.0970,  0.0106],
         ...,
         [ 0.7419, -0.2018,  0.0279,  ...,  0.1788, -0.0729, -0.1511],
         [-0.2403,  0.1839, -0.0043,  ..., -0.1352,  0.0159,  0.0375],
         [ 0.0886, -0.1975, -0.0859,  ...,  0.2817,  0.2464, -0.1742]]],
       grad_fn=<NativeLayerNormBackward0>)

In [29]:
# Call only the `electra` submodule of the masked-LM model on the same token ids,
# so that its hidden states can be inspected directly.
lmmd_result = lmmodel.electra(torch.tensor([input_ids]))

In [32]:
# Display the last hidden state returned by `lmmodel.electra`.
lmmd_result.last_hidden_state

tensor([[[ 0.0886, -0.1974, -0.0859,  ...,  0.2817,  0.2463, -0.1742],
         [-0.2716, -0.8261,  0.3865,  ...,  0.3734,  0.3926, -0.0354],
         [-0.1466, -0.2909,  0.4556,  ..., -0.0473, -0.0970,  0.0106],
         ...,
         [ 0.7419, -0.2018,  0.0279,  ...,  0.1788, -0.0729, -0.1511],
         [-0.2403,  0.1839, -0.0043,  ..., -0.1352,  0.0159,  0.0375],
         [ 0.0886, -0.1975, -0.0859,  ...,  0.2817,  0.2464, -0.1742]]],
       grad_fn=<NativeLayerNormBackward0>)

In [10]:
from transformers import pipeline

# Build a fill-mask pipeline straight from the checkpoint name, then query it with a
# sentence whose blank is filled with the tokenizer's own mask token.
KOELECTRA_GENERATOR = "monologg/koelectra-base-v3-generator"
fill_mask = pipeline("fill-mask", model=KOELECTRA_GENERATOR, tokenizer=KOELECTRA_GENERATOR)

masked_sentence = "나는 {} 밥을 먹었다.".format(fill_mask.tokenizer.mask_token)
fill_mask(masked_sentence)

[{'sequence': '[CLS] 나는 방금 밥을 먹었다. [SEP]',
  'score': 0.09365378320217133,
  'token': 13195,
  'token_str': '방금'},
 {'sequence': '[CLS] 나는 이렇게 밥을 먹었다. [SEP]',
  'score': 0.043304841965436935,
  'token': 15286,
  'token_str': '이렇게'},
 {'sequence': '[CLS] 나는 어제 밥을 먹었다. [SEP]',
  'score': 0.042306043207645416,
  'token': 8108,
  'token_str': '어제'},
 {'sequence': '[CLS] 나는 오늘 밥을 먹었다. [SEP]',
  'score': 0.03583071753382683,
  'token': 6451,
  'token_str': '오늘'},
 {'sequence': '[CLS] 나는 그날 밥을 먹었다. [SEP]',
  'score': 0.029736625030636787,
  'token': 8961,
  'token_str': '그날'}]

In [None]:
from transformers import pipeline
ROBERTA_CHECKPOINT = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
model = RobertaModel.from_pretrained(ROBERTA_CHECKPOINT)
# The pipeline is given only the checkpoint name, not the `model`/`tokenizer` objects above.
fill_mask = pipeline("fill-mask", model=ROBERTA_CHECKPOINT)

In [None]:
# Ask the current `fill_mask` pipeline to fill the `<mask>` placeholder.
fill_mask("The man worked as a <mask>.")

In [51]:
import torch.nn as nn

# Toy lookup table with 7 rows, each a 3-dimensional vector.
embedding = nn.Embedding(7, 3)
# Two sequences of four indices. Named `token_ids` rather than `input` so the builtin
# `input()` is not shadowed for the rest of the kernel session.
token_ids = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 5]])
# Each index is replaced by its row of the table, giving a (2, 4, 3) tensor.
a = embedding(token_ids)

In [50]:
# Shape of the embedded batch.
a.shape

torch.Size([2, 4, 3])

In [41]:
# Embedded vectors of the first sequence in the batch.
a[0]

tensor([[-0.1408, -0.3446,  1.1434],
        [-0.4501, -0.4836, -0.2267],
        [ 0.4594, -0.5137, -0.1693],
        [-1.5105,  0.4170, -0.4267]], grad_fn=<SelectBackward0>)