# ws 07

- Download the Excel data (`Online Retail.xlsx`) from: https://archive.ics.uci.edu/ml/machine-learning-databases/00352/

In [None]:
!pip install tqdm
!pip install openpyxl

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tqdm import tqdm 
from gensim.models import Word2Vec 
from gensim.models.callbacks import CallbackAny2Vec   

In [None]:
cols = 'a:d, g' #['a:d,']

df = pd.read_excel('Online Retail.xlsx', usecols=cols, 
                   dtype={'CustomerID':str,'InvoiceNo':str})  

In [None]:
df = pd.read_excel('Online Retail.xlsx')

# Explore

In [None]:
# First rows of the frame.
df.head()

In [None]:
# Five randomly drawn rows; no random_state is passed, so the selection
# differs between runs.
df.sample(5)

In [None]:
# Last rows of the frame.
df.tail()

In [None]:
# Only the last expression of a cell is rendered automatically, so the first
# result is shown explicitly with display() (available in Jupyter/IPython kernels).
display(df[df.Quantity < 0].count())

# Number of rows for every distinct Quantity value.
df.groupby('Quantity').size()

In [None]:
# Rows with Quantity below -100, most negative first.
df[df.Quantity < -100].sort_values('Quantity', ascending=True).head()

In [None]:
# Column dtypes as inferred on load.
df.dtypes

In [None]:
# Summary statistics, rounded to two decimals.
df.describe().round(2)

In [None]:
# (rows, columns)
df.shape

In [None]:
# Ten most frequent CustomerID values.
df.CustomerID.value_counts().sort_values(ascending=False).head(10)

In [None]:
# Non-null InvoiceNo entries per CustomerID, ten largest.
df.groupby('CustomerID').count().sort_values('InvoiceNo',
                                             ascending=False).InvoiceNo.head(10)

In [None]:
# Ten invoices with the most rows.
df.InvoiceNo.value_counts().sort_values(ascending=False).head(10)

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Rows per invoice.
df.groupby('InvoiceNo').size()

In [None]:
# Rows per customer.
df.groupby('CustomerID').size()

# Preprocess

In [None]:
df[df.Quantity < 0].count() 


In [None]:
df = df[df.Quantity > 0] 
df.shape

In [None]:
df[df.Quantity < 1].count()

In [None]:
df.isnull().sum()

In [None]:
# Cast the identifier columns to str in a single call. astype() returns a new
# frame, so nothing is assigned into a filtered slice of the original data
# (which is what triggers SettingWithCopyWarning).
# NOTE(review): missing CustomerID values presumably become the literal
# string 'nan' after this cast — confirm before using CustomerID downstream.
df = df.astype({'CustomerID': str, 'StockCode': str, 'InvoiceNo': str})

In [None]:
# Confirm the dtypes after the casts.
df.dtypes

In [None]:
df.head()

Check invoices: find the invoices that contain a single line item and drop them.

In [None]:
# Non-null Quantity entries (i.e. line items) per invoice, smallest first.
df_invo1 = df.groupby('InvoiceNo')['Quantity'].count().sort_values()
df_invo1

In [None]:
# Two-column table: invoice number and its number of line items.
res = pd.DataFrame({'InvoiceNo': df_invo1.index, 'Count': df_invo1.values})

# Invoices with at most one line item.
is_single = res['Count'] <= 1
invoices_small = res.loc[is_single, 'InvoiceNo'].tolist()
invoices_small[:10]

In [None]:
# Drop the invoices listed in invoices_small. isin() performs a hashed
# membership test instead of scanning the Python list once per row.
df = df[~df.InvoiceNo.isin(invoices_small)]

In [None]:
# (rows, columns) after removing the small invoices.
df.shape

In [None]:
df.dtypes

In [None]:
# Rows per invoice, smallest first.
df.groupby('InvoiceNo').size().sort_values()

In [None]:
df.head()

Prepare the dataset: each InvoiceNo is treated as one basket of StockCodes.

In [None]:
# Distinct invoice numbers. Despite the variable name, these are invoices
# (one basket each), not customers.
customers = df.InvoiceNo.unique().tolist()
print(len(customers))
customers[:6]

In [None]:
# Positional split: the first 90% of the invoice list goes to training, the
# remainder to validation. The list is not shuffled beforehand.
n_train = round(0.9 * len(customers))
customers_train = customers[:n_train]

# One membership mask serves both subsets.
in_train = df['InvoiceNo'].isin(customers_train)
train_df = df[in_train]
validation_df = df[~in_train]

In [None]:
# First eight invoice numbers assigned to the training split.
customers_train[:8]

In [None]:
train_df.head()

In [None]:
# Build one basket (list of StockCodes in row order) per training invoice.
# A single groupby pass replaces filtering the whole frame once per invoice,
# which scaled with (number of invoices) x (number of rows).
baskets = (
    train_df.groupby('InvoiceNo', sort=False)['StockCode']
    .apply(list)
    .to_dict()
)

# Keep the order of customers_train; an invoice without rows yields an empty
# list, as the per-invoice filter did.
purchases_train = [baskets.get(invoice, []) for invoice in customers_train]

Train

In [None]:
from IPython.display import clear_output

class MonitorCallback(CallbackAny2Vec):
    """Training callback that reports how many epochs have finished.

    The cell output is cleared before each message, so only the most recent
    progress line stays visible.
    """

    def __init__(self):
        clear_output(wait=True)
        print('Start ...')
        # Number of epochs completed so far.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Nothing is reported when an epoch starts."""

    def on_epoch_end(self, model):
        """Replace the cell output with the finished/total epoch counter."""
        self.epoch += 1
        clear_output(wait=True)
        print('Epoch #{}/{} '.format(self.epoch, model.epochs))

# Progress reporter handed to the trainer.
monitor = MonitorCallback()

# Every basket in purchases_train acts as one "sentence" of StockCodes.
model = Word2Vec(
    purchases_train,
    vector_size=50,
    window=5,
    sg=1,
    workers=4,
    epochs=40,
    callbacks=[monitor],
)


In [None]:
# Persist the trained model next to the notebook.
model.save("amazon_store.model")


In [None]:
print(model)

In [None]:
# First six keys (StockCodes) of the vocabulary.
model.wv.index_to_key[:6]

In [None]:
# Embedding vector of a single StockCode.
model.wv['22423']

In [None]:
# Number of baskets used for training.
len(purchases_train)

In [None]:

# Five nearest neighbours of StockCode '22613' in the embedding space.
model.wv.most_similar('22613', topn=5) 

In [None]:
# Source for the description lookup: the StockCode and Description columns
# of the training rows.
products = train_df[["StockCode", "Description"]]

# De-duplication is currently disabled:
# products.drop_duplicates(inplace=True, subset='StockCode', keep="last")

In [None]:
products[:5]

In [None]:
# StockCode -> list of its distinct descriptions, in order of first
# appearance. Identical (StockCode, Description) pairs are dropped first, so a
# lookup returns a short list instead of one entry per sold line; element [0]
# is still the first description seen for that code.
products_dict = (
    products.drop_duplicates(subset=['StockCode', 'Description'])
    .groupby('StockCode')['Description']
    .apply(list)
    .to_dict()
)

products_dict['22613']

In [None]:
sku_now = '22613' 

similars = model.wv.most_similar(sku_now, topn=7) # 

print('Shopping:', sku_now, products_dict[sku_now][0])
print('--------------------------------------------')

for i in similars:
    if i[1] > 0.6:
        print('{:6} {:36} {:.3f}'.format(i[0], products_dict[i[0]][0], i[1]))

In [None]:
sks = ['84997B','22630']
similars = model.wv.most_similar(sks, topn=5) #  

print('Shopping:', sks, products_dict[sks[0]],
      products_dict[sks[1]] )
print('--------------------------------------------')

for i in similars:
    if i[1] > 0.6:
        print('{:6} {:36} {:.3f}'.format(i[0], products_dict[i[0]][0], i[1]))


In [None]:
# A bare expression is rendered only when it is the last line of a cell, so
# the first lookup is shown explicitly with display().
display(products_dict['22534'])
products_dict['23355']

In [None]:
# Looking up a key that is absent from the vocabulary.
# NOTE(review): 'man' does not look like a StockCode, so this presumably
# always takes the "missing" branch — confirm whether the cell is still needed.
if 'man' in model.wv.key_to_index:
    vec_cameroon = model.wv['man']
else:
    print("This word does not appear in this model")

## Plot

In [None]:
# Vocabulary keys as a NumPy array; the plotting cell pairs labels[i] with
# row i of the projected vectors.
labels = np.asarray(model.wv.index_to_key) 
labels.shape

In [None]:
labels[:10]

In [None]:
# Embedding matrix of the trained model.
vectors = np.asarray(model.wv.vectors)
vectors.shape

In [None]:
from sklearn.decomposition import PCA

# Stock codes to highlight and annotate in the projection.
lst_chk = ['22613', '20780', '23355', '71053', '90191', '82482', '15036', '15044C']

# Project the embedding vectors onto two principal components.
pca = PCA(n_components=2)
result = pca.fit_transform(vectors)

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(6.5, 4.5))
ax.scatter(result[:, 0], result[:, 1], s=10, alpha=.8, c='c')

# Boolean mask over `labels` selecting the highlighted products in one step.
highlight = np.isin(labels, lst_chk)
ax.scatter(result[highlight, 0], result[highlight, 1], s=40, c='b')
for label, (x, y) in zip(labels[highlight], result[highlight]):
    # Small offset so the text does not sit on top of the marker.
    ax.annotate(label, xy=(x + .01, y + .01))

# A title and axis labels let the figure stand on its own.
ax.set(title='Product embeddings (PCA, 2 components)',
       xlabel='PC 1', ylabel='PC 2')
fig.tight_layout()
plt.show()