In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datetime, nltk, warnings
import matplotlib.cm as cm
import itertools
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import preprocessing, model_selection, metrics, feature_selection
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import neighbors, linear_model, svm, tree, ensemble
from wordcloud import WordCloud, STOPWORDS
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from IPython.display import display, HTML
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from mpl_toolkits.mplot3d import axes3d, Axes3D
import matplotlib.animation as animation
# Initialise plotly's offline notebook mode (imported from plotly.offline above).
init_notebook_mode(connected=True)
# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Matplotlib patch styling.
# NOTE(review): the order of the next three statements is kept exactly as-is;
# plt.style.use() may overwrite rcParams set before it -- confirm that
# "patch.force_edgecolor" still has the intended value afterwards.
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
%matplotlib inline

In [3]:
#__________________
# read the datafile
# Identifier columns are forced to strings. The invoice column is named
# 'InvoiceNo' (the previous dtype key 'InvoiceID' matched no column and was
# therefore ignored); it holds non-numeric values such as 'C573502', so reading
# it as str keeps the whole column a single, consistent type.
df = pd.read_csv('../input/ecommerce-data/data.csv', encoding="ISO-8859-1",
                 dtype={'CustomerID': str, 'InvoiceNo': str})
print('Dataframe dimensions:', df.shape)
#______
# parse the timestamp column so the .dt accessor can be used further down
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
#__________________
# show first lines
display(df[:5])

Dataframe dimensions: (541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [4]:
# Keep only rows that carry a customer identifier.
df = df.dropna(subset=['CustomerID'])
print('Dataframe dimensions:', df.shape)

# Report how many fully identical rows exist, then remove them.
duplicate_count = df.duplicated().sum()
print('Entrées dupliquées: {}'.format(duplicate_count))
df = df.drop_duplicates()

Dataframe dimensions: (406829, 8)
Entrées dupliquées: 5225


In [5]:
# Split the invoice timestamp into calendar components, one column each.
invoice_dt = df['InvoiceDate'].dt
for part in ('year', 'month', 'day'):
    df[part] = getattr(invoice_dt, part)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,year,month,day
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,2010,12,1
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,2010,12,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,2010,12,1
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,2010,12,1
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,2010,12,1


In [6]:
# Shift prices so the smallest one maps to 1 (log = 0), then take the natural log.
lowest_price = df['UnitPrice'].min()
df['Log_UnitPrice'] = np.log(df['UnitPrice'] - lowest_price + 1)
df.sample(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,year,month,day,Log_UnitPrice
213810,555558,21933,PINK VINTAGE PAISLEY PICNIC BAG,2,2011-06-05 14:06:00,1.65,14159,United Kingdom,2011,6,5,0.97456
75649,542607,22243,5 HOOK HANGER RED MAGIC TOADSTOOL,2,2011-01-30 13:48:00,1.65,13148,United Kingdom,2011,1,30,0.97456
462833,576014,22940,FELTCRAFT CHRISTMAS FAIRY,2,2011-11-13 14:29:00,4.25,17218,United Kingdom,2011,11,13,1.658228
381539,569866,20766,GARDEN PATH SKETCHBOOK,2,2011-10-06 14:50:00,3.75,12757,Portugal,2011,10,6,1.558145
131652,547580,22722,SET OF 6 SPICE TINS PANTRY DESIGN,4,2011-03-24 10:48:00,3.95,13488,United Kingdom,2011,3,24,1.599388


In [7]:
# Two-way market split: 'UK' for United Kingdom rows, 'Rest of the World' otherwise.
df['Geography'] = np.select([df.Country == 'United Kingdom'],
                            ['UK'],
                            default='Rest of the World')

# Price buckets: Low = [0, 100], Medium = (100, 200], High = (200, inf).
# include_lowest=True closes the first interval on the left, so a UnitPrice of
# exactly 0 is labelled "Low" instead of becoming NaN. The open-ended top edge
# replaces the hard-coded 38970 so a larger price can never fall outside the bins.
price_bin_edges = [0, 100, 200, np.inf]
df['Price Ranges'] = pd.cut(df['UnitPrice'],
                            bins=price_bin_edges,
                            labels=["Low", "Medium", "High"],
                            include_lowest=True)
df.sample(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,year,month,day,Log_UnitPrice,Geography,Price Ranges
275707,561006,23201,JUMBO BAG ALPHABET,10,2011-07-22 15:02:00,2.08,16161,United Kingdom,2011,7,22,1.12493,UK,Low
428874,C573502,23389,SPACEBOY MINI BACKPACK,-2,2011-10-31 12:02:00,4.15,14808,United Kingdom,2011,10,31,1.638997,UK,Low
17590,537769,22569,FELTCRAFT CUSHION BUTTERFLY,1,2010-12-08 12:17:00,3.75,15021,United Kingdom,2010,12,8,1.558145,UK,Low
388395,570427,21933,PINK VINTAGE PAISLEY PICNIC BAG,10,2011-10-10 13:56:00,1.65,17462,United Kingdom,2011,10,10,0.97456,UK,Low
198632,554058,22354,RETROSPOT PADDED SEAT CUSHION,2,2011-05-20 16:04:00,3.75,17757,United Kingdom,2011,5,20,1.558145,UK,Low


In [8]:
# Number of rows in the frame; used below as the embedding table size.
size = df['Description'].size

encoder = LabelEncoder()
scaler = MinMaxScaler()

# Integer code per product description (model input).
aut = encoder.fit_transform(df['Description'])
# Unit price rescaled by MinMaxScaler (model target), shape (n_rows, 1).
rat = scaler.fit_transform(df[['UnitPrice']])

In [9]:
class Latent_Embed(nn.Module):
    """Embedding lookup followed by a small two-layer regression head.

    Args:
        vocab_size: number of rows in the embedding table (largest input
            index + 1 at minimum).
        embedding_dim: width of each embedding vector.

    The forward pass maps integer indices to their embedding vectors and
    reduces each vector to a single non-negative value
    (embedding_dim -> 2 -> 1, ReLU after both linear layers).
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The first layer must accept whatever width the embedding produces;
        # it used to be fixed at 3, which broke any other embedding_dim.
        self.linear1 = nn.Linear(embedding_dim, 2)
        self.linear2 = nn.Linear(2, 1)

    def forward(self, inputs):
        """Return one value per input index; output shape is inputs.shape + (1,)."""
        embeds = self.embeddings(inputs)
        out = F.relu(self.linear1(embeds))
        out = F.relu(self.linear2(out))
        return out


# Seed torch's RNG so the random weight initialisation -- and therefore the
# losses printed below and the learned embeddings -- are identical on every run.
torch.manual_seed(0)

# Encoded descriptions are the inputs; scaled unit prices are the targets.
aut_t = torch.tensor(aut)
rat_t = torch.tensor(rat)
loss_function = nn.MSELoss()
# NOTE(review): `size` is the row count of the frame, not the number of distinct
# descriptions, so the embedding table has more rows than the encoder has codes.
model = Latent_Embed(size, 3)
optimizer = optim.SGD(model.parameters(), lr=0.0001)

# Plain per-sample SGD: one optimisation step for every (description, price) pair.
for epoch in range(2):
    total_loss = 0.0
    for context, target in zip(aut_t, rat_t):
        optimizer.zero_grad()
        # The model output is a price estimate (regression), not a log-probability.
        prediction = model(context)
        loss = loss_function(prediction.double(), target.view(1).double())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean squared error over the epoch.
    print("Loss: ", total_loss/len(aut_t))

Loss:  0.001889391144885278
Loss:  1.1804152379611065e-05


In [10]:
# Pull the embedding matrix out of the model as a plain array and label
# its three columns.
learned_vectors = model.embeddings.weight.detach().numpy()
embedding_weights = pd.DataFrame(learned_vectors, columns=['X1', 'X2', 'X3'])
embedding_weights

Unnamed: 0,X1,X2,X3
0,-1.342568,-1.368914,0.391239
1,0.565517,0.533157,2.327102
2,-0.344489,0.427260,-0.400409
3,-0.697449,-1.012041,0.519893
4,-0.494447,2.363062,0.076183
...,...,...,...
401599,1.097833,1.882981,-0.434406
401600,-1.185280,0.281107,-0.660715
401601,0.452597,0.076789,0.449076
401602,0.512595,-0.622056,-0.820739


In [11]:
from sklearn.decomposition import PCA

# Columns excluded from the projection; every other column of df is fed to PCA.
# NOTE(review): CustomerID (read as a string at load time) is not in this list
# and therefore takes part in the decomposition -- confirm that is intended.
columns_left_out = ['Description', 'StockCode', 'InvoiceNo', 'InvoiceDate',
                    'Country', 'Geography', 'Price Ranges']
pca_input = df.drop(columns_left_out, axis=1)

pca = PCA(n_components=3)
X_pca = pca.fit_transform(pca_input)
PCA_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3'])
PCA_df.head()

Unnamed: 0,PC1,PC2,PC3
0,2568.842554,4.85826,-0.449211
1,2568.842398,4.858593,0.391022
2,2568.841486,2.85834,-0.24839
3,2568.842398,4.858593,0.391022
4,2568.842398,4.858593,0.391022


In [12]:
# PCA_df was built from a bare array, so it carries a fresh 0..n-1 index, while
# df still has its original CSV row labels (rows were dropped earlier without
# re-indexing). Assigning a Series would align on those labels and put values on
# the wrong rows or leave NaN, so the components are attached by position.
assert len(PCA_df) == len(df), "PCA output and df must have the same number of rows"
df['pca_dec_1'] = PCA_df['PC1'].to_numpy()
df['pca_dec_2'] = PCA_df['PC2'].to_numpy()
df['pca_dec_3'] = PCA_df['PC3'].to_numpy()

In [13]:
# Row i of embedding_weights is the vector learned for encoder code i, not for
# the i-th row of df. `aut` holds the code of every df row (in row order), so
# indexing the weight matrix with it gives each row the vector of its own
# description. Plain arrays are assigned to avoid label-based index alignment.
assert len(aut) == len(df), "aut must hold one encoded description per df row"
row_embeddings = embedding_weights[['X1', 'X2', 'X3']].to_numpy()[aut]
df['embed_dec_1'] = row_embeddings[:, 0]
df['embed_dec_2'] = row_embeddings[:, 1]
df['embed_dec_3'] = row_embeddings[:, 2]

In [14]:
# Training window: every row invoiced strictly before 1 October 2011.
training_cutoff = datetime.date(2011, 10, 1)
invoiced_before_cutoff = df['InvoiceDate'].dt.date < training_cutoff
set_entrainement = df[invoiced_before_cutoff]
# Independent deep copy of the training rows.
basket_price = set_entrainement.copy(deep=True)

In [15]:
# Write the training-window rows to CSV, without the index column.
training_csv_path = '../data/features_no_live_data.csv'
set_entrainement.to_csv(training_csv_path, index=False)

In [16]:
# "Live" window: every row invoiced on or after 1 October 2011.
live_start = datetime.date(2011, 10, 1)
raw_live_data = df[df['InvoiceDate'].dt.date >= live_start]

# Shapes of the full frame and of the two windows.
for frame in (df, set_entrainement, raw_live_data):
    print(frame.shape)

(401604, 20)
(270062, 20)
(131542, 20)


In [17]:
# Write the live-window rows to CSV, without the index column.
live_csv_path = '../data/features_raw_live_data.csv'
raw_live_data.to_csv(live_csv_path, index=False)