### Embeddings  - building out categorical embeddings from:  
* https://www.fast.ai/2018/04/29/categorical-embeddings/
* https://medium.com/@hiromi_suenaga/deep-learning-2-part-1-lesson-4-2048a26d58aa
* https://forums.fast.ai/t/understanding-columnarmodeldata-from-data-frame-from-rossman/8140
* https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/

In [291]:
from torch.utils.data import DataLoader as torch_dl
from torch.utils.data import Dataset
from torch import  nn
from torch import optim
from torch.nn.init import *
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, scale
import pandas as pd
import numpy as np
import pickle

In [292]:
#directory='c:/users/da1933/desktop/cds/ids_project/'
directory='~/desktop/idsproject/'

df1=pd.read_csv(directory+'wine/winemag-data-130k-v2.csv',index_col=0)
df2=pd.read_csv(directory+'wine/winemag-data_first150k.csv',index_col=0)
df=pd.concat([df1.drop(['taster_name','taster_twitter_handle','title'],axis=1),df2]).reset_index(drop=True)

df['over90']=df['points'].apply(lambda x: 1 if x>=90 else 0)
df['textlen']=df['description'].apply(lambda x: len(x))
df['textlen']=scale(df[['textlen']])
df['price']=df['price'].fillna(np.mean(df['price']))
df['price']=scale(df[['price']])

  # Remove the CWD from sys.path while we load stuff.


In [293]:
continents=pickle.load(open('continents.sav','rb'))
df['continent']=df['country'].map(continents)

categorical_features=['country',
                      'variety',
                      'continent']

for i in categorical_features:
    print(i,df[i].nunique(),'nan: ',sum(df[i].isna()))
    df[i]=df[i].fillna('unknown')

country 50 nan:  68
variety 756 nan:  1
continent 6 nan:  69


In [295]:
df=df.set_index(np.random.permutation(df.index.values)).sort_index()

# Determine split index:
split_index=int(np.round(df.shape[0]*.8))

# Assign training and testing datasets based on split index
train_df=df.iloc[:split_index,:]
test_df=df.iloc[split_index:,:]

In [296]:
#begin with regression problem
#output_feature='points'
#data=df[categorical_features+['textlen','points']]

output_feature='over90'
#data=train_df[categorical_features+['textlen','over90','price']]
#testdata=test_df[categorical_features+['textlen','over90','price']]
testdata=test_df[categorical_features+['over90','price']]
data=train_df[categorical_features+['over90','price']]

In [298]:
class TabularDataset(Dataset):
    def __init__(self, data, cat_cols=None, output_col=None):
        """
        Characterizes a Dataset for PyTorch

        Parameters
        ----------

        data: pandas data frame
          The data frame object for the input data. It must
          contain all the continuous, categorical and the
          output columns to be used.

        cat_cols: List of strings
          The names of the categorical columns in the data.
          These columns will be passed through the embedding
          layers in the model. These columns must be
          label encoded beforehand. 

        output_col: string
          The name of the output variable column in the data
          provided.
        """

        self.n = data.shape[0]

        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            self.y =  np.zeros((self.n, 1))

        self.cat_cols = cat_cols if cat_cols else []
        self.cont_cols = [col for col in data.columns
                          if col not in self.cat_cols + [output_col]]

        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1))

        if self.cat_cols:
            self.cat_X = data[cat_cols].astype(np.int64).values
        else:
            self.cat_X =  np.zeros((self.n, 1))

    def __len__(self):
        """
        Denotes the total number of samples.
        """
        return self.n

    def __getitem__(self, idx):
        """
        Generates one sample of data.
        """
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

In [299]:
#The nn.Module class is the base class for all neural networks in PyTorch. 
#Our model, FeedForwardNN will subclass the nn.Module class. 
#In the __init__ method of our class, we will initialize the various layers 
#that will be used in the model and the forward method would define the 
#various computations performed in the network.

import torch
import torch.nn as nn
import torch.nn.functional as F


class FeedForwardNN(nn.Module):

    def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
               output_size, emb_dropout, lin_layer_dropouts):

        """
        Parameters
        ----------

        emb_dims: List of two element tuples
          This list will contain a two element tuple for each
          categorical feature. The first element of a tuple will
          denote the number of unique values of the categorical
          feature. The second element will denote the embedding
          dimension to be used for that feature.

        no_of_cont: Integer
          The number of continuous features in the data.

        lin_layer_sizes: List of integers.
          The size of each linear layer. The length will be equal
          to the total number
          of linear layers in the network.

        output_size: Integer
          The size of the final output.

        emb_dropout: Float
          The dropout to be used after the embedding layers.

        lin_layer_dropouts: List of floats
          The dropouts to be used after each linear layer.
        """

        super().__init__()

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                         for x, y in emb_dims])

        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont

        # Linear Layers
        first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
                                    lin_layer_sizes[0])

        self.lin_layers =\
         nn.ModuleList([first_lin_layer] +\
              [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
               for i in range(len(lin_layer_sizes) - 1)])

        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal(lin_layer.weight.data)

        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1],
                                      output_size)
        nn.init.kaiming_normal(self.output_layer.weight.data)

        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                        for size in lin_layer_sizes])

        # Dropout Layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.droput_layers = nn.ModuleList([nn.Dropout(size)
                                      for size in lin_layer_dropouts])

    def forward(self, cont_data, cat_data):

        if self.no_of_embs != 0:
            x = [emb_layer(cat_data[:, i])
               for i,emb_layer in enumerate(self.emb_layers)]
            x = torch.cat(x, 1)
            x = self.emb_dropout_layer(x)

        if self.no_of_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)

            if self.no_of_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1) 
            else:
                x = normalized_cont_data

        for lin_layer, dropout_layer, bn_layer in\
            zip(self.lin_layers, self.droput_layers, self.bn_layers):
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)

        x = self.output_layer(x)

        return x

In [300]:
# After creating the network architecture we have to run the training loop. 
# For the purpose of demonstration, I am using the dataset from the Kaggle 
# competition – House Prices: Advanced Regression Techniques.

# We need to instantiate an object of the TabularData class we created earlier.
# But before that, we need to label encode the categorical features. 
# For this, we will be using sklearn.preprocessing.LabelEncoder.

from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for cat_col in categorical_features:
        label_encoders[cat_col] = LabelEncoder()
        data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])

#Let’s instantiate an object of the TabularDataset class.
dataset = TabularDataset(data=data, cat_cols=categorical_features,
                             output_col=output_feature)
testdataset = TabularDataset(data=data, cat_cols=categorical_features,
                             output_col=output_feature)

# In order to run the training loop, we need to create a torch.util.data.Dataloader
# object. It serves the following purpose –
### creates batches from the dataset
### shuffles the data
### loads the data in parallel

batchsize = 128
dataloader = DataLoader(dataset, batchsize, shuffle=True)

# Now that we have created the basic data structure to run the training loop,
# we need to instantiate a model object of the FeedForwadNN class created earlier.
# This class requires a list of tuples, where each tuple represents a pair of total 
# and the embedding dimension of a categorical variable.

cat_dims = [int(data[col].nunique()) for col in categorical_features]
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [None]:
l1=[i**2 for i in range(7,20,3)]
l2=[i**2 for i in range(7,20,3)]

do=[.01,.05,.1]

# The number of continuous features used is 1. The hidden layer dimension is 50 and 100
# for the first and second layers respectively. The embedding dropout used is 0.04. 
# The hidden layer dropouts are 0.001 and 0.01.
acc=[]

for lin1 in l1:
    for lin2 in l2:
        for d1 in do:
            for d2 in do:
                model = FeedForwardNN(emb_dims, no_of_cont=2, lin_layer_sizes=[lin1, lin2],\
                                      output_size=1, emb_dropout=d1,\
                                      lin_layer_dropouts=[d1,d2])

                # Finally, let’s run the training loop –

                no_of_epochs = 20
                criterion = nn.MSELoss()    # Regression
                #criterion = nn.HingeEmbeddingLoss() #??????
                optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
                    
                for epoch in range(no_of_epochs):
                    for y, cont_x, cat_x in dataloader:

                        cat_x = cat_x
                        cont_x = cont_x
                        y  = y

                        # Forward Pass
                        preds = model(Variable(cont_x), cat_x)
                        loss = criterion(preds, Variable(y))

                        # Backward Pass and Optimization
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                    if epoch%5==0:
                        print(epoch,loss)

                testcat=testdataset.cat_X[:100000,:]
                testcont=testdataset.cont_X[:100000,:]
                testy=testdataset.y[:100000]

                model.eval()

                testpreds=model(Variable(torch.from_numpy(testcont)),Variable(torch.from_numpy(testcat)))

                accuracy=sum(np.round(testpreds.data.numpy().flatten('C'))==testy.flatten('C'))/len(testy)

                print('Accuracy:',accuracy)

                acc.append((lin1,lin2,d1,d2,accuracy))

In [290]:
acc[np.argmax([i[-1] for i in acc])]

(169, 100, 0.1, 0.01, 0.76611)

In [304]:
model = FeedForwardNN(emb_dims, no_of_cont=1, lin_layer_sizes=[169, 100],\
                      output_size=1, emb_dropout=.1,\
                      lin_layer_dropouts=[.1,.01])

# Finally, let’s run the training loop –

no_of_epochs = 5
criterion = nn.MSELoss()    # Regression
#criterion = nn.HingeEmbeddingLoss() #??????
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

for epoch in range(no_of_epochs):
    for y, cont_x, cat_x in dataloader:

        cat_x = cat_x
        cont_x = cont_x
        y  = y

        # Forward Pass
        preds = model(Variable(cont_x), cat_x)
        loss = criterion(preds, Variable(y))

        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(epoch,loss)

testcat=testdataset.cat_X
testcont=testdataset.cont_X
testy=testdataset.y

model.eval()

testpreds=model(Variable(torch.from_numpy(testcont)),Variable(torch.from_numpy(testcat)))

accuracy=sum(np.round(testpreds.data.numpy().flatten('C'))==testy.flatten('C'))/len(testy)

print('Accuracy:',accuracy)

0 Variable containing:
 0.1727
[torch.FloatTensor of size 1]

1 Variable containing:
 0.2118
[torch.FloatTensor of size 1]

2 Variable containing:
 0.1650
[torch.FloatTensor of size 1]

3 Variable containing:
 0.1879
[torch.FloatTensor of size 1]

4 Variable containing:
 0.1766
[torch.FloatTensor of size 1]

Accuracy: 0.7630172525042164
