In [110]:
import pandas as pd, os, numpy as np, torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from pathlib import Path
from collections import OrderedDict
import torch.nn.functional as F

Path.ls= lambda x: [i.name for i in os.scandir(x)]

In [10]:
small_datasets=Path(r'd:\datasets\small_datasets')
small_datasets.ls()

['adult',
 'car.csv',
 'doc',
 'dont-overfit-ii',
 'he',
 'house-prices-advanced-regression-techniques',
 'image-files',
 'katta',
 'titanic.csv',
 'train_1.csv',
 'twitter-emoji-prediction.zip']

In [67]:
df=pd.read_csv(small_datasets/'car.csv', header=None); df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [68]:
df.shape

(1728, 7)

In [69]:
y=df.iloc[:, 6]
x=df.iloc[:, :6]

In [70]:
x.shape, y.shape

((1728, 6), (1728,))

In [71]:
x_train, y_train, x_val, y_val= x[:1200], y[:1200], x[1201:], y[1201:]

In [72]:
x_train.shape, x_val.shape

((1200, 6), (527, 6))

In [73]:
x_train.dtypes

0    object
1    object
2    object
3    object
4    object
5    object
dtype: object

In [74]:
class Data:
    def __init__(self, x, y):
        self.x, self.y= x.copy(), y.copy()

In [75]:
train, val= Data(x_train, y_train), Data(x_val, y_val)

In [76]:
def frames_encoder(df):
    cat_dict={}
    for col in df.columns:
        if df[col].dtype=='object':
            cat_dict[col]=OrderedDict(map(reversed, enumerate(df[col].unique())))
    return cat_dict

def encode_data(train, val):
    d= frames_encoder(train.x)
    for col in train.x.columns:
        if train.x[col].dtype=='O':
            train.x.loc[:,col].replace(d[col], inplace=True)
            val.x.loc[:,col].replace(d[col], inplace=True)
    return d

In [77]:
encode_data(train, val)

{0: OrderedDict([('vhigh', 0), ('high', 1), ('med', 2)]), 1: OrderedDict([('vhigh', 0), ('high', 1), ('med', 2), ('low', 3)]), 2: OrderedDict([('2', 0), ('3', 1), ('4', 2), ('5more', 3)]), 3: OrderedDict([('2', 0), ('4', 1), ('more', 2)]), 4: OrderedDict([('small', 0), ('med', 1), ('big', 2)]), 5: OrderedDict([('low', 0), ('med', 1), ('high', 2)])}


{0: OrderedDict([('vhigh', 0), ('high', 1), ('med', 2)]),
 1: OrderedDict([('vhigh', 0), ('high', 1), ('med', 2), ('low', 3)]),
 2: OrderedDict([('2', 0), ('3', 1), ('4', 2), ('5more', 3)]),
 3: OrderedDict([('2', 0), ('4', 1), ('more', 2)]),
 4: OrderedDict([('small', 0), ('med', 1), ('big', 2)]),
 5: OrderedDict([('low', 0), ('med', 1), ('high', 2)])}

In [81]:
def encode_y(train, val):
    d=OrderedDict(map(reversed, enumerate(train.y.unique())))
    train.y.replace(d, inplace=True), val.y.replace(d, inplace=True)
    return d

In [82]:
encode_y(train, val)

OrderedDict([('unacc', 0), ('acc', 1), ('vgood', 2), ('good', 3)])

In [88]:
cat_levels = [len(np.unique(train.x[col].values)) for col in train.x.columns]
cat_levels

[3, 4, 4, 3, 3, 3]

In [115]:
class TabularModel(nn.Module):
    def __init__(self, cat_levels, emb_size=5, n_class=4):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(c, emb_size) for c in cat_levels])
        self.lin1 = nn.Linear(6*emb_size, 20)
        self.lin2 = nn.Linear(20, n_class)
        self.bn = nn.BatchNorm1d(20)
        self.emb_drop = nn.Dropout(0.2)
        self.drops = nn.Dropout(0.2)

    def forward(self, x):
        # try to write a shorter code
        e0 = self.embs[0](x[:,0])
        e1 = self.embs[1](x[:,1])
        e2 = self.embs[2](x[:,2])
        e3 = self.embs[3](x[:,3])
        e4 = self.embs[4](x[:,4])
        e5 = self.embs[5](x[:,5])
        x = torch.cat([e0, e1, e2, e3, e4, e5], 1)
        x = self.emb_drop(x)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn(x)
        x = self.lin2(x)
        return x

In [116]:
model = TabularModel(cat_levels)

In [92]:
embs=nn.ModuleList([nn.Embedding(c, 5) for c in cat_levels])

In [93]:
embs

ModuleList(
  (0): Embedding(3, 5)
  (1): Embedding(4, 5)
  (2): Embedding(4, 5)
  (3): Embedding(3, 5)
  (4): Embedding(3, 5)
  (5): Embedding(3, 5)
)

In [96]:
def get_optimizer(model, lr = 0.01, wd = 0.0):
    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    return optim

In [100]:
x_train = torch.LongTensor(train.x.values)
y_train = torch.LongTensor(train.y.values)

In [101]:
def val_metric(model):
    model.eval()
    correct = 0
    out = model(x_val)
    loss = F.cross_entropy(out, y_val)
    pred = torch.max(out, 1)[1]
    correct += (pred == y_val).float().sum().item()
    return loss, correct/y_val.shape[0]

In [102]:
model = TabularModel(cat_levels)
optim = get_optimizer(model, lr = 0.01, wd = 0.0)

In [117]:
def train_model(model, optim, epochs=5):
    for i in range(epochs):
        model.train()
        out = model(x_train)
        loss = F.cross_entropy(out, y_train)   
        optim.zero_grad()
        loss.backward()
        optim.step()
        val_loss, val_acc = val_metric(model)
        if i % 15 == 0: 
            print("train loss %.3f val loss %.3f and accuracy %.3f" % (loss, val_loss, val_acc))

In [118]:
train_model(model, optim, epochs=150)

TypeError: '(slice(None, None, None), 0)' is an invalid key