In [110]:
import pandas as pd, os, numpy as np, torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from pathlib import Path
from collections import OrderedDict
import torch.nn.functional as F

Path.ls= lambda x: [i.name for i in os.scandir(x)]

In [10]:
small_datasets=Path(r'd:\datasets\small_datasets')
small_datasets.ls()

['adult',
 'car.csv',
 'doc',
 'dont-overfit-ii',
 'he',
 'house-prices-advanced-regression-techniques',
 'image-files',
 'katta',
 'titanic.csv',
 'train_1.csv',
 'twitter-emoji-prediction.zip']

In [67]:
df=pd.read_csv(small_datasets/'car.csv', header=None); df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [141]:
df=df.reindex(np.random.permutation(df.index))

In [142]:
df.shape

(1728, 7)

In [143]:
y=df.iloc[:, 6]
x=df.iloc[:, :6]

In [144]:
x.shape, y.shape

((1728, 6), (1728,))

In [145]:
x_train, y_train, x_val, y_val= x[:1200], y[:1200], x[1201:], y[1201:]

In [146]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5
41,vhigh,vhigh,3,4,med,high
899,med,vhigh,3,2,big,high
842,high,low,5more,2,med,high
473,high,vhigh,3,4,med,high
1241,med,low,3,more,big,high


In [147]:
x_train.shape, x_val.shape

((1200, 6), (527, 6))

In [148]:
x_train.dtypes

0    object
1    object
2    object
3    object
4    object
5    object
dtype: object

In [149]:
class Data:
    def __init__(self, x, y):
        self.x, self.y= x.copy(), y.copy()

In [150]:
train, val= Data(x_train, y_train), Data(x_val, y_val)

In [151]:
def frames_encoder(df):
    cat_dict={}
    for col in df.columns:
        if df[col].dtype=='object':
            cat_dict[col]=OrderedDict(map(reversed, enumerate(df[col].unique())))
    return cat_dict

def encode_data(train, val):
    d= frames_encoder(train.x)
    for col in train.x.columns:
        if train.x[col].dtype=='O':
            train.x.loc[:,col].replace(d[col], inplace=True)
            val.x.loc[:,col].replace(d[col], inplace=True)
    return d

In [152]:
encode_data(train, val)

{0: OrderedDict([('vhigh', 0), ('med', 1), ('high', 2), ('low', 3)]),
 1: OrderedDict([('vhigh', 0), ('low', 1), ('med', 2), ('high', 3)]),
 2: OrderedDict([('3', 0), ('5more', 1), ('4', 2), ('2', 3)]),
 3: OrderedDict([('4', 0), ('2', 1), ('more', 2)]),
 4: OrderedDict([('med', 0), ('big', 1), ('small', 2)]),
 5: OrderedDict([('high', 0), ('med', 1), ('low', 2)])}

In [155]:
def encode_y(train, val):
    d=OrderedDict(map(reversed, enumerate(train.y.unique())))
    train.y.replace(d, inplace=True), val.y.replace(d, inplace=True)
    return d

In [156]:
encode_y(train, val)

OrderedDict([('unacc', 0), ('vgood', 1), ('acc', 2), ('good', 3)])

In [159]:
cat_levels = [len(np.unique(train.x[col].values)) for col in train.x.columns]
cat_levels

[4, 4, 4, 3, 3, 3]

In [172]:
class TabularModel(nn.Module):
    def __init__(self, cat_levels, emb_size=5, n_class=4):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(c, emb_size) for c in cat_levels])
        self.lin1 = nn.Linear(6*emb_size, 20)
        self.lin2 = nn.Linear(20, n_class)
        self.bn = nn.BatchNorm1d(20)
        self.emb_drop = nn.Dropout(0.2)
        self.drops = nn.Dropout(0.2)

    def forward(self, x):
        # try to write a shorter code
        e0 = self.embs[0](x[:,0])
        e1 = self.embs[1](x[:,1])
        e2 = self.embs[2](x[:,2])
        e3 = self.embs[3](x[:,3])
        e4 = self.embs[4](x[:,4])
        e5 = self.embs[5](x[:,5])
        x = torch.cat([e0, e1, e2, e3, e4, e5], 1)
        x = self.emb_drop(x)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn(x)
        x = self.lin2(x)
        return x

In [174]:
embs=nn.ModuleList([nn.Embedding(c, 5) for c in cat_levels])

In [175]:
embs

ModuleList(
  (0): Embedding(4, 5)
  (1): Embedding(4, 5)
  (2): Embedding(4, 5)
  (3): Embedding(3, 5)
  (4): Embedding(3, 5)
  (5): Embedding(3, 5)
)

In [176]:
def get_optimizer(model, lr = 0.01, wd = 0.0):
    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    return optim

In [177]:
x_train = torch.LongTensor(train.x.values)
y_train = torch.LongTensor(train.y.values)

In [178]:
x_val=torch.LongTensor(val.x.values)
y_val=torch.LongTensor(val.y.values)

In [179]:
def val_metric(model):
    model.eval()
    correct = 0
    out = model(x_val)
    loss = F.cross_entropy(out, y_val)
    pred = torch.max(out, 1)[1]
    correct += (pred == y_val).float().sum().item()
    return loss, correct/y_val.shape[0]

In [180]:
model = TabularModel(cat_levels)
optim = get_optimizer(model, lr = 0.01, wd = 0.0)

In [181]:
def train_model(model, optim, epochs=5):
    for i in range(epochs):
        model.train()
        out = model(x_train)
        loss = F.cross_entropy(out, y_train)   
        optim.zero_grad()
        loss.backward()
        optim.step()
        val_loss, val_acc = val_metric(model)
        if i % 15 == 0: 
            print("train loss %.3f val loss %.3f and accuracy %.3f" % (loss, val_loss, val_acc))

In [182]:
train_model(model, optim, epochs=150)

train loss 1.461 val loss 1.160 and accuracy 0.694
train loss 0.779 val loss 0.634 and accuracy 0.776
train loss 0.494 val loss 0.399 and accuracy 0.871
train loss 0.344 val loss 0.290 and accuracy 0.901
train loss 0.278 val loss 0.242 and accuracy 0.918
train loss 0.264 val loss 0.200 and accuracy 0.943
train loss 0.235 val loss 0.179 and accuracy 0.941
train loss 0.224 val loss 0.177 and accuracy 0.939
train loss 0.221 val loss 0.161 and accuracy 0.941
train loss 0.216 val loss 0.140 and accuracy 0.947


In [185]:
path_data=Path(r'd:\datasets\new_datasets\sberbank-russian-housing-market')
path_data.ls()

['data_dictionary.txt',
 'macro.csv.zip',
 'New folder',
 'sample_submission.csv.zip',
 'test.csv',
 'train.csv',
 '__MACOSX']

In [186]:
df=pd.read_csv(path_data/'train.csv');df.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [187]:
df.shape

(30471, 292)

In [190]:
mis=pd.DataFrame(df.isna().mean()); mis.head()

Unnamed: 0,0
id,0.0
timestamp,0.0
full_sq,0.0
life_sq,0.209478
floor,0.005481


In [192]:
mis[mis.iloc[:,0]>0]

Unnamed: 0,0
life_sq,0.209478
floor,0.005481
max_floor,0.314135
material,0.314135
build_year,0.44649
num_room,0.314135
kitch_sq,0.314135
state,0.44498
preschool_quota,0.219487
school_quota,0.219389
