In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.functional as F
from torch.utils.data import DataLoader, Dataset
from torch import optim
from torch.optim.lr_scheduler import StepLR
from torchvision import transforms
from torchvision.models import resnet18


import nltk
from nltk.corpus import stopwords

import os
import time

In [None]:
# Directory the input CSV files are read from: '<current working directory>/datasets'.
# The location depends on where the kernel was started.
data_root = os.path.join(os.getcwd(), 'datasets')

In [None]:
# Load the three input tables from `data_root`.
# engineered_features is merged onto train/test by 'user_id' in a later cell.
engineered_features = pd.read_csv(os.path.join(data_root, 'engineered_features.csv'))
train_orig = pd.read_csv(os.path.join(data_root, 'train.csv'))
test_orig = pd.read_csv(os.path.join(data_root, 'test.csv'))

In [None]:
# Left-join the engineered features onto each split by 'user_id'; rows without
# a match keep NaN in the engineered columns.
train_df = pd.merge(train_orig, engineered_features, how='left', on='user_id')
test_df = pd.merge(test_orig, engineered_features, how='left', on='user_id')

In [None]:
# Keep only the first 50 rows of each split and discard the identifier columns.
train_df = train_df.iloc[:50].drop(columns=['item_id', 'user_id'])
test_df = test_df.iloc[:50].drop(columns=['item_id', 'user_id'])

In [None]:
# Every column of engineered_features except the first one.
# NOTE(review): presumably the first column is the 'user_id' merge key - confirm.
agg_cols = engineered_features.columns[1:].tolist()

In [None]:
# Columns that are cast to strings and one-hot encoded by transform().
categorical = [
    'image_top_1',
    'param_1',
    'param_2',
    'param_3',
    'city',
    'region',
    'category_name',
    'parent_category_name',
    'user_type',
]

In [None]:
def _feature_names(estimator):
    """Return an estimator's output feature names across scikit-learn versions."""
    getter = getattr(estimator, 'get_feature_names_out', None)
    if getter is None:
        # Older scikit-learn releases only provide get_feature_names().
        getter = estimator.get_feature_names
    return list(getter())


def transform(df, categories):
    """Build the model-ready feature frame from a merged Avito frame.

    Steps: fill missing values, derive calendar and word-count features,
    vectorize 'title' (token counts) and 'description' (TF-IDF), and one-hot
    encode the categorical columns plus the calendar features.

    Args:
        df (pandas.DataFrame): Frame containing 'description', 'title',
            'price', 'image', 'activation_date', every column listed in the
            module-level ``agg_cols`` and every column in ``categories``.
            The frame is copied; the caller's object is not modified.
        categories (list of str): Columns to cast to strings and one-hot
            encode.

    Returns:
        pandas.DataFrame: The remaining original columns followed by the
        title count columns, the description TF-IDF columns and the one-hot
        columns, all sharing the index of ``df``.

    Notes:
        The vectorizers and the encoder are fit on ``df`` inside every call,
        so two calls on different frames generally produce different columns.
        Vectorizer token names are used as column names unchanged, so a token
        equal to an existing column name yields a duplicate column.
        The sparse matrices are densified, which is only practical for small
        frames.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Fill missing values (assignment instead of chained inplace fillna, which
    # is not guaranteed to write through on current pandas).
    df['description'] = df['description'].fillna('unknowndescription')
    df['title'] = df['title'].fillna('unknowntitle')
    df['price'] = df['price'].fillna(df['price'].mean())
    df['image'] = df['image'].fillna('noimage')

    for col in agg_cols:
        df[col] = df[col].fillna(-1)

    # Use the `categories` argument rather than a module-level list.
    for col in categories:
        df[col] = df[col].fillna('').astype(str)

    # Calendar features: day of week (Monday=0) and month number.
    activation = pd.to_datetime(df['activation_date'])
    df['weekday'] = activation.dt.weekday.fillna(0)
    df['month_num'] = activation.dt.month.fillna(0)
    df = df.drop(columns=['activation_date'])

    # Count words and unique words in the text fields.
    for col in ['description', 'title']:
        tokens = df[col].apply(str.split)
        df['num_words_' + col] = tokens.apply(len).fillna(0)
        df['num_unique_words_' + col] = tokens.apply(
            lambda words: len(set(words))).fillna(0)

    # Percentage of unique words among all words (0 when the text is empty).
    df['words_vs_unique_title'] = (df['num_unique_words_title'] /
                                   df['num_words_title'] * 100).fillna(0)
    df['words_vs_unique_description'] = (df['num_unique_words_description'] /
                                         df['num_words_description'] * 100).fillna(0)

    # Token counts for titles, TF-IDF over uni- and bigrams for descriptions.
    russian_stopwords = stopwords.words('russian')
    title_vectorizer = CountVectorizer(stop_words=russian_stopwords, lowercase=True)
    desc_vectorizer = TfidfVectorizer(stop_words=russian_stopwords,
                                      lowercase=True, ngram_range=(1, 2),
                                      max_features=15000)

    title_matrix = title_vectorizer.fit_transform(df['title'])
    desc_matrix = desc_vectorizer.fit_transform(df['description'])

    # Reuse df's index so the column-wise concat below aligns row by row even
    # when df does not carry a default RangeIndex.
    title_vecs = pd.DataFrame(title_matrix.toarray(),
                              columns=_feature_names(title_vectorizer),
                              index=df.index)
    desc_vecs = pd.DataFrame(desc_matrix.toarray(),
                             columns=_feature_names(desc_vectorizer),
                             index=df.index)

    # One-hot encoding; the first level of each column is dropped.
    encoder = OneHotEncoder(drop='first')
    encoded_matrix = encoder.fit_transform(
        df[list(categories) + ['weekday', 'month_num']])
    encoded_vecs = pd.DataFrame(encoded_matrix.toarray(),
                                columns=_feature_names(encoder),
                                index=df.index)

    df = df.drop(columns=list(categories) + ['description', 'title'])

    return pd.concat([df, title_vecs, desc_vecs, encoded_vecs], axis=1)

In [None]:
# Build the feature frames for both splits.
# NOTE(review): transform() fits its vectorizers and one-hot encoder inside
# each call, so the two resulting frames are not guaranteed to share the same
# columns - confirm before scoring test_df with a model trained on train_df.
train_df = transform(train_df, categorical)
test_df = transform(test_df, categorical)

In [None]:
# Hold out 10% of the training rows for validation; the fixed random_state
# makes the split repeatable. Note that train_df is rebound to the 90% part.
train_df, valid_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [None]:
# Show (rows, columns) for the train, validation and test frames, one per line.
print(train_df.shape, valid_df.shape, test_df.shape, sep='\n')

In [None]:
class AvitoDataset(Dataset):
    """Torch dataset serving tabular feature vectors and deal probabilities.

    Each item is a ``(features, target)`` pair of float32 tensors: ``features``
    has shape ``(1, 1, n_features)`` and ``target`` is a 0-dim tensor holding
    the row's 'deal_probability'.
    """

    def __init__(self, df, data_dir, transform=None):
        """
        Args:
            df (pandas.DataFrame): Frame with an 'image' column, a
                'deal_probability' column and otherwise numeric feature
                columns.
            data_dir (string): Directory with the images. It is stored but
                not read from, because image loading is not implemented.
            transform (callable, optional): Image transform. Since no image
                is loaded, passing one makes ``__getitem__`` raise
                ``NotImplementedError``.

        Raises:
            ValueError / TypeError: If a feature column cannot be converted
                to float32.
        """
        # NOTE(review): reset_index() without drop=True keeps the previous
        # index as an extra leading column (named 'index' for an unnamed
        # index), so it becomes part of the features. The notebook sizes the
        # network input as train_df.shape[1] - 1, which matches this - confirm
        # whether the index is really meant to be a feature.
        df = df.reset_index()
        self.images = df['image']
        self.deal_probs = df['deal_probability']
        self.features = df.drop(['deal_probability', 'image'], axis=1)

        # Cache float32 arrays once: per-item access is then a cheap array
        # lookup, and the tensors match the default float32 model weights.
        self._feature_values = self.features.to_numpy(dtype=np.float32)
        self._targets = self.deal_probs.to_numpy(dtype=np.float32)

        self.data_dir = data_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for the row at position ``idx``.

        Raises:
            NotImplementedError: If a transform was supplied; there is no
                loaded image to apply it to.
        """
        if self.transform is not None:
            # The image at os.path.join(self.data_dir, self.images[idx]) is
            # never read, so there is nothing the transform could act on.
            raise NotImplementedError(
                'AvitoDataset does not load images; transform cannot be applied')

        # torch.tensor copies, so callers can modify the result freely.
        features = torch.tensor(self._feature_values[idx]).view(1, 1, -1)
        gt = torch.tensor(self._targets[idx])

        return features, gt

In [None]:
# Mini-batch size and number of DataLoader worker processes.
batch_size, num_workers = 32, 8
# Width of the network input: one less than the number of train_df columns.
# NOTE(review): AvitoDataset removes 'image' and 'deal_probability' but gains
# an 'index' column from reset_index(), which is presumably why this is -1
# rather than -2 - confirm.
num_features = len(train_df.columns) - 1

In [None]:
# Wrap both frames as torch datasets, keyed by phase name.
datasets = {
    'Train': AvitoDataset(train_df, data_root),
    'Validation': AvitoDataset(valid_df, data_root),
}
# Direct handles on the two datasets.
train, valid = datasets['Train'], datasets['Validation']

In [None]:
# One shuffling DataLoader per phase, sharing batch size and worker count.
dataloaders = {
    phase: DataLoader(
        datasets[phase],
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )
    for phase in ('Train', 'Validation')
}

In [None]:
# Pick the hardware the model runs on: the first CUDA GPU when one is
# available, otherwise the CPU.

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
class NN(nn.Module):
    """Fully connected regressor producing one value in [0, 1] per sample.

    Two blocks of Linear -> ReLU -> BatchNorm1d -> Dropout(0.5) with 1024 and
    512 units, followed by a single-unit linear head and a sigmoid.
    """

    def __init__(self, in_features=None):
        """
        Args:
            in_features (int, optional): Width of the flattened input. When
                omitted, the module-level ``num_features`` is used.
        """
        super().__init__()

        if in_features is None:
            in_features = num_features

        # BatchNorm1d is the variant for (batch, features) activations;
        # BatchNorm2d expects (batch, channels, H, W) and does not fit the
        # output of a Linear layer.
        self.block1 = nn.Sequential(
            nn.Linear(in_features, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.5)
        )

        self.block2 = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.5)
        )

        # Regression head: 512 features -> 1 value.
        self.fc = nn.Linear(512, 1)

    def forward(self, x):
        """Run the network on a batch.

        Args:
            x (torch.Tensor): Batch whose first dimension is the batch size;
                all remaining dimensions are flattened, so both
                ``(B, 1, 1, F)`` and ``(B, F)`` are accepted.

        Returns:
            torch.Tensor: Shape ``(B,)``, values in [0, 1].

        Note:
            In training mode BatchNorm1d needs more than one sample per batch.
        """
        # Flatten to (batch, features) and match the parameter dtype so
        # float64 inputs do not clash with float32 weights.
        x = x.reshape(x.size(0), -1).to(self.fc.weight.dtype)
        x = self.block1(x)
        x = self.block2(x)
        x = self.fc(x)

        # torch.sigmoid is used directly: the notebook binds `F` to
        # torch.functional, which does not provide sigmoid.
        return torch.sigmoid(x).squeeze(-1)

In [None]:
# Training configuration: 10 epochs, mean-squared-error loss, Adam at lr 1e-4.
epochs = 10
model = NN().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# Display the architecture as the cell output.
model

In [None]:
def run_epoch(epoch, model, dataloaders, device, phase, criterion=None, optimizer=None):
    """Run one pass over ``dataloaders[phase]`` and report its loss.

    Args:
        epoch (int): Epoch number; accepted for the caller's convenience and
            not used in the computation.
        model (torch.nn.Module): Model to train or evaluate.
        dataloaders (dict): Maps phase name to an iterable of
            ``(inputs, labels)`` batches.
        device (torch.device): Device the batches are moved to.
        phase (str): ``'Train'`` updates the weights; any other value
            evaluates without gradients.
        criterion (callable, optional): Loss function. Defaults to the
            module-level ``criterion``.
        optimizer (torch.optim.Optimizer, optional): Optimizer used in the
            training phase. Defaults to the module-level ``optimizer``.

    Returns:
        tuple: ``(epoch_loss, epoch_rmse)``. ``epoch_loss`` is the
        sample-weighted mean of the batch losses and ``epoch_rmse`` is its
        square root, which is the epoch RMSE when ``criterion`` is a
        mean-reduced squared error such as ``nn.MSELoss()``.

    Raises:
        ValueError: If the loader yields no samples.
    """
    is_training = phase == 'Train'

    # Fall back to the notebook-level objects when none are passed in. The
    # lookup goes through globals() because the parameters shadow those names.
    module_scope = globals()
    if criterion is None:
        criterion = module_scope['criterion']
    if optimizer is None and is_training:
        optimizer = module_scope['optimizer']

    if is_training:
        model.train()
    else:
        model.eval()

    running_loss = 0.0
    samples_seen = 0

    # Looping through batches
    for inputs, labels in dataloaders[phase]:

        # ensures we're doing this calculation on our GPU if possible
        inputs = inputs.to(device)
        labels = labels.to(device)

        if is_training:
            optimizer.zero_grad()

        # Calculate gradients only if we're in the training phase
        with torch.set_grad_enabled(is_training):
            # Flatten predictions and targets to the same 1-D shape and dtype;
            # otherwise the loss would broadcast, e.g. (B, 1) against (B,).
            outputs = model(inputs).reshape(-1)
            targets = labels.reshape(-1).to(outputs.dtype)

            loss = criterion(outputs, targets)

            # Adjust weights through backpropagation if we're in training phase
            if is_training:
                loss.backward()
                optimizer.step()

        # Weight each batch by its size so a short final batch is not
        # over-represented in the epoch mean.
        batch_len = inputs.size(0)
        running_loss += loss.item() * batch_len
        samples_seen += batch_len

    if samples_seen == 0:
        raise ValueError("no samples were produced for phase {!r}".format(phase))

    epoch_loss = running_loss / samples_seen
    # Take the root of the epoch mean rather than averaging per-batch roots,
    # which would understate the RMSE.
    epoch_rmse = float(np.sqrt(epoch_loss))

    return epoch_loss, epoch_rmse


In [None]:
def train(model, criterion, optimizer, num_epochs, dataloaders, device):
    """Train for ``num_epochs`` epochs and restore the best validation weights.

    Every epoch runs ``run_epoch`` for the 'Train' phase and then for the
    'Validation' phase and prints one table row. The second value returned by
    ``run_epoch`` is an RMSE, so the epoch with the LOWEST validation value is
    kept as the best one.

    Args:
        model (torch.nn.Module): Model to train; modified in place.
        criterion: Accepted for interface compatibility; not passed on,
            because ``run_epoch`` is called with five positional arguments.
        optimizer: Same as ``criterion``.
        num_epochs (int): Number of epochs to run.
        dataloaders (dict): Passed through to ``run_epoch``.
        device (torch.device): Passed through to ``run_epoch``.

    Returns:
        torch.nn.Module: ``model`` with the best validation weights loaded.
    """
    import copy  # local import: only needed to snapshot the weights

    start = time.time()

    # state_dict() returns references to the live tensors, so a deep copy is
    # needed for the snapshot to survive later optimizer steps.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_rmse = float('inf')

    rule = '-' * 86
    print('| Epoch\t | Train Loss\t| Train RMSE\t| Valid Loss\t| Valid RMSE\t| Epoch Time |')
    print(rule)

    # Iterate through epochs
    for epoch in range(num_epochs):

        epoch_start = time.time()

        # Training phase
        train_loss, train_rmse = run_epoch(epoch, model, dataloaders, device, 'Train')

        # Validation phase
        val_loss, val_rmse = run_epoch(epoch, model, dataloaders, device, 'Validation')

        epoch_time = time.time() - epoch_start

        # Print statistics after the validation phase
        print("| {}\t | {:.4f}\t| {:.4f}\t| {:.4f}\t| {:.4f}\t| {:.0f}m {:.0f}s     |"
                      .format(epoch + 1, train_loss, train_rmse, val_loss, val_rmse,
                              epoch_time // 60, epoch_time % 60))

        # Snapshot the weights when the validation error improves (lower is better).
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_model_wts = copy.deepcopy(model.state_dict())

    total_time = time.time() - start

    print(rule)
    print('Training complete in {:.0f}m {:.0f}s'.format(total_time // 60, total_time % 60))
    print('Best validation RMSE: {:.4f}'.format(best_rmse))

    # load best model weights and return them
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Run the training loop and rebind `model` to the returned network.
model = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=epochs,
    dataloaders=dataloaders,
    device=device,
)