In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch import optim
from torch.optim.lr_scheduler import StepLR
from torchvision import transforms
from torchvision.models import resnet18, resnet34

from PIL import Image

import nltk
from nltk.corpus import stopwords

import os
import shutil
import time

In [2]:
data_root = os.path.join(os.getcwd(), 'datasets')
image_root = os.path.join(data_root, 'images')

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [4]:
# loading original and engineered features
engineered_features = pd.read_csv(os.path.join(data_root, 'engineered_features.csv'))
train_orig = pd.read_csv(os.path.join(data_root, 'train.csv'))
test_orig = pd.read_csv(os.path.join(data_root, 'test.csv'))

In [5]:
train_df = train_orig.merge(engineered_features, on='user_id', how='left')
test_df = test_orig.merge(engineered_features, on='user_id', how='left')

In [6]:
# Wont need these columns
train_df = train_df.drop(['item_id', 'user_id'], axis=1)
test_df = test_df.drop(['item_id', 'user_id'], axis=1)

In [7]:
agg_cols = list(engineered_features.columns)[1:]

In [8]:
categorical = [
    'image_top_1', 'param_1', 'param_2', 'param_3', 
    'city', 'region', 'category_name', 'parent_category_name', 'user_type'
]

In [9]:
def transform(df, agg_cols, categories):
    """Creates some additional features and extracts text features from an ad dataset"""
    
    # Fill missing values
    df['description'].fillna('unknowndescription', inplace=True)
    df['title'].fillna('unknowntitle', inplace=True)
    
    df['price'].fillna(df['price'].mean(), inplace=True)
    df['image'].fillna('noimage', inplace=True)
    
    for col in agg_cols:
        df[col].fillna(-1, inplace=True)
        
    for col in categorical:
        df.loc[:, col] = df[col].fillna('').astype(str)
    
    # Engineer weekday feature
    df['weekday'] = pd.to_datetime(df['activation_date']).dt.day.fillna(0)
    df['month_num'] = pd.to_datetime(df['activation_date']).dt.day.fillna(0)
    df.drop(['activation_date'], axis=1, inplace=True)
    
    # Count number of words and unique words in text fields
    for col in ['description', 'title']:
        df['num_words_' + col] = df[col].apply(lambda comment: len(comment.split())).fillna(0)
        df['num_unique_words_' + col] = df[col].apply(lambda comment: 
                                                      len(set(w for w in comment.split()))).fillna(0)
    
    # Compute ratio  of words to unique words
    df['words_vs_unique_title'] = (df['num_unique_words_title'] / 
                                   df['num_words_title'] * 100).fillna(0)
    df['words_vs_unique_description'] = (df['num_unique_words_description'] / 
                                         df['num_words_description'] * 100).fillna(0)
    
    # TF-IDF for title and description
    title_vectorizer = CountVectorizer(stop_words=stopwords.words('russian'), lowercase=True)
    
    desc_vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'), 
                                            lowercase=True, ngram_range=(1, 2),
                                            max_features=15000)
    
    title_vecs = title_vectorizer.fit_transform(df['title'])
    desc_vecs = desc_vectorizer.fit_transform(df['description'])

    title_vecs = pd.DataFrame(title_vecs.todense(), columns=title_vectorizer.get_feature_names())
    desc_vecs = pd.DataFrame(desc_vecs.todense(), columns=desc_vectorizer.get_feature_names())
    
    # one hot encoding
    encoder = OneHotEncoder(drop='first')
    
    encoded_vecs = encoder.fit_transform(df[categories + ['weekday', 'month_num']])
    encoded_vecs = pd.DataFrame(encoded_vecs.todense(), columns=encoder.get_feature_names())
    
    df.drop(categories+['description', 'title'], axis=1, inplace=True)
    
    # Concanenate textual and tabular features
    df = pd.concat([df, title_vecs, desc_vecs, encoded_vecs], axis=1)
    
    return df

In [10]:
train_len = len(train_df)
test_len = len(test_df)

deal_probs = train_df['deal_probability']

In [12]:
# Concatenate training and testing set prior to computing textual features
total_df = pd.concat([train_df.drop(['deal_probability'], axis=1), test_df]).reset_index()

# Engineer tabular and textual features
total_df = transform(total_df,agg_cols, categorical)

In [14]:
# Extract training and testing set after engineering features
train_df = total_df.iloc[:train_len, :].copy()
train_df['deal_probability'] = deal_probs
test_df = total_df.iloc[test_len:, :].copy()

In [15]:
train_df, valid_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [16]:
print(train_df.shape) 
print(valid_df.shape) 
print(test_df.shape) 

(45, 4423)
(5, 4423)
(50, 4422)


In [17]:
class AvitoDataset(Dataset):
    """Avito Torch dataset."""

    def __init__(self, df, data_dir, image_dir, image_feature_extractor, transform=None, testing=False):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        df = df.reset_index()
        self.images = df['image']
        
        self.testing = testing
        if not testing:
            self.deal_probs = df['deal_probability']
            
        if testing:
            self.features = df.drop(['image'], axis=1)
        else:
            self.features = df.drop(['deal_probability', 'image'], axis=1)
        
        self.data_dir = data_dir
        self.image_dir = image_dir
        
        self.image_feature_extractor = image_feature_extractor
        
        self.transform = transform

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        
        img_name = os.path.join(self.image_dir, self.images[idx]) + '.jpg'
        
        if self.images[idx] == 'noimage':
            image = None
        else:
            image = Image.open(img_name)
       
        features = torch.tensor(self.features.iloc[idx])
        
        gt = torch.tensor(0).unsqueeze(0)
        if not self.testing:
            gt = torch.tensor(self.deal_probs[idx]).unsqueeze(0)
        
        if self.transform and image:
            image = self.transform(image).unsqueeze(0)
            
            # Extract 1000 ResNet image features 
            image_features = image_feature_extractor(image).squeeze(0)
            
            # concatenate these to the tabular and text features
            features = torch.cat((features, image_features), dim = 0)
        else:
            features = torch.cat((features, torch.zeros(1000)), dim = 0)
        
        return features, gt

In [18]:
batch_size = 32
num_workers = 8

# Resize each image to 224x 224 for resnet
input_size = (224, 224)

# Tabular and text features, minus the deal prob, plus the 100 image features from ResNet
num_features = train_df.shape[1] - 1 + 1000

In [19]:
torch_transforms = transforms.Compose([transforms.Resize(input_size),
                                       transforms.ToTensor()])

In [20]:
class ResNetExtractor(nn.Module):
    """Pretrained ResNet model that will extract image features"""
    
    def __init__(self):
        super(ResNetExtractor, self).__init__()

        resnet = resnet34(pretrained = True)
        
        # Freeze the entire pretrained network
        for layer in resnet.parameters():
            layer.requires_grad = False
            
        self.feature_extraction = resnet
        
    
    def forward(self, x):
        x = self.feature_extraction(x)
        
        return x

In [21]:
image_feature_extractor = ResNetExtractor()
image_feature_extractor.eval()

ResNetExtractor(
  (feature_extraction): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, aff

In [22]:
train = AvitoDataset(train_df, data_root, image_root, image_feature_extractor, torch_transforms)
valid = AvitoDataset(valid_df, data_root, image_root, image_feature_extractor, torch_transforms)

datasets = {'Train': train, 'Validation': valid}

In [23]:
dataloaders = {x: DataLoader(datasets[x], batch_size=batch_size, shuffle=True, num_workers = num_workers)
              for x in ['Train', 'Validation']}

In [24]:
class NN(nn.Module):
    """Nueral net that takes tabular, text, and image features and predicts deal probability """

    def __init__(self):
        super(NN, self).__init__()

        self.block1 = nn.Sequential(
            nn.Linear(num_features, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.5)
        )
        
        
        self.block2 =  nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.5)
        )
        
        # Mimic the second block here, except have this block extract 128 features
        self.fc =  nn.Linear(512, 1)
        self.sigmoid = torch.sigmoid
        
    
    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        
        return x

In [25]:
model = NN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
epochs = 10
model.to(device)

NN(
  (block1): Sequential(
    (0): Linear(in_features=5422, out_features=1024, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.5, inplace=False)
  )
  (block2): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.5, inplace=False)
  )
  (fc): Linear(in_features=512, out_features=1, bias=True)
)

In [26]:
def run_epoch(epoch, model, dataloaders, device, phase):
    running_loss = 0.0
    running_rmse = 0.0

    if phase == 'Train':
        model.train()
    else:
        model.eval()

    # Looping through batches
    for i, (inputs, labels) in enumerate(dataloaders[phase]):
    
        # ensures we're doing this calculation on our GPU if possible
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero parameter gradients
        optimizer.zero_grad()
    
        # Calculate gradients only if we're in the training phase
        with torch.set_grad_enabled(phase == 'Train'):
      
            # This calls the forward() function on a batch of inputs
            outputs = model(inputs)

            # Calculate the loss of the batch
            loss = criterion(outputs, labels)
            rmse = np.sqrt(loss.item())

            # Adjust weights through backpropagation if we're in training phase
            if phase == 'Train':
                loss.backward()
                optimizer.step()

        # Document statistics for the batch
        running_loss += loss.item() * inputs.size(0)
        running_rmse += rmse * inputs.size(0)
    
    # Calculate epoch statistics
    epoch_loss = running_loss / datasets[phase].__len__()
    epoch_acc = running_rmse / datasets[phase].__len__()

    return epoch_loss, epoch_acc

In [27]:
def train(model, criterion, optimizer, num_epochs, dataloaders, device):
    start = time.time()

    best_model_wts = model.state_dict()
    best_acc = 0.0
    
    print('| Epoch\t | Train Loss\t| Train Acc\t| Valid Loss\t| Valid Acc\t| Epoch Time |')
    print('-' * 86)
    
    # Iterate through epochs
    for epoch in range(num_epochs):
        
        epoch_start = time.time()
       
        # Training phase
        train_loss, train_acc = run_epoch(epoch, model, dataloaders, device, 'Train')
        
        # Validation phase
        val_loss, val_acc = run_epoch(epoch, model, dataloaders, device, 'Validation')
        
        epoch_time = time.time() - epoch_start
           
        # Print statistics after the validation phase
        print("| {}\t | {:.4f}\t| {:.4f}\t| {:.4f}\t| {:.4f}\t| {:.0f}m {:.0f}s     |"
                      .format(epoch + 1, train_loss, train_acc, val_loss, val_acc, 
                              epoch_time // 60, epoch_time % 60))

        # Copy and save the model's weights if it has the best accuracy thus far
        if val_acc > best_acc:
            best_acc = val_acc
            best_model_wts = model.state_dict()

    total_time = time.time() - start
    
    print('-' * 74)
    print('Training complete in {:.0f}m {:.0f}s'.format(total_time // 60, total_time % 60))
    print('Best validation RMSE: {:.4f}'.format(best_acc))

    # load best model weights and return them
    model.load_state_dict(best_model_wts)
    return model

In [28]:
model = train(model, criterion, optimizer, epochs, dataloaders, device)

| Epoch	 | Train Loss	| Train Acc	| Valid Loss	| Valid Acc	| Epoch Time |
--------------------------------------------------------------------------------------
| 1	 | 0.2026	| 0.4500	| 0.2445	| 0.4945	| 0m 17s     |
| 2	 | 0.1879	| 0.4335	| 0.2462	| 0.4962	| 0m 17s     |
| 3	 | 0.2067	| 0.4546	| 0.2447	| 0.4947	| 0m 17s     |
| 4	 | 0.1783	| 0.4223	| 0.2420	| 0.4919	| 0m 17s     |
| 5	 | 0.1861	| 0.4310	| 0.2420	| 0.4919	| 0m 17s     |
| 6	 | 0.1737	| 0.4167	| 0.2460	| 0.4960	| 0m 18s     |
| 7	 | 0.2037	| 0.4510	| 0.2485	| 0.4985	| 0m 21s     |
| 8	 | 0.2082	| 0.4554	| 0.2459	| 0.4959	| 0m 18s     |
| 9	 | 0.1703	| 0.4110	| 0.2452	| 0.4952	| 0m 17s     |
| 10	 | 0.1777	| 0.4192	| 0.2476	| 0.4976	| 0m 17s     |
--------------------------------------------------------------------------
Training complete in 2m 57s
Best validation RMSE: 0.4985


### Testing model

In [29]:
test = AvitoDataset(test_df, data_root, image_root, image_feature_extractor, torch_transforms, testing=True)

In [30]:
test_loader = DataLoader(test, batch_size=1, shuffle=False, num_workers = 1)

In [31]:
def test(model, dataloader, device):
    preds = []
    model.eval()

    for i, (inputs, labels) in enumerate(dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs[0].item())
    
    return preds

In [32]:
preds = test(model, test_loader, device)

In [None]:
subm = pd.read_csv(os.path.join(data_root, 'sample_submission.csv'))
subm['deal_probability'] = preds
subm.to_csv('submission.csv', index=False)