In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import datasets
from datasets import Dataset

import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import nltk
from tqdm import tqdm

from collections import Counter
from typing import List, Callable
import string
import torch.nn.functional as nnf
import json

import seaborn
# Apply seaborn's global plot styling, using the 'summer' colour palette.
seaborn.set(palette='summer')

In [2]:
# Read the Avito ads table from the Kaggle working directory into a DataFrame.
train_data = pd.read_csv(
    filepath_or_buffer='/kaggle/working/avito_dataset.csv',
)

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [7]:
# Store the character length of every description before the column is renamed.
train_data['str_len'] = train_data['description'].str.len()

# Rename all three columns in one pass; names missing from the frame are ignored by rename.
train_data.rename(
    columns={
        'Unnamed: 0': 'idxs',
        'description': 'text',
        'is_bad': 'label',
    },
    inplace=True,
)

In [8]:
# Hold out the last 64,000 rows as the test split and wrap both splits as
# `datasets.Dataset` objects (the `Dataset` name bound last in the import cell).
# The cell that builds the WordDataset / DataLoader objects reads `train_d`
# and `test_d`, so these lines must run for the notebook to work on a fresh kernel.
data_dict = train_data[:-64000][['text', 'label']].to_dict('list')
test_dict = train_data[-64000:][['text', 'label']].to_dict('list')
train_d = Dataset.from_dict(data_dict)
test_d = Dataset.from_dict(test_dict)

In [9]:
# Load the index -> word mapping from its JSON file.
# NOTE(review): JSON object keys are always strings, so the keys loaded here are
# str (e.g. '0'), unlike the int keys of the in-memory mapping built elsewhere in
# this notebook — confirm what callers expect.
with open('/kaggle/input/d/emilte/dictionaries-avito/ind2word.json') as json_file:
    ind2word = json.loads(json_file.read())

In [10]:
# Load the word -> index mapping from its JSON file.
with open('/kaggle/input/d/emilte/dictionaries-avito/word2ind.json') as json_file:
    word2ind = json.loads(json_file.read())

In [11]:
# `vocab` is a live view over the keys of word2ind; the model-construction cells
# visible in this notebook only use len(vocab) to size the embedding table.
vocab = word2ind.keys()

In [14]:
# Count how often every token occurs in the corpus.
words = Counter()

# Translation table that deletes ASCII punctuation; built once instead of per row.
punctuation_table = str.maketrans('', '', string.punctuation)

for example in tqdm(train_data['text']):
    # Lowercase the text and strip punctuation before tokenizing.
    processed_text = example.lower().translate(punctuation_table)
    words.update(word_tokenize(processed_text))


# The four service tokens are always part of the vocabulary; a regular token is
# kept only if it occurs more than `counter_threshold` times.
vocab = set(['<unk>', '<bos>', '<eos>', '<pad>'])
counter_threshold = 128

vocab.update(word for word, cnt in words.items() if cnt > counter_threshold)

print(f'Размер словаря: {len(vocab)}')

# Number the tokens in sorted order. Iterating a set of strings directly gives an
# order that can change between interpreter runs (string hashing is randomized),
# which would silently re-map every index relative to previously saved
# dictionaries and trained embeddings.
word2ind = {word: i for i, word in enumerate(sorted(vocab))}
ind2word = {i: word for word, i in word2ind.items()}

In [15]:
# Write the word -> index mapping to word2ind.json in the current working directory.
with open('word2ind.json', 'w') as fp:
    fp.write(json.dumps(word2ind))

In [16]:
# Write the index -> word mapping to ind2word.json in the current working directory.
# JSON has no integer keys, so the int indices are written as strings.
with open('ind2word.json', 'w') as fps:
    fps.write(json.dumps(ind2word))

In [12]:
class WordDataset:
    """Map-style dataset that converts raw texts into lists of vocabulary ids.

    Every element of ``sentences`` is indexed with ``['text']`` and ``['label']``.
    The ids of the four service tokens are read from the module-level ``word2ind``
    mapping when the dataset is constructed; regular tokens are looked up in
    ``word2ind`` on every access.
    """

    # Deletes ASCII punctuation; built once for the class instead of on every item access.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, sentences):
        """Store ``sentences`` and cache the ids of the service tokens."""
        self.data = sentences
        self.unk_id = word2ind['<unk>']
        self.bos_id = word2ind['<bos>']
        self.eos_id = word2ind['<eos>']
        self.pad_id = word2ind['<pad>']

    def __getitem__(self, idx: int) -> dict:
        """Return ``{'text': [bos, token ids..., eos], 'label': label}`` for row ``idx``.

        The text is lowercased and stripped of ASCII punctuation before it is
        tokenized; tokens missing from ``word2ind`` map to the ``<unk>`` id.
        """
        # Fetch the row once; the original indexed self.data[idx] separately for text and label.
        sample = self.data[idx]
        processed_text = sample['text'].lower().translate(self._PUNCTUATION_TABLE)

        token_ids = [self.bos_id]
        token_ids.extend(
            word2ind.get(word, self.unk_id) for word in word_tokenize(processed_text)
        )
        token_ids.append(self.eos_id)

        return {"text": token_ids, "label": sample['label']}

    def __len__(self) -> int:
        """Number of rows in the wrapped collection."""
        return len(self.data)


def collate_fn_with_padding(
    input_batch: List[dict], pad_id=None, max_len=1024) -> dict:
    """Pad (or truncate) a list of samples to a common length and tensorize them.

    Args:
        input_batch: samples shaped like ``{'text': [token ids], 'label': label}``.
        pad_id: id appended to short sequences. When omitted, the current value of
            ``word2ind['<pad>']`` is looked up at call time, so the function keeps
            working if ``word2ind`` is rebuilt or reloaded after this definition.
        max_len: upper bound on the padded length; longer sequences are cut off
            at this many tokens.

    Returns:
        ``{'input_ids': LongTensor [batch, seq_len], 'label': LongTensor [batch]}``,
        both moved to the module-level ``device``.

    Raises:
        ValueError: if ``input_batch`` is empty.
    """
    if pad_id is None:
        pad_id = word2ind['<pad>']
    if not input_batch:
        raise ValueError("input_batch must contain at least one sample")

    longest = max(len(sample['text']) for sample in input_batch)
    max_seq_len = min(longest, max_len)

    padded = []
    for sample in input_batch:
        # Build a new list so the caller's sample dicts are left untouched.
        tokens = sample['text'][:max_seq_len]
        padded.append(tokens + [pad_id] * (max_seq_len - len(tokens)))

    sequences = torch.LongTensor(padded).to(device)
    labels = torch.LongTensor([sample['label'] for sample in input_batch]).to(device)

    return {
        'input_ids': sequences,
        'label': labels
    }

In [13]:
# Seeded generator so the evaluation / debug subsets are the same on every run.
subset_rng = np.random.default_rng(42)

train_dataset = WordDataset(train_d)

# Sample without replacement (np.random.choice defaults to replace=True, which
# puts duplicate rows into the evaluation subset). The size is capped so a test
# split smaller than the requested subset does not raise.
idx = subset_rng.choice(len(test_d), size=min(32000, len(test_d)), replace=False)
eval_dataset = WordDataset(test_d.select(idx))

# Same rows as the training set, but served in order (see pred_dataloader below).
pred_dataset = WordDataset(train_d)

# Tiny subset of the test split for quick smoke checks.
idx2 = subset_rng.choice(len(test_d), size=min(64, len(test_d)), replace=False)
debug_dataset = WordDataset(test_d.select(idx2))

batch_size = 32


def _make_loader(dataset, shuffle):
    """Build a DataLoader with the notebook's shared collate function and batch size."""
    return DataLoader(
        dataset, shuffle=shuffle, collate_fn=collate_fn_with_padding, batch_size=batch_size)


# Only the training loader shuffles; the others keep dataset order.
train_dataloader = _make_loader(train_dataset, shuffle=True)
eval_dataloader = _make_loader(eval_dataset, shuffle=False)
pred_dataloader = _make_loader(pred_dataset, shuffle=False)
debug_dataloader = _make_loader(debug_dataset, shuffle=False)

In [14]:
def evaluate(model, eval_dataloader) -> float:
    """
    Calculate classification accuracy of `model` over `eval_dataloader`.

    Every batch is a mapping with 'input_ids' (model input) and 'label' (targets).
    The model is switched to eval mode for the duration of the call, so layers
    such as dropout are inactive even if the caller forgot `model.eval()`, and
    its previous train/eval mode is restored afterwards.

    Returns:
        Fraction of samples whose argmax prediction equals the label.
    """

    was_training = model.training
    model.eval()

    predictions = []
    target = []
    try:
        with torch.no_grad():
            for batch in eval_dataloader:
                logits = model(batch['input_ids'])
                predictions.append(logits.argmax(dim=1))
                target.append(batch['label'])
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    predictions = torch.cat(predictions)
    target = torch.cat(target)
    accuracy = (predictions == target).float().mean().item()

    return accuracy

In [15]:
def predict(model, eval_dataloader) -> torch.Tensor:
    """
    Return the predicted class index for every sample in `eval_dataloader`.

    Every batch is a mapping whose 'input_ids' entry is fed to the model. The
    model is switched to eval mode for the duration of the call (so dropout is
    inactive) and its previous train/eval mode is restored afterwards.

    Returns:
        1-D tensor of argmax class indices, concatenated in dataloader order.
    """

    was_training = model.training
    model.eval()

    predictions = []
    try:
        with torch.no_grad():
            for batch in eval_dataloader:
                logits = model(batch['input_ids'])
                predictions.append(logits.argmax(dim=1))
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    predictions = torch.cat(predictions)

    return predictions

In [16]:
def predict_proba(model, pred_dataloader) -> torch.Tensor:
    """
    Return the softmax probability of class 1 for every sample in `pred_dataloader`.

    Every batch is a mapping whose 'input_ids' entry is fed to the model. The
    model is switched to eval mode for the duration of the call (so dropout does
    not add noise to the probabilities) and its previous train/eval mode is
    restored afterwards.

    Returns:
        1-D tensor of class-1 probabilities, concatenated in dataloader order.
    """

    was_training = model.training
    model.eval()

    predictions = []
    try:
        with torch.no_grad():
            for batch in pred_dataloader:
                logits = model(batch['input_ids'])
                probs = nnf.softmax(logits, dim=1)
                predictions.append(probs[:,1]) # BINARY - change for num_cat
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    predictions = torch.cat(predictions)

    return predictions

In [17]:
def train(model: Callable,
          train_loader: DataLoader,
          eval_loader: DataLoader,
          num_epochs: int = 2,
          optimizer: torch.optim.Optimizer = None,
          criterion = None,
          scheduler: Callable = None,
          device: str = 'cuda'):
    """Train `model` on `train_loader`, periodically measuring accuracy on `eval_loader`.

    Args:
        model: network called as ``model(batch['input_ids'])``; must return logits.
        train_loader: iterable of batches with 'input_ids' and 'label'; must support ``len``.
        eval_loader: passed unchanged to ``evaluate``.
        num_epochs: number of passes over `train_loader`.
        optimizer: defaults to ``torch.optim.Adam(model.parameters())``.
        criterion: defaults to a plain ``nn.CrossEntropyLoss()``.
        scheduler: optional object whose ``step()`` is called once per epoch.
        device: accepted for interface compatibility; not used inside this
            function (batches arrive from the loader already placed on a device).

    Returns:
        ``(acc, losses)``: accuracies from every evaluation, and the mean training
        loss of each epoch.
    """

    # Evaluation runs on batch 0 and then every `eval_steps` batches. The interval
    # is derived from the loader that was passed in (not a notebook global) and is
    # kept >= 1 so a loader with a single batch does not cause a modulo-by-zero.
    eval_steps = max(1, len(train_loader) // 2)

    # Defaults used when the caller does not supply its own objects.
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters())

    if criterion is None:
        # No ignore_index: the targets are class labels, not token ids, so passing
        # the <pad> vocabulary id would silently drop a class whenever the two
        # numbers coincide.
        criterion = nn.CrossEntropyLoss()

    acc = []
    losses = []

    for epoch in range(num_epochs):
        epoch_losses = []
        model.train()

        for i, batch in enumerate(tqdm(train_loader, desc=f'Training epoch {epoch}:')):
            optimizer.zero_grad()
            logits = model(batch['input_ids'])
            loss = criterion(logits, batch['label'])
            loss.backward()
            optimizer.step()

            epoch_losses.append(loss.item())

            if i % eval_steps == 0:
                model.eval()
                acc.append(evaluate(model, eval_loader))
                model.train()

        # Apply the scheduler, if one was given, once per epoch.
        if scheduler is not None:
            scheduler.step()

        # Record the mean loss of THIS epoch (previously only the last epoch was
        # recorded, and num_epochs=0 raised a NameError).
        if epoch_losses:
            losses.append(sum(epoch_losses) / len(epoch_losses))

    print ("Accuracy:", acc)
    print ("Losses:", losses)
    return acc, losses

In [18]:
class CharLM(nn.Module):
    """Bidirectional-LSTM sequence classifier over token ids.

    Pipeline: embedding -> bidirectional LSTM -> pooling over the sequence axis
    ('max' or 'mean') -> tanh -> linear -> ReLU -> dropout -> tanh -> linear
    projection to `num_classes` logits.
    """

    def __init__(
        self, hidden_dim: int, vocab_size: int, num_classes: int = 2,
        aggregation_type: str = 'max'
        ):
        """
        Args:
            hidden_dim: embedding size and LSTM hidden size (per direction).
            vocab_size: number of rows in the embedding table.
            num_classes: number of output logits.
            aggregation_type: 'max' or 'mean' pooling over the sequence axis.

        Raises:
            ValueError: if `aggregation_type` is not 'max' or 'mean'.
        """
        super().__init__()
        # Fail at construction time rather than at the first forward pass.
        if aggregation_type not in ('max', 'mean'):
            raise ValueError("Invalid aggregation_type")

        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.rnn = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        # The LSTM is bidirectional, so its per-step output has 2 * hidden_dim features.
        self.linear = nn.Linear(hidden_dim*2, hidden_dim)
        self.projection = nn.Linear(hidden_dim, num_classes)

        self.non_lin = nn.Tanh()
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.15)

        self.aggregation_type = aggregation_type

    def forward(self, input_batch) -> torch.Tensor:
        """Map a [batch_size, seq_len] tensor of token ids to [batch_size, num_classes] logits."""
        embeddings = self.embedding(input_batch)  # [batch_size, seq_len, hidden_dim]
        output, _ = self.rnn(embeddings)  # [batch_size, seq_len, 2 * hidden_dim]

        # NOTE(review): pooling covers every time step of the input, with no mask
        # for padded positions.
        if self.aggregation_type == 'max':
            output = output.max(dim=1)[0]  # [batch_size, 2 * hidden_dim]
        elif self.aggregation_type == 'mean':
            output = output.mean(dim=1)  # [batch_size, 2 * hidden_dim]
        else:
            # Kept for the case where the attribute is changed after construction.
            raise ValueError("Invalid aggregation_type")

        output = self.dropout(self.relu(self.linear(self.non_lin(output))))  # [batch_size, hidden_dim]
        prediction = self.projection(self.non_lin(output))  # [batch_size, num_classes]

        return prediction

In [None]:
# Classifier with mean pooling, sized to the current vocabulary.
model = CharLM(hidden_dim=256, vocab_size=len(vocab), aggregation_type = 'mean').to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# Plain cross-entropy over the class labels. `ignore_index` is compared against
# TARGET values; the targets here are class labels, not token ids, so passing the
# <pad> vocabulary id would silently drop a class whenever the two numbers coincide.
criterion = nn.CrossEntropyLoss()

In [None]:
# Run two epochs of training; the returned (acc, losses) pair is the cell's output.
train(
    model,
    train_dataloader,
    eval_dataloader,
    num_epochs=2,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
)

In [None]:
# Compile the trained model to TorchScript, then write the scripted module to disk.
model_scripted = torch.jit.script(model)
model_scripted.save(f='cpu2_model_scripted.pt')

In [None]:
# Save only the parameters (state_dict); loading requires re-creating CharLM first.
torch.save(obj=model.state_dict(), f='cpu2_model.pt')

In [19]:
# From here on everything runs on the CPU; this rebinds the notebook-level
# `device` that the collate function reads when it moves batches.
device = torch.device('cpu')
model2 = CharLM(hidden_dim=256, vocab_size=len(vocab), aggregation_type = 'mean').to(device)

# A freshly constructed nn.Module is in training mode; switch to eval mode so
# dropout is disabled for the evaluation / probability cells that use model2.
model2.eval()

# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model2.load_state_dict(torch.load('/kaggle/input/lstm-avito/pytorch/mb_working/1/cpu2_model.pt', map_location=device))

<All keys matched successfully>

In [26]:
# Quick accuracy check of the restored model on the small debug subset.
evaluate(model=model2, eval_dataloader=debug_dataloader)

0.9375

In [25]:
# Class-1 probability for every row served by pred_dataloader, in loader order.
probs = predict_proba(model=model2, pred_dataloader=pred_dataloader)

In [31]:
# Independent copy of every row except the last 64,000.
dataset = train_data.iloc[:-64000].copy()

In [32]:
# Convert the torch tensor to a NumPy array before handing it to pandas: this
# yields a plain numeric column and also works if `probs` lives on a GPU.
dataset['probs'] = probs.detach().cpu().numpy()

In [None]:
import re  # used below; the notebook's import cell does not import it


def _find_spans(pattern, text):
    """Return the [start, end] offsets of every match of `pattern` in `text`."""
    # Non-string cells (e.g. NaN for an empty description) have no matches.
    if not isinstance(text, str):
        return []
    return [[m.start(), m.end()] for m in re.finditer(pattern, text)]


# Patterns for the three kinds of contact details searched for in the ad text.
phone_pattern = r'(?:8|\+|9)[\- ]?(?:\(?\d{3}\)?[\- ]?)?[\d\- ]{7,10}'
# NOTE(review): this pattern requires a literal space after the dot and a '/'
# after the domain — confirm that is intended.
link_mail_pattern = r'(https?://)?\s*([a-zA-Z0-9-]+\. (ru|com|org|me))\s*/\s*([a-zA-Z0-9/]*)'
nick_pattern = r'@[\w]+'

# Columns are addressed by name. The previous hard-coded positions (2, 9, 10, 13)
# only matched one particular column layout.
# NOTE(review): assumes the old position 2 was the 'text' column — confirm.
texts = dataset['text']
dataset['phones_loc'] = [_find_spans(phone_pattern, text) for text in texts]
dataset['link_mail_loc'] = [_find_spans(link_mail_pattern, text) for text in texts]
dataset['nick_loc'] = [_find_spans(nick_pattern, text) for text in texts]

dataset['link_mail_count'] = [len(i) for i in dataset['link_mail_loc']]
dataset['phone_count'] = [len(i) for i in dataset['phones_loc']]
dataset['nick_count'] = [len(i) for i in dataset['nick_loc']]

# 'start' / 'end' receive the offsets of the first detected contact. Create the
# columns when the frame does not have them yet; existing values are kept for
# rows without any match.
for span_col in ('start', 'end'):
    if span_col not in dataset.columns:
        dataset[span_col] = np.nan

# Priority per row: first phone match, else first nickname, else first link/mail.
first_spans = [
    (phones or nicks or links or [None])[0]
    for phones, nicks, links in zip(
        dataset['phones_loc'], dataset['nick_loc'], dataset['link_mail_loc'])
]
has_span = np.array([span is not None for span in first_spans], dtype=bool)

if has_span.any():
    dataset.loc[has_span, 'start'] = [span[0] for span in first_spans if span is not None]
    dataset.loc[has_span, 'end'] = [span[1] for span in first_spans if span is not None]

In [37]:
# Write the enriched table to the Kaggle working directory.
# NOTE(review): the DataFrame index is written as an extra unnamed column, and
# this is the same path the notebook reads its input from — confirm both are intended.
dataset.to_csv(path_or_buf='/kaggle/working/avito_dataset.csv')

In [40]:
import os
from IPython.display import FileLink

# Move into the working directory so the relative link below resolves, then
# render a clickable download link as the cell's output.
os.chdir(r'../working')
FileLink(path=r'avito_dataset.csv')