In [1]:
import pandas as pd

In [2]:
# Загружаем данные
df_train = pd.read_csv('data/train.csv', index_col=0)
df_prediction = pd.read_csv('data/test.csv', index_col=0)

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41159 entries, 0 to 41156
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       41158 non-null  object
 1   Sentiment  41155 non-null  object
dtypes: object(2)
memory usage: 964.7+ KB


In [4]:
df_prediction.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3798 entries, 787bc85b-20d4-46d8-84a0-562a2527f684 to 8d09ea68-a130-4f3a-8777-f821b354542d
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    3798 non-null   object
dtypes: object(1)
memory usage: 59.3+ KB


In [5]:
# Удаляем строки с пропущенными метками классов
df_train.dropna(inplace=True)

In [6]:
df_train = df_train.rename(columns={'Sentiment': 'Y', 'Text': 'X'})
df_prediction = df_prediction.rename(columns={'Text': 'X'})

In [7]:
df_train.head()

Unnamed: 0,X,Y
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [8]:
# посмотрим балланс классов
df_train.Y.value_counts()

Y
Positive              11422
Negative               9917
Neutral                7711
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

Пока только отметим что выборку нельзя назвать сбалансированной (разница более чем 2 раза). Потом можно попробовать сбалансировать ее для улучшения результата, но для начала возьмем как есть

In [9]:
# кодируем признки
# для начала попробуем Lable Encoding
df_train.Y.unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [10]:
cls_map = {'Extremely Negative': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3, 'Extremely Positive' :4}
target_classes = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
df_train.Y.replace(cls_map, inplace=True)

In [11]:
# Токенезируем тексты
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_train, test_size=0.2, random_state=42)

TOKENIZER = get_tokenizer('basic_english')

def build_vocabulary(datasets):
    for dataset in datasets:
        for index, row in dataset.iterrows():
            yield (TOKENIZER(row['X']))

# отправляем в токенизатор также df_prediction на тот случай, если там есть слова, которые не встречались в df_train
# хотя, в этом случае для них все равно не получится определить ebeddings

vocab = build_vocab_from_iterator(build_vocabulary([train_df, test_df, df_prediction]), min_freq=1, specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# проверим что получилось
tokens = TOKENIZER('some text for test of tokenezation')
indexes = vocab(tokens)
print(indexes, tokens)

[93, 2135, 13, 606, 9, 0] ['some', 'text', 'for', 'test', 'of', 'tokenezation']


In [13]:
from torch.utils.data import Dataset, DataLoader
import torch


class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sample = self.dataframe.iloc[idx]
        features = sample['X']  # Assuming 'X' is your features column
        target = sample['Y']  # Assuming 'Y' is your target column
        return  target, features

train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

max_words = 40

def vectorize_batch(batch):
    Y, X = list(zip(*batch))
    X = [vocab(TOKENIZER(text)) for text in X]
    X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.
    return torch.tensor(X, dtype=torch.int32, device='cpu'), torch.tensor(Y, device='cpu') 

train_loader = DataLoader(train_dataset, batch_size=1024, collate_fn=vectorize_batch, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, collate_fn=vectorize_batch, shuffle=True)

In [14]:
# for x,y in train_dataset:
#     print(x)
#     print(y)
#     break

In [15]:
for X, Y in train_loader:
    print(X.shape, Y.shape)
    break

torch.Size([1024, 40]) torch.Size([1024])


## Define RNN Classification Network

In this section, we have created a neural network that we'll be using for the text classification task. The network consists of 3 layers.

Embeddings layer

RNN layer

Linear layer

In [16]:
from torch import nn
from torch.nn import functional as F

embed_len = 50
hidden_dim = 50
n_layers=1

class RNNClassifier(nn.Module):
    def __init__(self):
        super(RNNClassifier, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_len)
        self.rnn = nn.RNN(input_size=embed_len, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, len(target_classes))

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.rnn(embeddings, torch.randn(n_layers, len(X_batch), hidden_dim))
        return self.linear(output[:,-1])

In [17]:
rnn_classifier = RNNClassifier()
rnn_classifier

RNNClassifier(
  (embedding_layer): Embedding(97236, 50)
  (rnn): RNN(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=5, bias=True)
)

In [18]:
for layer in rnn_classifier.children():
    print("Layer : {}".format(layer))
    print("Parameters : ")
    for param in layer.parameters():
        print(param.shape)
    print()

Layer : Embedding(97236, 50)
Parameters : 
torch.Size([97236, 50])

Layer : RNN(50, 50, batch_first=True)
Parameters : 
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50])

Layer : Linear(in_features=50, out_features=5, bias=True)
Parameters : 
torch.Size([5, 50])
torch.Size([5])



In [19]:
out = rnn_classifier(torch.randint(0, len(vocab), (1024, max_words)))
out.shape

torch.Size([1024, 5])

## Train Network

In [20]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    with torch.no_grad():
        Y_shuffled, Y_preds, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_shuffled.append(Y)
            Y_preds.append(preds.argmax(dim=-1))

        Y_shuffled = torch.cat(Y_shuffled)
        Y_preds = torch.cat(Y_preds)

        print("Valid Loss : {:.3f}".format(torch.tensor(losses).mean()))
        print("Valid Acc  : {:.3f}".format(accuracy_score(Y_shuffled.detach().numpy(), Y_preds.detach().numpy())))


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

In [22]:
from torch.optim import Adam

epochs = 30
learning_rate = 1e-3

loss_fn = nn.CrossEntropyLoss()
rnn_classifier = RNNClassifier()
optimizer = Adam(rnn_classifier.parameters(), lr=learning_rate)

TrainModel(rnn_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)

100%|██████████| 33/33 [00:04<00:00,  7.10it/s]


Train Loss : 1.584
Valid Loss : 1.568
Valid Acc  : 0.269


100%|██████████| 33/33 [00:04<00:00,  7.32it/s]


Train Loss : 1.548
Valid Loss : 1.541
Valid Acc  : 0.274


100%|██████████| 33/33 [00:04<00:00,  7.33it/s]


Train Loss : 1.531
Valid Loss : 1.541
Valid Acc  : 0.294


100%|██████████| 33/33 [00:04<00:00,  7.40it/s]


Train Loss : 1.523
Valid Loss : 1.534
Valid Acc  : 0.293


100%|██████████| 33/33 [00:04<00:00,  7.54it/s]


Train Loss : 1.524
Valid Loss : 1.543
Valid Acc  : 0.292


100%|██████████| 33/33 [00:04<00:00,  7.41it/s]


Train Loss : 1.508
Valid Loss : 1.533
Valid Acc  : 0.297


100%|██████████| 33/33 [00:04<00:00,  7.45it/s]


Train Loss : 1.502
Valid Loss : 1.514
Valid Acc  : 0.301


100%|██████████| 33/33 [00:04<00:00,  7.43it/s]


Train Loss : 1.488
Valid Loss : 1.522
Valid Acc  : 0.299


100%|██████████| 33/33 [00:04<00:00,  7.21it/s]


Train Loss : 1.471
Valid Loss : 1.528
Valid Acc  : 0.307


100%|██████████| 33/33 [00:04<00:00,  7.40it/s]


Train Loss : 1.451
Valid Loss : 1.516
Valid Acc  : 0.308


100%|██████████| 33/33 [00:04<00:00,  7.44it/s]


Train Loss : 1.418
Valid Loss : 1.507
Valid Acc  : 0.310


100%|██████████| 33/33 [00:04<00:00,  7.50it/s]


Train Loss : 1.384
Valid Loss : 1.518
Valid Acc  : 0.308


100%|██████████| 33/33 [00:04<00:00,  7.37it/s]


Train Loss : 1.360
Valid Loss : 1.512
Valid Acc  : 0.327


100%|██████████| 33/33 [00:04<00:00,  7.45it/s]


Train Loss : 1.324
Valid Loss : 1.517
Valid Acc  : 0.325


100%|██████████| 33/33 [00:04<00:00,  7.32it/s]


Train Loss : 1.283
Valid Loss : 1.593
Valid Acc  : 0.319


100%|██████████| 33/33 [00:04<00:00,  7.54it/s]


Train Loss : 1.267
Valid Loss : 1.563
Valid Acc  : 0.313


100%|██████████| 33/33 [00:04<00:00,  7.56it/s]


Train Loss : 1.224
Valid Loss : 1.558
Valid Acc  : 0.330


100%|██████████| 33/33 [00:04<00:00,  7.30it/s]


Train Loss : 1.198
Valid Loss : 1.617
Valid Acc  : 0.332


100%|██████████| 33/33 [00:04<00:00,  7.59it/s]


Train Loss : 1.176
Valid Loss : 1.541
Valid Acc  : 0.326


100%|██████████| 33/33 [00:04<00:00,  7.53it/s]


Train Loss : 1.137
Valid Loss : 1.598
Valid Acc  : 0.316


100%|██████████| 33/33 [00:04<00:00,  7.48it/s]


Train Loss : 1.115
Valid Loss : 1.597
Valid Acc  : 0.337


100%|██████████| 33/33 [00:04<00:00,  7.45it/s]


Train Loss : 1.084
Valid Loss : 1.635
Valid Acc  : 0.336


100%|██████████| 33/33 [00:04<00:00,  7.51it/s]


Train Loss : 1.046
Valid Loss : 1.748
Valid Acc  : 0.322


100%|██████████| 33/33 [00:04<00:00,  7.47it/s]


Train Loss : 1.030
Valid Loss : 1.657
Valid Acc  : 0.333


100%|██████████| 33/33 [00:04<00:00,  7.51it/s]


Train Loss : 0.999
Valid Loss : 1.740
Valid Acc  : 0.325


100%|██████████| 33/33 [00:04<00:00,  7.40it/s]


Train Loss : 0.982
Valid Loss : 1.713
Valid Acc  : 0.326


100%|██████████| 33/33 [00:04<00:00,  7.62it/s]


Train Loss : 0.956
Valid Loss : 1.710
Valid Acc  : 0.337


100%|██████████| 33/33 [00:04<00:00,  7.39it/s]


Train Loss : 0.936
Valid Loss : 1.769
Valid Acc  : 0.340


100%|██████████| 33/33 [00:04<00:00,  7.58it/s]


Train Loss : 0.906
Valid Loss : 1.816
Valid Acc  : 0.336


100%|██████████| 33/33 [00:04<00:00,  7.46it/s]


Train Loss : 0.908
Valid Loss : 1.740
Valid Acc  : 0.322
