## SentenceBERT Notebook

In [None]:
import pandas as pd
# Load the train and test CSVs, in that order, with default read_csv settings.
train_df, test_df = map(
    pd.read_csv,
    ('../input/disaster/train_pp.csv', '../input/disaster/test_pp.csv'),
)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Mean pooling: average the token embeddings, weighted by the attention mask,
# so positions where the mask is 0 do not contribute to the average.
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    ``model_output[0]`` is taken as the token embeddings. ``attention_mask``
    gets a trailing axis, is expanded to the embeddings' shape and cast to
    float. The divisor is clamped to at least 1e-9, so a row whose mask is
    all zeros yields zeros rather than a division by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_counts


# Sentences we want sentence embeddings for. Train rows come first, then test
# rows, so the stacked result can be split back at len(train_df).
sentences = list(train_df['text']) + list(test_df['text'])

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Encode in fixed-size chunks instead of one forward pass over the whole
# corpus, so peak memory is bounded by the chunk size rather than the dataset
# size. Padding is applied per chunk; mean_pooling ignores masked positions,
# so the pooled vectors should match the single-pass result up to
# floating-point rounding.
embed_batch_size = 64
embedding_chunks = []
with torch.no_grad():
    for start in range(0, len(sentences), embed_batch_size):
        chunk = sentences[start:start + embed_batch_size]

        # Tokenize sentences
        encoded_input = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings
        model_output = model(**encoded_input)

        # Perform pooling
        embedding_chunks.append(mean_pooling(model_output, encoded_input['attention_mask']))

# Stack the per-chunk results back into one (num_sentences, dim) tensor.
sentence_embeddings = torch.cat(embedding_chunks, dim=0)

# Normalize embeddings (unit L2 norm per row)
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

In [None]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [None]:
# Split the stacked embeddings back into their train and test portions. The
# train sentences were placed first, so the boundary is the number of train rows.
n_train = len(train_df)
train_X, test_X = sentence_embeddings[:n_train], sentence_embeddings[n_train:]

# Training labels as a NumPy array.
y = np.array(train_df['label'])

In [None]:
# Mini-batch size used when building the training DataLoader.
batch_size = 128
# Dropout probability.
# NOTE(review): this value is not passed to SentenceNet in the visible code,
# which is built with its own 0.25 -- confirm whether it is meant to be wired in.
dropout = 0.25

In [None]:
# Hold out 10% of the labelled rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(train_X, y, test_size=0.1, random_state=42)

# torch.as_tensor accepts both tensors and ndarrays, so it works whichever
# type train_test_split hands back and avoids the "copy construct from a
# tensor" warning that torch.tensor(existing_tensor) raises.
X_tr = torch.as_tensor(X_train, dtype=torch.float)
# Class-index targets must be int64 for the classification loss.
y_tr = torch.as_tensor(y_train, dtype=torch.long)
train = TensorDataset(X_tr, y_tr)
# Reshuffle every epoch so mini-batches are not identical from epoch to epoch.
trainloader = DataLoader(train, batch_size=batch_size, shuffle=True)

X_te = torch.as_tensor(X_test, dtype=torch.float)
y_te = torch.as_tensor(y_test, dtype=torch.long)
test = TensorDataset(X_te, y_te)
# Default DataLoader settings: one sample per batch, original order.
testloader = DataLoader(test)

In [None]:
class SentenceNet(nn.Module):
    """Feed-forward classifier over fixed-size sentence embeddings.

    Architecture: three ``Linear -> ReLU -> BatchNorm1d -> Dropout`` stages
    followed by a final ``Linear`` layer whose output is passed through
    ``log_softmax``. The forward pass therefore returns log-probabilities.

    Args:
        in_features: Size of each input embedding vector.
        hidden_size: Width of every hidden layer.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after each batch-norm layer.

    The defaults reproduce the original hard-coded configuration
    (384 -> 500 -> 500 -> 500 -> 2, dropout 0.25), and the attribute names are
    unchanged so existing ``state_dict`` keys still match.
    """

    def __init__(
        self,
        in_features: int = 384,
        hidden_size: int = 500,
        num_classes: int = 2,
        dropout: float = 0.25,
    ) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_size)
        self.hidden1 = nn.Linear(hidden_size, hidden_size)
        self.hidden2 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        # A single Dropout module is reused after every hidden stage.
        self.dropout = nn.Dropout(dropout)

        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)
        self.batchnorm3 = nn.BatchNorm1d(hidden_size)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of embeddings."""
        stages = (
            (self.fc1, self.batchnorm1),
            (self.hidden1, self.batchnorm2),
            (self.hidden2, self.batchnorm3),
        )
        for linear, batchnorm in stages:
            # Same order as the original: ReLU(linear) -> batch norm -> dropout.
            x = self.dropout(batchnorm(F.relu(linear(x))))
        return F.log_softmax(self.fc2(x), dim=1)

# Instantiate the classifier; the optimizer, training loop and test inference
# later in this file all operate on this instance.
net = SentenceNet()

In [None]:
# Learning rate handed to the Adam optimizer.
learning_rate = 0.00001

In [None]:
# Loss function.
# SentenceNet.forward already returns log-probabilities (it ends in
# log_softmax), so the matching criterion is NLLLoss. CrossEntropyLoss would
# apply log_softmax a second time; that is a mathematical no-op on
# log-probabilities, so the loss value is unchanged, but the extra pass is
# redundant.
criterion = nn.NLLLoss()

# Create the optimizer over all of the network's parameters.
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

In [None]:
# Number of full passes over the training DataLoader.
num_epochs = 50

In [None]:
# Per-step training loss history: one float per optimizer step, across all epochs.
training_loss = []
for epoch in range(num_epochs):
    # Training mode: dropout active, batch norm uses per-batch statistics.
    net.train()
    for i, (inputs, labels) in enumerate(trainloader):
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        # .item() extracts a plain Python float, so neither the history nor
        # the log line keeps a reference to the autograd graph.
        step_loss = loss.item()
        training_loss.append(step_loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log every 25th step.
        if (i+1) % 25 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, i+1, len(trainloader), step_loss))

    # Validation on the held-out split after every epoch. Eval mode switches
    # dropout off and makes batch norm use its running statistics; no_grad
    # skips building an autograd graph that would never be back-propagated.
    net.eval()
    with torch.no_grad():
        outputs = net(X_te)

    # Predicted class = index of the largest output per row.
    _, predicted = torch.max(outputs, 1)

    total = y_te.size(0)
    correct = (predicted == y_te).sum().item()

    print(f'Accuracy of the model is: {100*correct/total:.2f}%')

In [None]:
# Testing: predict a class for every test-set embedding.
net.eval()
# Inference only, so disable autograd to avoid building an unused graph.
with torch.no_grad():
    outputs_test = net(test_X)

# Predicted class = index of the largest output per row.
_, predicted_test = torch.max(outputs_test, 1)

In [None]:
# Columns of the submission table: the test-row ids and the predicted class
# for each row, both converted to NumPy arrays.
data = dict(
    id=np.array(test_df['id']),
    target=np.array(predicted_test),
)

In [None]:
# Build the submission table and write it as UTF-8 CSV without the index column.
submission_path = 'submission_sen_emb.csv'
df_submission = pd.DataFrame(data)
df_submission.to_csv(submission_path, index=False, encoding='utf-8')