In [15]:
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [7]:
# Read the training CSV, using its first column as the row index,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv', index_col=0)
df.head(n=5)

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
def clean(text):
    """Normalise a raw text string before tokenisation.

    Steps, in order: lower-case; drop URLs; replace a few HTML entities and
    ``<br />``; expand the stand-alone token ``u`` to ``you``; drop backticks;
    collapse repeated ``!``/``?``; remove numbers; drop non-ASCII characters;
    remove remaining HTML-like tags and all punctuation; collapse whitespace.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text, containing only ASCII word characters separated by
        single spaces (possibly the empty string).
    """
    text = text.lower()
    # Remove URLs wherever they occur. The previous pattern was anchored with
    # '^', so URLs in the middle of a line survived and were later mangled
    # into tokens such as "httptcoabc" by the punctuation strip.
    text = re.sub(r'https?://\S+', '', text)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"", texter)
    # &#39; is the HTML entity for an apostrophe, not a double quote.
    texter = re.sub('&#39;', "'", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u ', " you ", texter)
    texter = re.sub('`', "", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ', texter)

    # Remove numbers (optionally signed, optionally with a decimal part).
    texter = re.sub(r"[+-]?\d+(?:\.\d+)?", "", texter)
    # Drop every character that cannot be represented in ASCII.
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    # Strip any remaining <...> tags, then all punctuation.
    texter = re.sub(r'<.*?>', '', texter)
    texter = re.sub(r'[^\w\s]', '', texter)
    # The removals above can leave runs of whitespace and leading/trailing
    # blanks; normalise them so splitting on ' ' yields no empty tokens.
    texter = re.sub(r'\s+', ' ', texter).strip()
    return texter

In [8]:
# Path of the GloVe text file to load, and the vector width used elsewhere
# in the notebook (presumably the dimensionality of the vectors in that file).
glove_dir = './glove.6B.300d.txt'
input_dim = 300

# Map each token to its vector. Every line is split on whitespace: the first
# field is the token, the remaining fields are parsed as float32 values.
vocab = {}
with open(glove_dir, encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vocab[word] = np.asarray(values[1:], "float32")
# The with-block has already closed the file, so no explicit close() is needed.

print('Found %s word vectors.' %len(vocab))

Found 400000 word vectors.


In [9]:
# Number of rows produced per text by the "LSTM" layout of get_embeddings.
max_len = 50
# All-zero row of width input_dim, used to pad short sequences.
zero_padding = [0 for _ in range(input_dim)]

def get_embeddings(text, emb="LSTM"):
    """Convert a space-separated string into features looked up in ``vocab``.

    Tokens are obtained with ``text.split(' ')``; tokens that are not keys of
    the module-level ``vocab`` are ignored.

    Parameters
    ----------
    text : str
        Text to embed.
    emb : {"LSTM", "DNN"}, default "LSTM"
        Output layout.

        * ``"DNN"``: the mean of the vectors of all known tokens, as a 1-D
          float64 array of length ``input_dim`` (all zeros when no token is
          known).
        * ``"LSTM"``: a list of exactly ``max_len`` rows, holding the vectors
          of the first ``max_len`` known tokens followed by ``zero_padding``
          rows.

    Returns
    -------
    numpy.ndarray or list
        See ``emb``.

    Raises
    ------
    ValueError
        If ``emb`` is not ``"DNN"`` or ``"LSTM"``.
    """
    tokens = text.split(' ')

    if emb == "DNN":
        known = [vocab[word] for word in tokens if word in vocab]
        # Accumulate in a real array so the result type does not depend on
        # whether any token was found.
        embedding = np.zeros(input_dim, dtype=np.float64)
        for vector in known:
            embedding += vector
        if known:
            embedding /= len(known)
        return embedding

    if emb == "LSTM":
        embedding = []
        for word in tokens:
            if len(embedding) == max_len:
                # Truncate: only the first max_len known tokens are kept.
                break
            if word in vocab:
                embedding.append(vocab[word])
        # Pad short sequences up to max_len rows.
        embedding.extend([zero_padding] * (max_len - len(embedding)))
        return embedding

    # Previously an unknown layout fell through to an UnboundLocalError.
    raise ValueError(f"Unknown emb {emb!r}; expected 'DNN' or 'LSTM'")

In [10]:
def transform(X, emb="LSTM"):
    """Clean and embed every text in ``X``.

    Each item is passed through ``clean`` and then ``get_embeddings`` with the
    requested ``emb`` layout. Returns a list with one embedding per item, in
    input order.
    """
    return [get_embeddings(clean(raw_text), emb) for raw_text in X]

In [21]:
## Build the model inputs and labels
# One "DNN"-layout embedding per entry of the 'text' column.
cleaned_X = np.array(transform(df['text'], emb="DNN"))
# Labels come from the 'target' column.
y = np.array(df['target'])

In [None]:
## Sentence BERT

# sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
# sentence_embeddings = sbert_model.encode(cleaned_x, show_progress_bar=True)

In [12]:
# Hyperparameters (training configuration)
num_epochs, batch_size = 10, 256
learning_rate = 1e-4
dropout = 0.25

In [22]:
# Hold out 15% of the samples for evaluation; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(cleaned_X, y, test_size=0.15, random_state=42)

X_tr = torch.tensor(X_train, dtype=torch.float)
y_tr = torch.tensor(y_train)
train = TensorDataset(X_tr, y_tr)
# Reshuffle the training samples every epoch so the mini-batches differ
# between epochs. A dedicated seeded generator keeps the shuffling
# reproducible without touching the global RNG state.
trainloader = DataLoader(
    train,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

X_te = torch.tensor(X_test, dtype=torch.float)
y_te = torch.tensor(y_test)
test = TensorDataset(X_te, y_te)
# No batch_size is given, so DataLoader's default of one sample per batch
# applies; the evaluation order is not shuffled.
testloader = DataLoader(test)

In [None]:
class Net(nn.Module):
    """Feed-forward classifier over fixed-size input vectors.

    Three ``Linear -> ReLU -> BatchNorm1d -> Dropout`` stages are followed by
    a linear output layer with log-softmax, so ``forward`` returns
    log-probabilities.

    Parameters
    ----------
    input_dim : int, default 300
        Number of features of each input row.
    hidden_dim : int, default 500
        Width of the three hidden layers.
    num_classes : int, default 20
        Number of output units.
    dropout : float, default 0.25
        Dropout probability applied after every hidden stage.

    The defaults reproduce the previously hard-coded architecture, so
    ``Net()`` builds the same model as before.
    """

    def __init__(self, input_dim=300, hidden_dim=500, num_classes=20, dropout=0.25):
        super().__init__()
        # Attribute names and registration order are kept unchanged so that
        # existing state dicts remain loadable.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.hidden1 = nn.Linear(hidden_dim, hidden_dim)
        self.hidden2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

        self.batchnorm1 = nn.BatchNorm1d(hidden_dim)
        self.batchnorm2 = nn.BatchNorm1d(hidden_dim)
        self.batchnorm3 = nn.BatchNorm1d(hidden_dim)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, num_classes)`` for a
        ``(batch, input_dim)`` input."""
        x = self.dropout(self.batchnorm1(F.relu(self.fc1(x))))
        x = self.dropout(self.batchnorm2(F.relu(self.hidden1(x))))
        x = self.dropout(self.batchnorm3(F.relu(self.hidden2(x))))
        # dim=1 normalises across the class dimension of each row.
        return F.log_softmax(self.fc2(x), dim=1)


net = Net()