In [None]:
import pandas as pd
import torchtext as tt
import torch
import numpy as np
from sklearn import feature_extraction, linear_model
from sklearn.model_selection import KFold, train_test_split

np.set_printoptions(threshold=np.inf)

# Data read

In [None]:
trainDf = pd.read_csv("./train.csv")
submission = pd.read_csv("./test.csv")

print(len(trainDf), len(submission))
print(trainDf.head(5))
print(submission.head(5))

# Data exploration

In [None]:
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(trainDf["keyword"].value_counts())

In [None]:
trainDf[trainDf["target"] == 0]["text"].values[0:3]

# Tweets Vectorization, data preparation

In [None]:
trainDf = trainDf[["text", "target"]]
trainDf.head(5)

In [None]:
vectorizer = feature_extraction.text.CountVectorizer()
vectorized_example = vectorizer.fit_transform(trainDf["text"][0:5])
print(vectorized_example.todense())

In [None]:
train_vectorizer = feature_extraction.text.CountVectorizer()
train_part, test_part = train_test_split(trainDf, test_size=0.2)

print(train_part.shape, test_part.shape)

vectorized_train = train_vectorizer.fit_transform(train_part["text"])
vectorized_test = train_vectorizer.transform(test_part["text"])

In [None]:
print(vectorized_train.shape)
print(vectorized_test.shape)

In [None]:
vectorized_train_labels = []
for i in train_part["target"]:
    lbl = [0,0]
    lbl[i] = 1
    vectorized_train_labels.append(lbl)
vectorized_test_labels = []
for i in test_part["target"]:
    lbl = [0,0]
    lbl[i] = 1
    vectorized_test_labels.append(lbl)

vectorized_train_labels

In [None]:
len(vectorized_train_labels)

In [None]:
class tweetDataset(torch.utils.data.Dataset):

    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y
        

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    
    def __len__(self):
        return len(self.x)

# LSTM model

In [None]:
class MyLSTM(torch.nn.Module):

    def __init__(self, embedding_dim, hidden_size, vocab_size):
        super().__init__()
        self.embedding = torch.nn.Linear(vocab_size, embedding_dim)
        self.encoder = torch.nn.LSTM(input_size=1, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.predictor = torch.nn.Linear(hidden_size, 2)
    
    def forward(self, index_sequence):
        emb = self.embedding(index_sequence)
        #print(f"Emb output shape 1: {emb.shape}")
        emb = torch.unsqueeze(emb, 2)
        #print(f"Emb output shape 2: {emb.shape}")
        output, (hidden_state, cell_state) = self.encoder(emb)
        #print(f"lstm output shape 1: {output.shape}")
        output = torch.transpose(output, 0,1)
        #print(f"lstm output shape 2: {output.shape}")
        final = self.predictor(output[-1])
        #print(f"Final output shape {final.shape}")
        return final 

device = "cuda"

# Cross validation train

In [None]:
def train_model(epochs, model, optimizer, lossFn, trainLoader, validationLoader):
    global batch_size
    for e in range(epochs):
        train_loss = 0
        validation_loss = 0
        model.train()
        for batch in trainLoader:
            tweets, labels = batch
            labels = torch.tensor(labels).to(torch.float32).to(device)
            tweets = torch.tensor(tweets).to(torch.float32).to(device)
            optimizer.zero_grad()
            predict = model(tweets)
            # print(tweets)
            # print(labels)
            #print(predict[0], predict[0])
            loss = lossFn(predict, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.data.item()
        train_loss /= len(trainLoader)

        model.eval()
        for batch in validationLoader:
            tweets, labels = batch
            labels = torch.tensor(labels).to(torch.float32).to(device)
            tweets = tweets.to(device)
            prediction = model(tweets)
            loss = lossFn(prediction, labels)
            validation_loss += loss.data.item()
        validation_loss /= len(validationLoader)
        print(f"Epoch: {e}, Train Loss: {round(train_loss,2)}, Validation Loss: {round(validation_loss,2)}")
    
    return validation_loss

In [None]:
def collate_convert(batch):
    tweets, labels = zip(*batch)
    return (torch.Tensor(tweets), torch.Tensor(labels))

In [None]:
batch_size = 64

testDataset = tweetDataset(vectorized_test, vectorized_test_labels)
testLoader =  torch.utils.data.DataLoader(testDataset, collate_fn=collate_convert, batch_size=batch_size)

In [119]:
#Don't know why, but it does not change train indexes, so I will take them from validation indexes

#model = MyLSTM(1000, 200, vectorized_train.shape[1])
model = torch.load("twits_classify_LSTM_model.pth")
model.to(device)

n_splits=10
crossval_selection = KFold(n_splits=n_splits)
generator = crossval_selection.split(vectorized_train)
for _ in range(2):
    train, validation = next(generator)
    print(validation[0], validation[-1])
    vtarr = vectorized_train.toarray()
    trainDataset = tweetDataset(np.concatenate((vtarr[0:validation[0]+1], vtarr[validation[-1]:])), np.concatenate((vectorized_train_labels[0:validation[0]+1], vectorized_train_labels[validation[-1]:])))
    trainLoader = torch.utils.data.DataLoader(trainDataset)

    validationDataset = tweetDataset(vtarr[validation[0]:validation[-1]], vectorized_train_labels[validation[0]:validation[-1]])
    validationLoader = torch.utils.data.DataLoader(validationDataset, collate_fn=collate_convert, batch_size=batch_size)

    
    epochs = 1
    optimizer = torch.optim.ASGD(model.parameters(), lr=0.1)
    lossFn = torch.nn.CrossEntropyLoss()
    valLoss = train_model(epochs, model, optimizer, lossFn, trainLoader, validationLoader)
    print("saved")
    torch.save(model, "twits_classify_LSTM_model.pth")
        


0 608


  labels = torch.tensor(labels).to(torch.float32).to(device)
  tweets = torch.tensor(tweets).to(torch.float32).to(device)
  labels = torch.tensor(labels).to(torch.float32).to(device)


Epoch: 0, Train Loss: 0.09, Validation Loss: 0.23
saved
609 1217
Epoch: 0, Train Loss: 0.09, Validation Loss: 0.24
saved


In [115]:
mapping = {1:"disaster", 0:"false alarm"}

result = pd.DataFrame(submission["id"])
target = pd.Series()

model.eval()
for i in range(0,len(submission),100):
    output = model(torch.tensor(train_vectorizer.transform(submission["text"][i:i+100]).toarray()).to(torch.float32).to(device))
    indexes = torch.argmax(output, dim=1).to("cpu")
    target = pd.concat([target, pd.Series(indexes)])

result["target"] = target.reset_index(drop=True)


In [117]:
result.to_csv("submission.csv", index=None)