In [None]:
import sys
import os
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import pandas as pd
import numpy as np

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can
# be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# FUNCTIONS

# Function that removes punctuation and lowercases the text
def clean_text(text):
    """Return `text` lowercased with all ASCII punctuation removed.

    Every character found in ``string.punctuation`` is dropped; each of the
    remaining characters (letters, digits, whitespace, ...) is lowercased
    individually and the result is joined back into a single string.
    """
    kept = []
    for char in text:
        if char in string.punctuation:
            continue
        kept.append(char.lower())
    return "".join(kept)

import re
def tokenization(text):
    """Split `text` into word tokens.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore).

    Args:
        text: the string to tokenize.

    Returns:
        A list of non-empty token strings, in order of appearance.
    """
    # The pattern must be r'\W+' (non-word characters). The former 'W+' had
    # lost its backslash and therefore split on literal capital "W" letters.
    # re.split yields '' when the text starts or ends with a separator; those
    # empty strings are not tokens, so they are filtered out.
    return [token for token in re.split(r'\W+', text) if token]

# Trains a Word2Vec model and returns its word vectors (the model's `wv` attribute).
# `sentences` should be a list of tokenized sentences, i.e. a list of lists of tokens.
def load_word_vectors(sentences, vector_size=100, window=5, min_count=1, workers=4):
    """Train a Word2Vec model on `sentences` and return its word vectors.

    Args:
        sentences: tokenized sentences, i.e. a list of lists of tokens.
        vector_size: dimensionality of the learned embeddings.
        window: forwarded unchanged to ``Word2Vec``.
        min_count: forwarded unchanged to ``Word2Vec``.
        workers: forwarded unchanged to ``Word2Vec``.

    Returns:
        The ``wv`` attribute of the trained model; indexing it with a word
        yields that word's vector.
    """
    import inspect

    # gensim 4 renamed the dimensionality keyword from `size` to `vector_size`.
    # Pick whichever name the installed Word2Vec accepts so the notebook runs
    # on both old and new gensim releases.
    accepted = inspect.signature(Word2Vec.__init__).parameters
    size_keyword = "vector_size" if "vector_size" in accepted else "size"

    model = Word2Vec(
        sentences,
        window=window,
        min_count=min_count,
        workers=workers,
        **{size_keyword: vector_size},
    )
    return model.wv

def build_vocab(corpus):
    """Count how often each token occurs in a tokenized corpus.

    Args:
        corpus: iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A plain dict mapping token -> number of occurrences, with keys in
        order of first appearance.
    """
    frequencies = Counter(token for sentence in corpus for token in sentence)
    return dict(frequencies)

def word2index(vocab):
    """Build the two lookup tables between words and integer indices.

    Words are numbered 0, 1, 2, ... in the order `vocab` yields them.

    Returns:
        A tuple ``(word_index, idx_word)``: word -> index and index -> word.
    """
    word_index = {}
    for position, word in enumerate(vocab):
        word_index[word] = position
    idx_word = dict(enumerate(vocab))
    return word_index, idx_word

In [None]:
class SentencePairs(Dataset):
    """Dataset of tokenized sentence pairs with a binary label.

    Reads a pickled DataFrame that provides the columns ``s1_tokens``,
    ``s2_tokens`` and ``bin_label``, builds a vocabulary over all tokens of
    both sentence columns, and serves every pair as two lists of vocabulary
    indices plus its label.
    """

    def __init__(self, file, transform=None):
        """Load the pickled frame and build the vocabulary lookups.

        Args:
            file: path of the pickled DataFrame.
            transform: optional callable applied to each sample dict before
                it is returned from ``__getitem__``.
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # pickle files that come from a trusted source.
        self.data = pd.read_pickle(file)

        s1_tokens = self.data['s1_tokens']
        s2_tokens = self.data['s2_tokens']
        # Adding the two Series combines them row by row; assuming every cell
        # holds a list of tokens, each corpus entry is the tokens of sentence 1
        # followed by those of sentence 2, so vocabulary indices follow row order.
        corpus = s1_tokens + s2_tokens

        self.transform = transform
        self.vocab = self.build_vocab(corpus)
        self.word2idx, self.idx2word = self.word2index(self.vocab)

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at position `idx`.

        Returns:
            A dict with keys ``sentence1`` and ``sentence2`` (lists of
            vocabulary indices) and ``label``; if a transform was supplied,
            whatever the transform returns for that dict.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # `idx` is a position in 0..len(self)-1, so look rows up with .iloc.
        # Plain `series[idx]` is label-based and fails (or returns the wrong
        # row) when rows were filtered out and the index has gaps.
        tokens1 = self.data['s1_tokens'].iloc[idx]
        tokens2 = self.data['s2_tokens'].iloc[idx]

        # convert words to vocabulary indices
        input1 = [self.word2idx[word] for word in tokens1]
        input2 = [self.word2idx[word] for word in tokens2]

        label = self.data['bin_label'].iloc[idx]

        sample = {"sentence1": input1, "sentence2": input2, "label": label}
        # Apply the user-supplied transform; it used to be stored but ignored.
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

    def build_vocab(self, corpus):
        """Return a dict mapping each token in `corpus` to its occurrence count.

        `corpus` is an iterable of token sequences; keys appear in order of
        first occurrence.
        """
        word_count = {}
        for sentence in corpus:
            for token in sentence:
                word_count[token] = word_count.get(token, 0) + 1
        return word_count

    def word2index(self, vocab):
        """Return ``(word -> index, index -> word)`` dicts for `vocab`.

        Words are numbered in the iteration order of `vocab`.
        """
        word_index = {w: i for i, w in enumerate(vocab)}
        idx_word = {i: w for i, w in enumerate(vocab)}
        return word_index, idx_word

In [None]:
# Build the dataset from the pickle that the preprocessing cell further down
# writes with `data.to_pickle('twitter_corpus_clean.pkl')`. On a fresh kernel
# that cell has to run first unless the file already exists on disk.
mydataset = SentencePairs(file = 'twitter_corpus_clean.pkl')
# NOTE: a bare expression that is not the last statement of a cell is
# evaluated but not displayed.
mydataset[0]
# Vocabulary indices of the first sentence of the first pair.
sample1 = mydataset[0]['sentence1']

{'label': 0,
 'sentence1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 13],
 'sentence2': [14, 15, 16, 17, 0, 2, 18, 19, 20, 21, 22, 23]}

In [None]:
# Location of the corpus on the mounted Google Drive (requires the
# drive.mount cell to have run).
corpus_dir = "/content/drive/MyDrive/Corpus"
twitter_train = os.path.join(corpus_dir, "Twitter_URL_Corpus_train.txt")

# Read the tab-separated training file. Column names are supplied explicitly,
# so the first line of the file is treated as data, not as a header.
data = pd.read_csv(twitter_train, sep = "\t", names = ['s1', 's2', 'label', 'url'])

In [None]:
# clean text: strip punctuation and lowercase both sentences of every pair
data['s1_clean'] = data['s1'].apply(clean_text)
data['s2_clean'] = data['s2'].apply(clean_text)

# tokenize on whitespace
data['s1_tokens'] = data['s1_clean'].apply(str.split)
data['s2_tokens'] = data['s2_clean'].apply(str.split)

# create binary labels, ignoring 3,6 since these are inconclusive
# reset_index(drop=True) returns a fresh frame, which (a) avoids the
# SettingWithCopyWarning raised when a column is assigned onto a filtered
# slice and (b) renumbers the rows 0..n-1, so integer positions and index
# labels agree again for anything that indexes the saved frame by number.
data = data[data.label != "(3,6)"].reset_index(drop=True)
data['bin_label'] = data['label'].apply(lambda x: 0 if x in ['(0,6)', '(1,6)', '(2,6)'] else 1)

# save as pickle (not CSV) so the token columns remain Python lists
data.to_pickle('twitter_corpus_clean.pkl')

# TODO: clean/save test data

In [None]:
# Load the preprocessed corpus. It is saved with `to_pickle` as
# 'twitter_corpus_clean.pkl'; no CSV of that name is written anywhere, and a
# CSV round-trip would turn the token lists into plain strings.
data = pd.read_pickle('twitter_corpus_clean.pkl')
# TODO: create vectorized data
# TODO: create data loader

# compile list of all tokenized sentences (sentence 1 column, then sentence 2)
s1_tokens = data['s1_tokens'].values.tolist()
s2_tokens = data['s2_tokens'].values.tolist()
sentence_lists = s1_tokens + s2_tokens

# create word vectors
# vectorized_word = word_vectors[word], can also use list of words
word_vectors = load_word_vectors(sentence_lists)



Approaches  
    - sentence embeddings via pretrained BERT model  
    - word embeddings -> sentence matrix as input