In [1]:
import spacy
# Load the small English pipeline by its package name: the bare 'en' shortcut
# link only exists in spaCy 2.x and is rejected by spaCy 3.
spacy_en = spacy.load('en_core_web_sm')
import torchtext
from torchtext import data
import pandas as pd
import csv
import transformers
import torch

### Pre-processing

In [22]:
# Relative directory that holds the raw tweet files and the generated CSV.
PATH = "../twitter-datasets/"
# File name (inside PATH) of the combined, shuffled dataset that is written
# out and then read back by torchtext.
FILE = "dataset.csv"

In [23]:
# Load the raw tweet files: one tweet per line, trailing newline kept.
# Context managers guarantee each handle is closed even if reading fails, and
# the explicit encoding keeps decoding independent of the platform default
# (assumes the files are UTF-8 encoded).
with open(PATH + "train_neg.txt", encoding="utf-8") as f:
    tweets_neg = f.readlines()

with open(PATH + "train_pos.txt", encoding="utf-8") as f:
    tweets_pos = f.readlines()

In [39]:
# Wrap each list of raw lines in a single-column frame ('tweets') and tag every
# row with its sentiment class in a 'label' column.
tweets_pos = pd.DataFrame(tweets_pos, columns=['tweets']).assign(label='pos')
tweets_neg = pd.DataFrame(tweets_neg, columns=['tweets']).assign(label='neg')

In [40]:
# Preview the positive tweets; the rendering elides the middle rows of the frame.
tweets_pos

Unnamed: 0,tweets,label
0,<user> i dunno justin read my mention or not ....,pos
1,"because your logic is so dumb , i won't even c...",pos
2,""" <user> just put casper in a box ! "" looved t...",pos
3,<user> <user> thanks sir > > don't trip lil ma...,pos
4,visiting my brother tmr is the bestest birthda...,pos
...,...,...
99995,<user> hey gina what's up ?\n,pos
99996,"<user> sas 9.1 . 3 and 9.2 , east 5 , s-plus 8...",pos
99997,<user> <user> um gord ... i just read your pro...,pos
99998,<user> i'm so excited for tomorrow ! look out ...,pos


In [41]:
# Preview the negative tweets; the rendering elides the middle rows of the frame.
tweets_neg

Unnamed: 0,tweets,label
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,neg
1,glad i dot have taks tomorrow ! ! #thankful #s...,neg
2,1-3 vs celtics in the regular season = were fu...,neg
3,<user> i could actually kill that girl i'm so ...,neg
4,<user> <user> <user> i find that very hard to ...,neg
...,...,...
99995,can't wait to fake tan tonight ! hate being pa...,neg
99996,<user> darling i lost my internet connection ....,neg
99997,kanguru defender basic 4 gb usb 2.0 flash driv...,neg
99998,rizan is sad now\n,neg


In [42]:
# Stack the positive and negative frames into one dataset. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# As with append, the row labels of both inputs are kept unchanged.
tweet_dataset = pd.concat([tweets_pos, tweets_neg])

In [43]:
# Shuffle all rows (frac=1 samples the whole frame) and renumber them from 0.
# A fixed random_state makes the shuffle reproducible, so re-running the
# notebook yields the same row order every time.
tweet_dataset = tweet_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [44]:
# Write the combined frame to PATH+FILE; index=False leaves the row index out
# so the file holds only the frame's own columns.
tweet_dataset.to_csv(PATH+FILE, index=False)

In [45]:
# Text column: tokenized with spaCy.
TEXT = data.Field(tokenize='spacy')
# Label column: stored with a float dtype.
LABEL = data.LabelField(dtype=torch.float)

# (name, field) pairs describing the columns of the CSV file.
fields = [
    ('tweet', TEXT),
    ('label', LABEL),
]

# Parse the exported CSV into a torchtext dataset, skipping its header row.
tweets_train = data.TabularDataset(
    path=PATH + FILE,
    format="CSV",
    fields=fields,
    skip_header=True,
)

### Build simple vocabulary

In [49]:
# Build the vocabulary, only keeping the most common max_size tokens.
# The reported TEXT vocabulary size ends up at max_size + 2 because the
# '<unk>' and '<pad>' entries are added on top of the counted tokens.

MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(tweets_train, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(tweets_train)

In [50]:
# Report how many entries each field's vocabulary holds.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [51]:
# Show the 20 most frequent tokens together with their raw counts.
TEXT.vocab.freqs.most_common(20)

[('>', 175202),
 ('<', 173908),
 ('user', 128022),
 ('i', 106018),
 ('!', 83074),
 ('the', 60534),
 ('.', 59814),
 (',', 59692),
 ('to', 56051),
 ('you', 52254),
 ('(', 47125),
 ('url', 43642),
 ('a', 42759),
 ('...', 40969),
 ('and', 36160),
 ('my', 31871),
 ('#', 29332),
 ('-', 29030),
 ('it', 28098),
 ('me', 27570)]

In [52]:
# Show the first 10 entries of the index-to-token list; the special
# '<unk>' and '<pad>' tokens come first.
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '>', '<', 'user', 'i', '!', 'the', '.', ',']


In [54]:
# Show the label-to-index mapping of the LABEL vocabulary.
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


### Build upgraded vocabulary

In [61]:
def generate_bigrams(x):
    """Append every distinct bigram of a token list to that list.

    Each pair of adjacent tokens is joined with a single space and appended
    once, after the original tokens, in order of first occurrence. The order
    is therefore deterministic, unlike iteration over a set of string tuples,
    which varies between interpreter runs because of hash randomization.

    Args:
        x: List of string tokens. It is extended in place.

    Returns:
        The same list object ``x``, holding the original tokens followed by
        its unique bigrams. Lists with fewer than two tokens are returned
        unchanged.
    """
    # dict.fromkeys de-duplicates like a set but preserves insertion order.
    # The pairs are fully materialised here, before x is mutated below.
    unique_pairs = list(dict.fromkeys(zip(x, x[1:])))
    x.extend(' '.join(pair) for pair in unique_pairs)
    return x

In [None]:
# Re-create the text field so that bigrams are appended to each token list.
TEXT = data.Field(tokenize='spacy', preprocessing = generate_bigrams)

# Re-parse the CSV with the new field. build_vocab only counts the dataset
# columns bound to the very field object it is called on, so reusing the
# dataset that was built with the previous TEXT would produce a vocabulary
# without any real tokens (and without any bigrams).
fields = [('tweet', TEXT), ('label', LABEL)]
tweets_train = data.TabularDataset(path=PATH+FILE, format="CSV", fields=fields, skip_header=True)

MAX_VOCAB_SIZE = 25_000

# Keep the MAX_VOCAB_SIZE most frequent tokens and attach pre-trained GloVe
# vectors; tokens that have no GloVe entry are initialised from a Gaussian
# distribution through unk_init.
TEXT.build_vocab(tweets_train,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(tweets_train)