### Imports, preparation

(This notebook uses the trick described in https://github.com/googlecolab/colabtools/issues/253#issuecomment-648634717 to obtain more RAM in Google Colab.)

In [1]:
!pip install -q torchtext==0.6.0
!pip install wordsegment



In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project files are reachable from the notebook.
drive.mount('/content/drive')

# Base directory of the project on the mounted Drive; the paths used below are built from it.
# NOTE: this path contains a space ("My Drive"), so it must be quoted/escaped whenever it is
# handed to a shell command.
ROOT_PATH = '/content/drive/My Drive/cil'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import sys
import os

import pandas as pd
import numpy as np

import pickle
import datetime
# from tqdm import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()

In [9]:
from torchtext.vocab import GloVe
# Load the pretrained "twitter.27B" GloVe vectors (dimension 200), cached under ROOT_PATH/CIL-aux-data.
# `glove` is used as a module-level global by the segmentation / embedding functions below.
# NOTE(review): `dim` is passed as the string "200" rather than an int — confirm this is intended.
glove = GloVe(name="twitter.27B", dim="200", cache=os.path.join(ROOT_PATH, "CIL-aux-data"))

### Do the preprocessing

In [4]:
# Directory holding the raw tweet text files (input of the preprocessing step).
RAW_TWITTER_DATASETS_DIR = os.path.join(ROOT_PATH, "twitter-datasets")
# Directory receiving the preprocessed text files and the CSV files derived from them.
PREPROCESSED_TWITTER_DATASETS_DIR = os.path.join(ROOT_PATH, "stanford_glove_preprocessed")
# create output dir if it doesn't exist yet
os.makedirs(PREPROCESSED_TWITTER_DATASETS_DIR, exist_ok=True)

In [5]:
## convert raw to preprocessed
# alternatively use `run_script.sh` to run locally.
# --> requires Ruby support. (doesn't work on Colab, for example.)

import shutil
import subprocess

if shutil.which("ruby") is None: # no `ruby` executable on the PATH
    print("no ruby support.")
else:
    ruby_script_path = os.path.join(ROOT_PATH, "preprocess-twitter.rb")
    assert os.path.isfile(ruby_script_path), f"ruby script not found at {ruby_script_path}"
    for filename in ["test_data", "train_pos", "train_neg", "train_pos_full", "train_neg_full"]:
        input_file = os.path.join(RAW_TWITTER_DATASETS_DIR, f"{filename}.txt")
        output_file = os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, f"{filename}.txt")
        # The paths contain a space ("My Drive"), so they must not be interpolated unquoted
        # into a shell command line. Passing an argument list and redirecting through file
        # handles avoids the shell entirely.
        with open(input_file, "rb") as raw_in, open(output_file, "wb") as preprocessed_out:
            # check=True: fail loudly if the ruby script exits with an error instead of
            # silently leaving a truncated output file behind.
            subprocess.run(["ruby", "-n", ruby_script_path],
                           stdin=raw_in, stdout=preprocessed_out, check=True)

no ruby support.


In [6]:
## convert preprocessed txt files to readily-usable csv files

import re


def _read_tweet_lines(directory, filename):
    """Return the lines of `directory/filename`, each with trailing whitespace stripped."""
    with open(os.path.join(directory, filename)) as f:
        return [tweet.rstrip() for tweet in f]


def _pair_raw_and_preprocessed(basename, label=None, strip_index=False):
    """Build a DataFrame pairing each raw tweet with its preprocessed version.

    Parameters
    ----------
    basename : str
        File name without extension; `<basename>.txt` is read from both the raw and the
        preprocessed dataset directories.
    label : int or None
        If given, a constant `label` column with this value is appended.
    strip_index : bool
        If True, remove the leading `<digits>,` index from every raw line
        (needed for the test set only).

    Raises
    ------
    ValueError
        If the raw and preprocessed files do not have the same number of lines; pairing
        them line by line would otherwise silently truncate / misalign the data.
    """
    raw_tweets = _read_tweet_lines(RAW_TWITTER_DATASETS_DIR, f"{basename}.txt")
    preprocessed_tweets = _read_tweet_lines(PREPROCESSED_TWITTER_DATASETS_DIR, f"{basename}.txt")
    if strip_index:
        raw_tweets = [re.sub(r"^\d+,", "", tweet) for tweet in raw_tweets]
    if len(raw_tweets) != len(preprocessed_tweets):
        raise ValueError(
            f"{basename}.txt: {len(raw_tweets)} raw lines but "
            f"{len(preprocessed_tweets)} preprocessed lines"
        )
    frame = pd.DataFrame({'raw_tweet': raw_tweets, 'preprocessed_tweet': preprocessed_tweets})
    if label is not None:
        frame['label'] = label
    return frame


def _write_train_csv(pos_basename, neg_basename, csv_name):
    """Concatenate the positive (label 1) and negative (label 0) tweets and save them as CSV.

    The rows are written in file order (all positives, then all negatives), not shuffled.
    Returns the concatenated DataFrame.
    """
    tweets_pos = _pair_raw_and_preprocessed(pos_basename, label=1)
    tweets_neg = _pair_raw_and_preprocessed(neg_basename, label=0)
    dataset = pd.concat((tweets_pos, tweets_neg))
    # dataset = dataset.sample(frac=1).reset_index(drop=True) # TODO: add this to shuffle the file lines
    dataset.to_csv(os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, csv_name), index=False)
    return dataset


## test set
tweets_test = _pair_raw_and_preprocessed("test_data", strip_index=True) # (for the test set only,) must remove index at beginning of line
tweets_test.to_csv(os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, "test_stanfordglove.csv"), index=False)

## limited train dataset
X_train_txt = _write_train_csv("train_pos", "train_neg", "dataset_stanfordglove_limited.csv")

## full train dataset
X_train_txt = _write_train_csv("train_pos_full", "train_neg_full", "dataset_stanfordglove_full.csv")

In [7]:
## before feeding to the glove word-embedder, it might be useful to segment the words, as there are many nonsegmented hashtags.
import wordsegment
# Initialise the library once up front; `wordsegment.segment` is called by `segment_if_needed` below.
wordsegment.load()

In [10]:
def segment_if_needed(tweet, verbose=False):
    """Split the out-of-vocabulary (OOV) words of `tweet` into sub-words.

    The tweet is lower-cased and split on whitespace. Words known to the global `glove`
    vectors are kept as they are; every other word is passed through `wordsegment.segment`.

    Parameters
    ----------
    tweet : str
        The (preprocessed) tweet text.
    verbose : bool
        If True, print the words that are still OOV after segmentation.

    Returns
    -------
    str
        The resulting words joined by single spaces. Never empty: an empty input, or a
        tweet from which segmentation removed every token, yields "empty tweet".
    """
    words = tweet.lower().split()
    if len(words) == 0: # avoid program crashing...
        words = ["empty", "tweet"]
    # only segment words that are unrecognized by `glove`, otherwise e.g "hashtag" is wrongly segmented into ["hash", "tag"]
    word_emb = glove.get_vecs_by_tokens(words, lower_case_backup=True)
    # A word is OOV when its vector is entirely zero. (Testing "not all components are
    # non-zero" would also flag known words that merely contain one zero component.)
    oov_word_mask = ~word_emb.bool().any(dim=1)
    segmented_words = []
    for word, is_oov in zip(words, oov_word_mask.tolist()):
        if is_oov:
            segmented_words += wordsegment.segment(word)
        else:
            segmented_words.append(word)
    if len(segmented_words) == 0:
        # Segmentation can return no tokens at all; an empty string would be read back as
        # NaN after a CSV round trip, so use the same placeholder as for empty input.
        segmented_words = ["empty", "tweet"]
    if verbose:
        segmented_word_emb = glove.get_vecs_by_tokens(segmented_words, lower_case_backup=True)
        segmented_oov_word_mask = ~segmented_word_emb.bool().any(dim=1)
        for word, is_oov in zip(segmented_words, segmented_oov_word_mask.tolist()): # print OOV words
            if is_oov:
                print(word)
    return ' '.join(segmented_words)

# quick sanity check on a tweet containing an unsegmented compound word
segment_if_needed("dummy tweet with easytosegment hashtag")

'dummy tweet with easy to segment hashtag'

In [None]:
## segment the preprocessed tweets of every dataset and save the result as a new csv file

def _segment_dataset(input_csv, output_csv):
    """Segment the `preprocessed_tweet` column of one CSV file and save the result.

    Reads `input_csv` from PREPROCESSED_TWITTER_DATASETS_DIR, adds a
    `preprocessed_segmented_tweet` column computed with `segment_if_needed`, drops the
    `preprocessed_tweet` column and writes the frame to `output_csv` in the same directory.
    Returns the resulting DataFrame.
    """
    # keep_default_na=False: tweets must stay strings. With the default NA parsing, an empty
    # tweet (or one reading e.g. "nan" / "null") becomes a float NaN and `.lower()` fails.
    dataset = pd.read_csv(os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, input_csv),
                          keep_default_na=False)
    # apply to each tweet (column-wise apply avoids building a Series object per row)
    dataset['preprocessed_segmented_tweet'] = dataset['preprocessed_tweet'].progress_apply(segment_if_needed)
    dataset = dataset.drop(columns='preprocessed_tweet') # drop column
    dataset.to_csv(os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, output_csv), index=False)
    return dataset


## test set
X_test_txt = _segment_dataset("test_stanfordglove.csv", "test_stanfordglove_segmented.csv")

## limited train dataset
X_train_txt = _segment_dataset("dataset_stanfordglove_limited.csv", "dataset_stanfordglove_segmented_limited.csv")

## full train dataset (takes about 15 min)
X_train_txt = _segment_dataset("dataset_stanfordglove_full.csv", "dataset_stanfordglove_segmented_full.csv")

### Sample usage for training and evaluation

In [12]:
# CSV files produced by the segmentation step above (columns used below:
# 'preprocessed_segmented_tweet', plus 'label' for the training file).
TWEETS_TRAIN_FILENAME = os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, "dataset_stanfordglove_segmented_full.csv")
TWEETS_TEST_FILENAME = os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, "test_stanfordglove_segmented.csv")

In [13]:
## if we only use the preprocessed+segmented part, and not the raw part

# train data
# keep_default_na=False keeps every tweet a string: with the default NA parsing an empty
# tweet (or one reading e.g. "nan" / "null") is loaded as a float NaN, which the embedding
# functions cannot handle.
X_train_txt = pd.read_csv(TWEETS_TRAIN_FILENAME, keep_default_na=False)
# X_train_txt = pd.read_csv(TWEETS_TRAIN_FILENAME, nrows=100000, keep_default_na=False) # for dev: only keep nrows first samples
n_samples = X_train_txt.shape[0]
# np.int_ is the concrete dtype that the abstract `np.integer` used to resolve to; passing the
# abstract type as a dtype is deprecated in NumPy.
y_train = X_train_txt['label'].to_numpy().astype(np.int_, copy=False)
assert y_train.shape == (n_samples,)
X_train_txt = X_train_txt['preprocessed_segmented_tweet'].to_numpy()
assert X_train_txt.shape == (n_samples,)
# display the first sample and its label as a sanity check
X_train_txt[0], y_train[0]

('<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me <hashtag> believe <number>',
 1)

In [14]:
# test data
# keep_default_na=False keeps every tweet a string: with the default NA parsing an empty
# tweet (or one reading e.g. "nan" / "null") is loaded as a float NaN, which the embedding
# functions cannot handle.
X_test_txt = pd.read_csv(TWEETS_TEST_FILENAME, keep_default_na=False)
X_test_txt = X_test_txt['preprocessed_segmented_tweet'].to_numpy()
assert X_test_txt.shape == (10000,)
# display the first sample as a sanity check
X_test_txt[0]

'sea doo pro sea scooter ( sports with the portable seadoo sea scooter save air , stay longer in the water and . <repeat> <url>'

In [15]:
def mean_embed_tweet(tweet, verbose=False):
    """Embed a tweet as the mean of the GloVe vectors of its words.

    The tweet is lower-cased and split on whitespace; each word is looked up in the global
    `glove` vectors and the word vectors are averaged.

    Parameters
    ----------
    tweet : str
        The tweet text. An empty (or whitespace-only) tweet is embedded as the two
        placeholder words "empty tweet".
    verbose : bool
        If True, print the out-of-vocabulary (OOV) words encountered.

    Returns
    -------
    A 1D tensor: the element-wise mean of the word vectors.
    """
    words = tweet.lower().split()
    if len(words) == 0: # avoid program crashing...
        words = ["empty", "tweet"]
    word_emb = glove.get_vecs_by_tokens(words, lower_case_backup=True)
    if verbose:
        # A word is OOV when its vector is entirely zero. (Testing "not all components are
        # non-zero" would also report known words that merely contain one zero component.)
        oov_word_mask = ~word_emb.bool().any(dim=1)
        for word, is_oov in zip(words, oov_word_mask.tolist()): # print OOV words
            if is_oov:
                print(word)
    return word_emb.mean(dim=0) # or take (padded) concatenation, or weighted mean...

## choose the tweet embedding method.
# `my_embed_tweet` is the hook called by the embedding loops below; point it at another
# function to switch embedding methods.
my_embed_tweet = mean_embed_tweet

# Infer the embedding dimensionality by embedding a throwaway tweet.
tweet_emb_dim = my_embed_tweet("this is a dummy tweet").shape[0]
tweet_emb_dim

200

In [17]:
# Pre-allocate the feature matrix (one row per training tweet) and fill it row by row.
# NOTE(review): np.empty defaults to float64; with millions of rows this matrix takes several
# GB of RAM — consider dtype=np.float32 if downstream code allows it.
X_train = np.empty((y_train.shape[0], tweet_emb_dim))
for i, tweet in enumerate(tqdm(X_train_txt)):
    # X_train[i] = my_embed_tweet(tweet, verbose=True) # to output OOV words encountered
    X_train[i] = my_embed_tweet(tweet)

# can even do better:
# X_train = pd.DataFrame(X_train_txt).progress_apply(( lambda x: mean_embed_tweet(x[0]) ), axis=1).to_numpy()

HBox(children=(FloatProgress(value=0.0, max=2500000.0), HTML(value='')))




In [18]:
# always use *the same preprocessing* for the test data!
X_test = np.empty((X_test_txt.shape[0], tweet_emb_dim))
# Wrap the array itself (not the `enumerate` generator) in tqdm: a generator has no length,
# so tqdm could not know the total and the progress bar would carry no information.
for i, tweet in enumerate(tqdm(X_test_txt)):
    # X_test[i] = my_embed_tweet(tweet, verbose=True) # to output OOV words encountered
    X_test[i] = my_embed_tweet(tweet)

# can even do better:
# X_test = pd.DataFrame(X_test_txt).progress_apply(( lambda x: mean_embed_tweet(x[0]) ), axis=1).to_numpy()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




### A little experiment...

What proportion of the words are still out-of-vocabulary (OOV) for GloVe after preprocessing and segmentation?

In [19]:
# (already loaded)
# The commented-out lines below reload the training data; uncomment them to run this
# experiment without executing the "Sample usage" section first.

# TWEETS_TRAIN_FILENAME = os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, "dataset_stanfordglove_segmented_full.csv")
# # train data
# X_train_txt = pd.read_csv(TWEETS_TRAIN_FILENAME)
# # X_train_txt = pd.read_csv(TWEETS_TRAIN_FILENAME, nrows=100000) # for dev: only keep nrows first samples
# n_samples = X_train_txt.shape[0]
# y_train = X_train_txt['label'].to_numpy().astype(np.integer, copy=False)
# assert y_train.shape == (n_samples,)
# X_train_txt = X_train_txt['preprocessed_segmented_tweet'].to_numpy()
# assert X_train_txt.shape == (n_samples,)
X_train_txt[0], y_train[0]

('<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me <hashtag> believe <number>',
 1)

In [20]:
# (already loaded)
# The commented-out lines below reload the test data; uncomment them to run this
# experiment without executing the "Sample usage" section first.

# TWEETS_TEST_FILENAME = os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, "test_stanfordglove_segmented.csv")# test data
# X_test_txt = pd.read_csv(TWEETS_TEST_FILENAME)
# X_test_txt = X_test_txt['preprocessed_segmented_tweet'].to_numpy()
# assert X_test_txt.shape == (10000,)
X_test_txt[0]

'sea doo pro sea scooter ( sports with the portable seadoo sea scooter save air , stay longer in the water and . <repeat> <url>'

In [21]:
def count_oov_frequency(X_txt):
    """Count how many words of a collection of tweets are out-of-vocabulary (OOV) for `glove`.

    Parameters
    ----------
    X_txt : a 1D numpy array (or any iterable) of strings
        The tweets; each one is lower-cased and split on whitespace.

    Returns
    -------
    (oov_words, tot_words) : tuple of int
        Number of OOV words and total number of words over all tweets.
    """
    tot_words = 0
    oov_words = 0
    for tweet in tqdm(X_txt):
        words = tweet.lower().split()
        if len(words) == 0:
            # nothing to count; also avoids asking `glove` to embed an empty token list
            continue
        tot_words += len(words)
        word_emb = glove.get_vecs_by_tokens(words, lower_case_backup=True)
        # A word is OOV when its vector is entirely zero. (Testing "not all components are
        # non-zero" would also count known words that merely contain one zero component.)
        oov_word_mask = ~word_emb.bool().any(dim=1)
        oov_words += int(oov_word_mask.sum())
    return oov_words, tot_words

# OOV statistics for the test set
oov_words_test, tot_words_test = count_oov_frequency(X_test_txt)
print(f"Proportion of OOV words in test set: {oov_words_test / tot_words_test} ({oov_words_test} / {tot_words_test})")

# OOV statistics for the training set
oov_words_train, tot_words_train = count_oov_frequency(X_train_txt)
print(f"Proportion of OOV words in training set: {oov_words_train / tot_words_train} ({oov_words_train} / {tot_words_train})")

# combined statistics over both sets
oov_words_tot = oov_words_train + oov_words_test
tot_words_tot = tot_words_train + tot_words_test
print(f"Proportion of OOV words in training+test sets: {oov_words_tot / tot_words_tot} ({oov_words_tot} / {tot_words_tot})")

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


Proportion of OOV words in test set: 0.0020046474108142602 (333 / 166114)


HBox(children=(FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


Proportion of OOV words in training set: 0.0020304250864463286 (84423 / 41578978)
Proportion of OOV words in training+test sets: 0.002030322510727728 (84756 / 41745092)
