## Make vocabulary file

The vocabulary file that ships with ALBERT was trained on Wikipedia, so it will not contain words like MAGA or other tweet slang. Let's train our own Twitter vocabulary file here.

In [35]:
# Import Russian Troll Tweets dataset

import glob
import pandas as pd

folder = "data/russian-troll-tweets-master"

# glob returns matches in arbitrary order, so sort them: the row order (and the
# sample tweet printed below) is then the same on every run.
csv_paths = sorted(glob.glob(folder + '/*.csv'))
if not csv_paths:
    # Fail with a message that names the folder instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No CSV files found in " + folder)

# Every column is read as text; ignore_index avoids repeating each file's
# 0..n-1 row labels in the combined frame.
russian_tweets = pd.concat(
    [pd.read_csv(csv_path, dtype='str') for csv_path in csv_paths],
    ignore_index=True,
)

# Keep English tweets that actually have text, in a single pass, and hand back
# a fresh frame with a clean 0..n-1 index.
is_english = russian_tweets['language'] == 'English'
has_content = russian_tweets['content'].notnull()
russian_tweets = russian_tweets[is_english & has_content].reset_index(drop=True)

print(russian_tweets.shape)
print(russian_tweets[russian_tweets['account_category'] == 'RightTroll'].iloc[0]['content'])
russian_tweets.keys()

(2116866, 21)
Come support our peace officers. Our choice is law & order. #BlueLivesMatter  Rally link:  https://t.co/PuKr7nBBxP https://t.co/QpCg8jRFkN


Index(['external_author_id', 'author', 'content', 'region', 'language',
       'publish_date', 'harvested_date', 'following', 'followers', 'updates',
       'post_type', 'account_type', 'retweet', 'account_category',
       'new_june_2018', 'alt_external_id', 'tweet_id', 'article_url',
       'tco1_step1', 'tco2_step1', 'tco3_step1'],
      dtype='object')

In [33]:
# Import Twitter Stream dataset

import pandas as pd

# Load the sampled Twitter stream dump.
stream_tweets = pd.read_json("data/en_tweets.json", dtype='str')

print(stream_tweets.shape)
# Count English rows straight from the boolean mask. The previous
# `.count()[0]` looked up a column-labelled Series by position (deprecated in
# recent pandas) and really counted non-null values of the first column.
print((stream_tweets['lang'] == 'en').sum())
# Positional access to the first tweet, independent of the index labels.
print(stream_tweets['text'].iloc[0])
stream_tweets.keys()

(100000, 37)
100000
RT @beastieboys: “Wait, what!?! I just heard that Mike &amp; Adam made a movie about Beastie Boys with Spike Jonze. Is that for real?" - Larry…
Index(['created_at', 'id', 'id_str', 'text', 'source', 'truncated',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'retweeted_status', 'is_quote_status', 'quote_count',
       'reply_count', 'retweet_count', 'favorite_count', 'entities',
       'favorited', 'retweeted', 'filter_level', 'lang', 'timestamp_ms',
       'display_text_range', 'quoted_status_id', 'quoted_status_id_str',
       'quoted_status', 'quoted_status_permalink', 'possibly_sensitive',
       'extended_tweet', 'extended_entities', 'withheld_in_countries'],
      dtype='object')


In [43]:
# Clean tweets / normalize tweets
# Sentencepiece states "one-sentence-per-line raw corpus file. No need to run tokenizer, normalizer or preprocessor."

# The corpus file holds one tweet per line, so newlines inside a tweet are
# turned into spaces, and then the text is lowercased.
#
# The cleaned column is assigned back to the frame. Calling
# replace(..., inplace=True) on a column selection is a chained in-place
# update: under pandas Copy-on-Write it only changes a temporary object and
# leaves the DataFrame untouched.
russian_tweets['content'] = (
    russian_tweets['content'].replace('\n', ' ', regex=True).str.lower()
)
stream_tweets['text'] = (
    stream_tweets['text'].replace('\n', ' ', regex=True).str.lower()
)

In [44]:
# Create combined dataset

import pandas as pd

# Vocabulary training only looks at the tweet text, so take just the text
# column from each dataset and stack the two into a single array.
tweet_text_columns = [russian_tweets['content'], stream_tweets['text']]
combined_tweets = pd.concat(tweet_text_columns).values

print(len(combined_tweets))
combined_tweets

2216866


array(['police: airline pilot found passed out in cockpit was drunk https://t.co/gcqwioeswj https://t.co/2agimsvzmk',
       'dashcam video shows man launching himself onto police cruiser https://t.co/2zgpbau4ey https://t.co/b2fknfpy5v',
       'man arrested for setting fire to south sf medical clinic new year’s day https://t.co/maqc2gzvhv https://t.co/wgxyh0ifto',
       ..., 'rt @more_milf: the music, the crowd! she stole the show!',
       '@itaintwhiteboy @chadjohnwallis i used to like him in the past, i thought that he was a pretty normal bernie bros..… https://t.co/wg4fav3tlg',
       '@elise_flowers always my queen always ❤🙏🏾 you take care ❤'],
      dtype=object)

In [45]:
# Write to file in format SentencePiece expects.

# One tweet per line: each entry is written followed by a newline.
with open("data/vocab/sentences.txt", "w", encoding="utf-8") as sentences_file:
    sentences_file.writelines(tweet + "\n" for tweet in combined_tweets)

In [46]:
# Run Sentencepiece to make the vocab file for Albert

# !pip3 install sentencepiece

import sentencepiece as spm

# Settings grabbed from https://github.com/google-research/ALBERT#sentencepiece
#
# The flags are kept in a raw multi-line string for readability. They read the
# corpus from data/vocab/sentences.txt, request a 30000-piece unigram model and
# write the outputs under the data/vocab/twitter-30k prefix.
#
# NOTE(review): inside this raw string the sequence \” in --user_defined_symbols
# is a literal backslash followed by a right curly quote (U+201D). The ALBERT
# README this was copied from appears to use an escaped straight double quote
# there -- confirm which symbol is actually intended.
spm_settings = r'''
--input=data/vocab/sentences.txt --model_prefix=data/vocab/twitter-30k --vocab_size=30000
--pad_id=0 --unk_id=1 --eos_id=-1 --bos_id=-1 --control_symbols=[CLS],[SEP],[MASK]
--user_defined_symbols=(,),\”,-,.,–,£,€ --input_sentence_size=10000000
--character_coverage=0.99995 --model_type=unigram
'''
# Flatten the multi-line flag list into one space-separated argument string.
spm_settings = spm_settings.replace('\n',' ')

# Train the model; the call's return value is displayed as the cell output.
spm.SentencePieceTrainer.Train(spm_settings)

True

In [47]:
# Check vocab files

import random

def read_sentencepiece_vocab(filepath):
    """Return the token column of a tab-separated vocab file, minus its first line.

    Each line is cut at its first tab and only the text before it is kept.
    The entry from the very first line of the file is then discarded.

    Args:
        filepath: Path to the vocab file; it is opened as UTF-8 text.

    Returns:
        A list of token strings in file order, without the first line's token.
        An empty or single-line file yields an empty list.
    """
    with open(filepath, encoding='utf-8') as vocab_file:
        tokens = [row.partition("\t")[0] for row in vocab_file]
    # Drop the leading entry. The original comment calls it the <unk> token.
    # NOTE(review): the trainer flags in this notebook set --pad_id=0, so the
    # first line may actually be the padding token -- confirm.
    return tokens[1:]

In [51]:
# Check custom Twitter vocab

# Load the learned pieces, report how many there are, then print a random
# handful drawn with a fixed seed so the sample is repeatable.
twitter_vocab = read_sentencepiece_vocab("data/vocab/twitter-30k.vocab")
size_report = "Learned vocab size: {}".format(len(twitter_vocab))
print(size_report)

random.seed(124)
token_sample = random.sample(twitter_vocab, 10)
print("Sample tokens: {}".format(token_sample))

Learned vocab size: 29999
Sample tokens: ['pkk', '▁indonesian', 'my', 'span', 'kpxt', 'nka', 'jcz', '▁apologist', '▁wives', '▁arrest']


In [49]:
# Check vocab that came with Albert

# Same inspection as for the custom vocab: size first, then a random handful
# drawn with a fixed seed so the sample is repeatable.
albert_vocab = read_sentencepiece_vocab("model/assets/30k-clean.vocab")
size_report = "Learned vocab size: {}".format(len(albert_vocab))
print(size_report)

random.seed(14)
token_sample = random.sample(albert_vocab, 10)
print("Sample tokens: {}".format(token_sample))

Learned vocab size: 29999
Sample tokens: ['▁represent', '▁truss', '▁practised', '▁interacting', '▁pare', '▁guiding', '▁fen', 'hoo', '▁believers', '▁stanford']
