In [1]:
from pathlib import Path
import fastText
import sklearn
import sklearn.metrics
import numpy as np
import re

In [2]:
# Project layout, resolved relative to the notebook's working directory.
root_dir = Path("..")
data_dir = root_dir / "data"
notebook_dir = root_dir / "notebooks"
model_dir = root_dir / "model"

# Make sure the model output directory exists. `exist_ok=True` replaces the
# separate exists() check, so there is no gap between checking and creating;
# `parents=True` also creates any missing ancestor directories.
model_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Raw corpus file inside the data directory (presumably one tweet per line).
data_path = data_dir.joinpath("twitter_las_vegas_shooting")

# String forms of the corpus path and the model output path, as handed to the
# fastText calls further down.
input_filename = str(data_path)
model_filename = str(model_dir.joinpath("twitter.bin"))

# Preprocessing

In [4]:
# Preprocessing Config: each flag switches one cleaning step of preprocess()
# on or off.
preprocess_config = {
    "hashtag": True,
    "mentioned": True,
    "punctuation": True,
    "url": True,
}

# Pattern
# Raw strings keep `\w`, `\(` and `\)` as regex escapes rather than invalid
# Python string escapes (which newer Python versions warn about). The
# resulting pattern text is unchanged.
hashtag_pattern = r"#\w+"
mentioned_pattern = r"@\w+"
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Every character of trans_str is mapped to a single space by translate_table.
# '#' and '@' are not part of the set.
trans_str = "!\"$%&\'()*+,-./:;<=>?[\\]^_`{|}~" + "…"
translate_table = str.maketrans(trans_str, " " * len(trans_str))

def preprocess(s):
    """Clean one line of text according to ``preprocess_config``.

    The text is lower-cased first. Hashtags, @-mentions and URLs are then
    removed, in that order, each only when its flag is enabled. When the
    "punctuation" flag is set, every character covered by ``translate_table``
    becomes a space and runs of whitespace collapse to single spaces.

    Returns the cleaned string.
    """
    text = s.lower()

    # (config flag, regex) pairs, applied in this fixed order.
    removal_steps = (
        ("hashtag", hashtag_pattern),
        ("mentioned", mentioned_pattern),
        ("url", url_pattern),
    )
    for flag, pattern in removal_steps:
        if preprocess_config[flag]:
            text = re.sub(pattern, "", text)

    if not preprocess_config["punctuation"]:
        return text
    tokens = text.translate(translate_table).split()
    return " ".join(tokens)


In [5]:
# Example of preprocessing: run preprocess() on one raw tweet that contains a
# mention, a hashtag and two URLs; the cell displays the cleaned string.
example_twitter = "RT @TheLeadCNN: Remembering Keri Lynn Galvan, from Thousand Oaks, California. #LasVegasLost https://t.co/QuvXa6WvlE https://t.co/hDF2d3Owgn"
preprocess(example_twitter)

'rt remembering keri lynn galvan from thousand oaks california'

In [6]:
# Preprocessing: clean the raw corpus line by line and write the result to a
# sibling file next to it.
preprocessed_data_path = data_dir / "twitter_las_vegas_shooting.preprocessed"

# The encoding is pinned to UTF-8 on both sides: the text contains non-ASCII
# characters (e.g. "…"), and without an explicit encoding the result would
# depend on the platform's default locale encoding.
with data_path.open(encoding="utf-8") as f:
    lines = [raw_line.strip() for raw_line in f]

with preprocessed_data_path.open("w", encoding="utf-8") as f:
    for line in lines:
        # One cleaned line per input line, always newline-terminated.
        f.write(preprocess(line))
        f.write("\n")

# use preprocessed data as input
input_filename = str(preprocessed_data_path)

# Training

In [7]:
# fastText Config
# Each name below is forwarded to fastText.train_unsupervised as the keyword
# argument of the same name (embedding_model is passed as `model`).

# model type and training objective
embedding_model = "skipgram"
loss = "ns"
neg = 5

# vector size, context window, frequency cut-offs and n-gram settings
dim = 100
ws = 5
minCount, minCountLabel = 5, 0
minn, maxn = 3, 6
wordNgrams = 1
bucket = 2000000

# learning-rate schedule, epochs and sampling threshold
lr = 0.05
lrUpdateRate = 100
epoch = 5
t = 1e-4

# execution
thread = 12
verbose = 2

In [8]:
# Train unsupervised embeddings on the file named by input_filename, using the
# hyperparameters from the config cell. All arguments are passed by keyword,
# so their order here is only for readability.
model = fastText.train_unsupervised(
    # data, model type and objective
    input=input_filename,
    model=embedding_model,
    loss=loss,
    neg=neg,
    # vector size, context window, frequency cut-offs and n-gram settings
    dim=dim,
    ws=ws,
    minCount=minCount,
    minCountLabel=minCountLabel,
    minn=minn,
    maxn=maxn,
    wordNgrams=wordNgrams,
    bucket=bucket,
    # learning-rate schedule, epochs and sampling threshold
    lr=lr,
    lrUpdateRate=lrUpdateRate,
    epoch=epoch,
    t=t,
    # execution
    thread=thread,
    verbose=verbose,
)

# Write the trained model to model_filename so it can be reloaded later.
model.save_model(model_filename)

In [9]:
# Load saved model if needed: rebinds `model` to the model stored at
# model_filename (presumably so the query cells can run without retraining).
model = fastText.load_model(model_filename)

# Query

In [10]:
# Vocabulary known to the model, as a numpy array so it can be indexed with
# an array of positions.
words = np.array(model.get_words())

# One embedding per vocabulary entry: row i of word_vectors belongs to words[i].
word_vectors = np.array(list(map(model.get_word_vector, words)))

In [11]:
def calc_n_cosine_neighbor(inX, X, N):
    """Rank the rows of ``X`` by cosine distance to a single query vector.

    inX: query vector; a 1-D array is wrapped into a one-row batch
    X: candidate vectors, one per row
    N: number of nearest rows to return

    Returns ``(indices, distances)``: the indices of the ``N`` rows of ``X``
    with the smallest distance (closest first), and the full distance matrix
    produced by ``pairwise_distances(X, inX)``.
    """
    query_batch = [inX] if inX.ndim == 1 else inX
    dist = sklearn.metrics.pairwise.pairwise_distances(
        X, query_batch, metric="cosine")
    # Flatten the distance column before sorting; this reshape only succeeds
    # when there is exactly one query row.
    ranking = np.argsort(dist.reshape(dist.shape[0]))
    return ranking[:N], dist

def nn(query, words=words, word_vectors=word_vectors, k=10):
    """Return the ``k`` entries of ``words`` closest to ``query`` by cosine distance.

    query: word to embed with the module-level ``model``
    words: numpy array of words, indexed in step with the rows of ``word_vectors``
    word_vectors: candidate vectors to search, one per row
    k: (optional, 10 by default) number of neighbours to return
    """
    # `model` is only read here, so it resolves to the module-level binding
    # without needing a `global` declaration.
    query_vector = model.get_word_vector(query)
    nearest, _distances = calc_n_cosine_neighbor(query_vector, word_vectors, k)
    return words[nearest]

In [12]:
# Show the 10 vocabulary entries nearest to "lasvegasshooting" by cosine distance.
nn("lasvegasshooting", k=10)

array(['shooting', 'lasvegas', 'vegas”', 'vegas', 'las', 'vega',
       '“shooting”', 'shootin', 'shooting”', '</s>'], dtype='<U23')