In [1]:
# NLTK Tokenizer
from nltk import word_tokenize
from nltk.corpus import stopwords

import numpy as np
import pandas as pd

# Word2Vec Implementation
from gensim.models import Word2Vec

from collections import OrderedDict

import multiprocessing as mp
from contextlib import closing
from itertools import chain
from glob import glob

from tqdm import tqdm



In [2]:
def load_comments(filepath):
    """Load every CSV file matching a glob pattern into one DataFrame.

    Parameters
    ----------
    filepath : str
        Glob pattern selecting the CSV files to read (e.g. ``"data/nba*"``).

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files, concatenated in sorted-path order.
        Each file keeps its own row index, so index labels may repeat
        across files.

    Raises
    ------
    ValueError
        If the pattern matches no files.
    """
    # glob() yields paths in arbitrary, filesystem-dependent order; sorting
    # makes the row order (and anything derived from it) repeatable.
    all_files = sorted(glob(filepath))
    if not all_files:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError("No files matched pattern: {!r}".format(filepath))
    return pd.concat([pd.read_csv(f) for f in all_files])

# Pull every file matching data/nba* into a single DataFrame.
comment_df = load_comments("data/nba*")

# Report the row count as a quick check that the load picked up the data.
print("Total number of comments found: {}".format(comment_df.shape[0]))

Total number of comments found: 10916598


In [3]:
def tokenize_comments(comments):
    """Lower-case, tokenize and stop-word-filter a sequence of comments.

    Parameters
    ----------
    comments : iterable
        Raw comment bodies. Non-string values are converted with ``str``;
        missing values (``None`` / NaN) produce an empty token list.

    Returns
    -------
    list of list of str
        One token list per input comment, in input order, with English
        stop words removed. The output always has the same length as the
        input.
    """
    stopset = set(stopwords.words('english'))
    tokenized_comments = []
    for comment in tqdm(comments):
        # pandas represents an empty CSV cell as float NaN; str(NaN) would
        # inject the bogus token "nan" into the corpus, so emit no tokens
        # for missing comments instead (NaN is the only float != itself).
        if comment is None or (isinstance(comment, float) and comment != comment):
            tokenized_comments.append([])
            continue
        tokens = word_tokenize(str(comment).lower())
        tokenized_comments.append([w for w in tokens if w not in stopset])
    return tokenized_comments

# Tokenize the 'body' column of every loaded comment. The progress bar below
# shows this pass took roughly 39 minutes on the full data set.
tokenized_comments = tokenize_comments(comment_df['body'].values)

100%|██████████| 10916598/10916598 [39:09<00:00, 4645.56it/s]


In [4]:
# Report how many token lists were produced (one per input comment).
print("Completed tokenizing {} comments".format(len(tokenized_comments)))

Completed tokenizing 10916598 comments


In [5]:
# Train 300-dimensional Word2Vec embeddings on the tokenized comments, using
# one worker per CPU core. `size=` is the gensim 3.x spelling of the vector
# dimensionality.
# NOTE(review): no seed is set and training is multi-threaded, so the
# resulting vectors are presumably not reproducible run-to-run -- confirm
# whether that matters here.
model = Word2Vec(
    sentences=tokenized_comments,
    size=300,
    min_count=5,
    workers=mp.cpu_count(),
)
print("Vocabulary Size: {}".format(len(model.wv.vocab)))

Vocabulary Size: 151550


In [60]:
def write_files(model, model_filename="model.tsv", metadata_filename="metadata.tsv",
                include_header=True):
    """Export word vectors and their labels as two parallel TSV files.

    Row ``i`` of the vector file and label ``i`` of the metadata file refer
    to the same word.

    Parameters
    ----------
    model : object
        A trained Word2Vec model (anything exposing ``.wv``) or the word
        vectors object itself.
    model_filename : str
        Destination of the vectors: one line per word, components separated
        by tabs.
    metadata_filename : str
        Destination of the labels: one UTF-8 encoded word per line.
    include_header : bool
        When true (the default, matching the historical output) the metadata
        file starts with a ``Word`` header line.
        NOTE(review): the TensorFlow Embedding Projector expects
        single-column metadata to have no header row -- pass
        ``include_header=False`` if that tool is the consumer.
    """
    import os

    # Accept either the full model or bare vectors; indexing the Word2Vec
    # object directly (model[key]) is deprecated in gensim 3.x.
    vectors = getattr(model, "wv", model)

    # gensim 4 exposes the vocabulary as `index_to_key`; gensim 3.x as `vocab`.
    words = getattr(vectors, "index_to_key", None)
    words = list(vectors.vocab.keys()) if words is None else list(words)

    # Create missing output directories instead of failing inside open().
    for path in (model_filename, metadata_filename):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    with open(model_filename, "w") as model_file:
        for word in words:
            model_file.write('\t'.join(map(str, vectors[word])))
            model_file.write('\n')
    print("Model file successfully written to: {}".format(model_filename))

    # newline="\n" keeps the bytes identical to the previous binary-mode
    # writer on every platform.
    with open(metadata_filename, "w", encoding="utf-8", newline="\n") as metadata_file:
        if include_header:
            metadata_file.write('Word\n')
        for word in words:
            metadata_file.write(word)
            metadata_file.write('\n')
    print("Metadata file successfully written to: {}".format(metadata_filename))
    
# Export the trained embeddings and their word labels as TSV files under output/.
write_files(model, "output/model.tsv", "output/metadata.tsv")

Metadata file successfully written to: output/metadata.tsv


In [7]:
# Persist the trained model to disk so later sessions can skip retraining.
# NOTE(review): presumably the models/ directory must already exist -- confirm.
model.save("models/nba")

In [129]:
# Default set of player nicknames ranked by determine_rankings.
_DEFAULT_PLAYERS = (
    'MJ', 'Magic', 'KD', 'Steph', 'LeBron', 'Harden', 'CP3', 'Giannis', 'Draymond', 'Klay', 'Rose',
    'Melo', 'Westbrook', 'Stockton', 'Iverson', 'Embiid', 'Duncan', 'Garnett', 'Pierce', 'Kobe', 'Shaq',
    'Kareem', 'Wilt', 'Scalabrine', 'Worthy', 'Malone', 'Ewing', 'Dwight', 'Barkley', 'Lillard', 'Wall',
    'Beal', 'Simmons', 'Fultz', 'Lonzo', 'Humphries', 'Odom', 'Bennett', 'Oden', 'Gay', 'Foye', 'Jahlil',
    'Emeka', 'MKG', 'Kemba', 'MCW', 'Rondo', 'Blake', 'Nate', 'Iggy', 'Oladipo', 'Tatum', 'Flynn', 'Rubio',
    'Wade', 'Jimmy', 'KAT', 'Wiggins', 'Korver', 'Deng', 'Redick', 'Webber', 'Rivers', 'Kwame', 'AD', 'Javale',
    'Mozgov', 'Lowry', 'Tristan', 'JR', 'Monta', 'Derozan',
)


def determine_rankings(term, limit=15, sort_desc=True, threshold=0.2, players=None, vectors=None):
    """Print players ranked by how strongly their name relates to ``term``.

    The score of a player is the absolute value of the embedding similarity
    between the lower-cased player name and the lower-cased term, so a
    strongly negative similarity ranks as high as a strongly positive one.

    Parameters
    ----------
    term : str
        Word to compare every player against.
    limit : int
        Only the first ``limit`` players of the ranking are considered.
    sort_desc : bool
        True ranks the highest scores first and prints scores that are
        ``>= threshold``; False ranks the lowest scores first and prints
        scores that are ``<= threshold``.
    threshold : float
        Score cut-off, interpreted according to ``sort_desc``.
    players : iterable of str, optional
        Names to rank. Defaults to the built-in player list.
    vectors : object, optional
        Object providing ``similarity(w1, w2)`` and ``in`` membership tests
        (a model exposing ``.wv`` is also accepted). Defaults to the
        notebook-level ``model``.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    KeyError
        If ``term`` is not in the vocabulary.
    """
    # Work on the word vectors whether we were handed the full model or the
    # bare vectors; the similarity helpers on the Word2Vec object itself are
    # deprecated in gensim 3.x.
    source = model if vectors is None else vectors
    vectors = getattr(source, "wv", source)
    if players is None:
        players = _DEFAULT_PLAYERS

    query = term.lower()
    if query not in vectors:
        raise KeyError("term '{}' is not in the model vocabulary".format(query))

    scores = {}
    skipped = []
    for player in players:
        word = player.lower()
        # A single out-of-vocabulary name should not abort the whole ranking.
        if word not in vectors:
            skipped.append(player)
            continue
        scores[player] = abs(vectors.similarity(word, query))

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=sort_desc)

    print("{} Rankings:".format(term))
    print()
    # max() guards against a negative limit slicing from the end of the list.
    for player, score in ranked[:max(limit, 0)]:
        # Apply exactly one threshold test per direction. Previously a
        # descending entry below the threshold fell through to the ascending
        # test and could be printed with a transformed (1 - score) value.
        passes = score >= threshold if sort_desc else score <= threshold
        if passes:
            print("{}: {}".format(player, score))

    if skipped:
        print()
        print("Skipped (not in vocabulary): {}".format(", ".join(skipped)))

# Show, among the 10 strongest matches, the players whose score against
# "thicc" is at least 0.2.
determine_rankings("thicc", limit=10, sort_desc=True, threshold=0.2)

thicc Rankings:

Lowry: 0.20296446289452752


In [123]:
def doesnt_belong(items):
    """Return the word that fits least with the others.

    Parameters
    ----------
    items : str or iterable of str
        Either a whitespace-separated string of words or an iterable of
        words.

    Returns
    -------
    str
        The word reported by the vectors' ``doesnt_match`` as the odd one
        out.
    """
    # `model` may be the full Word2Vec model or, after it is rebound later in
    # the notebook, the bare word vectors (which may not expose `.wv`).
    vectors = getattr(model, "wv", model)
    words = items.split() if isinstance(items, str) else list(items)
    return vectors.doesnt_match(words)

# Quick check: which of these five tokens is least like the others?
print(doesnt_belong("lonzo liangelo lavar lamelo basketball"))

basketball


In [106]:
# Nearest neighbours of "gone" in the embedding space. Query the word
# vectors rather than the model object: Word2Vec.most_similar is deprecated
# in gensim 3.x, and `model` may already be rebound to the bare vectors.
getattr(model, "wv", model).most_similar("gone")

[('go', 0.48412632942199707),
 ('gotten', 0.4789046049118042),
 ('went', 0.4782710671424866),
 ('resigned', 0.46715933084487915),
 ('re-signed', 0.4587304890155792),
 ('stayed', 0.44694089889526367),
 ('taken', 0.426818311214447),
 ('done', 0.4104292392730713),
 ('fallen', 0.3936896026134491),
 ('contended', 0.39327389001846313)]

In [None]:
# Keep a reference to just the trained word vectors.
word_vectors = model.wv
# Unbind the `model` name; the full model object can then be reclaimed once
# nothing else references it.
# NOTE(review): presumably done to free training state from memory -- confirm.
del model

In [91]:
# Rebind `model` to the bare word vectors so cells that reference `model`
# keep working after the full model was deleted.
# NOTE(review): this cell's execution count (91) is lower than that of the
# query cells above (106-129), so those presumably ran against this
# vectors-only object rather than the full Word2Vec model -- confirm.
model = word_vectors