## Load Data Using Spark

In [169]:
import numpy as np
import pandas as pd
import json
import re

In [170]:
import matplotlib.pyplot as plt
%matplotlib inline

In [171]:
video_game_tweets_training = pd.read_pickle("saved_pickles/video_game_tweets_training.pkl")
video_game_tweets_validation = pd.read_pickle("saved_pickles/video_game_tweets_validation.pkl")

In [172]:
video_game_tweets = pd.concat([video_game_tweets_training, video_game_tweets_validation])

# Get Video Game Keyword List

In [173]:
video_game_keyword_list = open("Video_Game_Keyword_List.txt", "r").read().split("\n")
video_game_keyword_list.remove('')

In [174]:
from sklearn.externals import joblib

vectorizer = joblib.load("saved_pickles/lsa_vectorizer.pkl")
svd = joblib.load("saved_pickles/lsa_svd.pkl")

In [175]:
from lib.lsa import run_lsa

video_game_tweets_lsa = run_lsa(video_game_tweets, vectorizer, svd)

#video_game_tweets_training_lsa = run_lsa(video_game_tweets_training, vectorizer, svd)
#video_game_tweets_validation_lsa = run_lsa(video_game_tweets_validation, vectorizer, svd)

# Relative Prevalence of Keywords

In [193]:
keyword_classes = video_game_tweets_training["keyword_matches"]

keyword_relevance_df = pd.DataFrame([{"Keyword":word, \
                                      "Percentage":100.0 * np.mean(keyword_classes == word),
                                      "Total":np.sum(keyword_classes == word)} \
                                     for word in list(set(keyword_classes))]).sort_values("Percentage",
                                                                                          ascending=False)

keyword_relevance_df.head(10)


Unnamed: 0,Keyword,Percentage,Total
7,Zelda,35.056,8764
61,Overwatch,11.116,2779
79,Pokemon,7.452,1863
57,Minecraft,5.26,1315
29,Mario,5.196,1299
39,Halo,3.072,768
16,Sonic,3.052,763
8,Mass Effect,2.98,745
22,Resident Evil,2.592,648
76,Call of Duty,2.108,527


# Group Tweets by Keyword Match

In [176]:
keyword_corpus = pd.DataFrame(video_game_tweets.groupby("keyword_matches")["cleaned_text"].agg(lambda x: " ".join(x)))

In [177]:
keyword_corpus.head()

Unnamed: 0_level_0,cleaned_text
keyword_matches,Unnamed: 1_level_1
Angry Birds,blast level 284 blast level 285 blast level 28...
Animal Crossing,"what is when i play , i use pinterest to find ..."
Assassin's Creed,if you missed it won the vote. brings back mem...
BioShock,how quickly can you complete a hacking puzzle?...
Block Breaker,art - hiro hamada - i love this film so much a...


# Perform Count Vectorization

In [178]:
from sklearn.feature_extraction.text import CountVectorizer

In [179]:
cv = CountVectorizer(max_df = 0.25, min_df = 2, stop_words=None)

In [180]:
cv.fit(keyword_corpus["cleaned_text"])
video_game_tweets_cv = pd.DataFrame(cv.transform(keyword_corpus["cleaned_text"]).todense())
#video_game_tweets_cv.columns = cv.get_feature_names

In [181]:
video_game_tweets_cv.columns = cv.get_feature_names()

In [182]:
cv.fit_transform(keyword_corpus["cleaned_text"])

<86x9821 sparse matrix of type '<type 'numpy.int64'>'
	with 49120 stored elements in Compressed Sparse Row format>

In [183]:
video_game_tweets_cv.head()

Unnamed: 0,00,000,006,007,01,02,03,04,05,06,...,zone,zoned,zones,zoom,ʸᵒᵘ,обзор,персонаж,דרך,منارة_التقنية,パズドラ
0,0,0,0,0,0,0,13,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Generate a Clustering Algorithm

In [184]:
from lib.lsa import lsa_pipeline

In [185]:
keyword_corpus_lsa, vectorizer, svd = lsa_pipeline(keyword_corpus, max_df = 0.5, min_df = 2,
                                                   stop_words="english", num_components=50)

In [186]:
keyword_corpus_lsa.shape

(86, 50)

In [187]:
keyword_corpus.index.shape

(86,)

In [188]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score

In [136]:
#tweet_subset = video_game_tweets_training_lsa[video_game_tweets_training["keyword_matches"].apply( \
#                                                        lambda word: word in restricted_keyword_classes)]

silhouettes = []

for n in range(2,21):

    kmeans = KMeans(n_clusters= n)
    kmeans.fit(keyword_corpus_lsa)

    game_labels = kmeans.predict(keyword_corpus_lsa)
    score = silhouette_score(keyword_corpus_lsa, game_labels)

    silhouettes.append({"N":n, "Silhouette Score":score})

silhouettes_df = pd.DataFrame(silhouettes)

In [137]:
silhouettes_df

Unnamed: 0,N,Silhouette Score
0,2,-0.006199
1,3,-0.01428
2,4,-0.019014
3,5,0.001444
4,6,0.000443
5,7,0.021026
6,8,0.017772
7,9,-0.001456
8,10,0.020238
9,11,0.023304


In [150]:
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(keyword_corpus_lsa)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [151]:
keyword_corpus["Cluster"] = kmeans.predict(keyword_corpus_lsa)

In [152]:
{i:list(keyword_corpus[keyword_corpus["Cluster"] == i].index) for i in range(num_clusters)} 

{0: ['Animal Crossing',
  'Metroid',
  'Shadow of the Colossus',
  'Splatoon',
  'Xenogears',
  'Zelda'],
 1: ['Call of Duty',
  'God of War',
  'Grand Theft Auto',
  'Kingdom Hearts',
  'Madden NFL',
  'Skyrim',
  'Star Wars: Battlefront',
  'The Elder Scrolls',
  'Uncharted'],
 2: ['Chrono Trigger',
  'EarthBound',
  'Final Fantasy',
  'Mass Effect',
  'Mortal Combat',
  'Shenmue',
  'SoulCalibur',
  'Star Fox',
  'Tomb Raider'],
 3: ["Assassin's Creed", 'Last of Us', 'Silent Hill', 'Street Fighter'],
 4: ['Candy Crush',
  'Dark Souls',
  'Deus Ex',
  'Diablo',
  'Donkey Kong',
  'Duck Hunt',
  'Fire Emblem',
  'Fruit Ninja',
  'Galaga',
  'Galaxian',
  "Garry's Mod",
  'Halo',
  'Ico',
  'Lemmings',
  'Mario',
  'Minecraft',
  'Monster Hunter',
  'Morrowind',
  'Myst',
  'Need for Speed',
  'Overwatch',
  'Pac-Man',
  'Perfect Dark',
  'Pong',
  'Portal',
  'Sonic',
  'StarCraft',
  'Super Smash',
  'Terraria',
  'The Sims',
  'Wii Fit',
  'Wii Sports'],
 5: ['Metal Gear'],
 6: ['Ca

In [None]:
cluster_ordering = pd.DataFrame([{i:word for i,word in \
    enumerate(sorted([(word, 1.0 * np.sum((video_game_tweets_training["Cluster"] == j) &
                                          (video_game_tweets_training["keyword_matches"] == word)) /
                       np.sum(video_game_tweets_training["keyword_matches"] == word)) for word in list(set(keyword_classes))],
                    reverse = True, key=lambda x: x[1]))}
                                 for j in range(kmeans.n_clusters)]
                               ).T

In [189]:
video_game_tweets_training

Unnamed: 0,text,keyword_matches,cleaned_text,game_related_tweet
0,Zelda's super neat but I've experienced more s...,Zelda,'s super neat but i've experienced more severe...,1
1,RT @NintendoAmerica: The adventure begins. #Ni...,Zelda,the adventure begins.,1
2,.@Ingeborgburger @Derpfield you two know me so...,Pokemon,. you two know me so well you both sent me tot...,1
6,RT @jacksfilms: Still in shock over IGN's Zeld...,Zelda,still in shock over ign's review,1
7,RT @AngryJoeShow: Just got back home! So you h...,Zelda,just got back home! so you have an idea this i...,1
8,RT @NintendoAmerica: The adventure begins. #Ni...,Zelda,the adventure begins.,1
9,Has #Overwatch Had Its Release Date Leaked? #O...,Overwatch,has had its release date leaked?,1
12,Overwatch Cinematic Teaser: Are You With Us? #...,Overwatch,cinematic teaser: are you with us?,1
13,How To Spot The Difference Between Battleborn ...,Overwatch,how to spot the difference between battleborn and,1
15,"RT @MelSmithJones: I analyze providers, and @S...",Portal,"i analyze providers, and is the most promising...",1


In [190]:
video_game_tweets_grouped =\
    video_game_tweets_training[["keyword_matches", "cleaned_text"]].groupby("keyword_matches").agg("sum")

In [191]:
video_game_tweets_grouped["cleaned_text"]

keyword_matches
Angry Birds               blast level 284blast level 285blast level 286a...
Animal Crossing           what iswhen i play , i use pinterest to find s...
Assassin's Creed          if you missed it won the vote. brings back mem...
BioShock                  how quickly can you complete a hacking puzzle?...
Call of Duty              infinite warfare attempting to get the wonder ...
Candy Crush               saga level viai love , but i love csgospeed ev...
Castlevania               i liked a video from sotn - the tragic prince ...
Chrono Trigger            - ️- ️- ️- ️- ️- ️- ️- ️- ️- ️- ️- ️- ️- ️- ️-...
Counter-Strike            no. global offensive!)i love seeing the commun...
Crysis                    remember the time when v. 20 went thorough a o...
Dark Souls                omfg give me a lightsaber with combat and i ca...
Deus Ex                   "a well-designed level that forces you out of ...
Diablo                    something new needs to happen. such boring con

In [153]:
tsne = TSNE(n_components=num_clusters)

In [154]:
tsne.fit(keyword_corpus_lsa)

TSNE(angle=0.5, early_exaggeration=4.0, init='random', learning_rate=1000.0,
   method='barnes_hut', metric='euclidean', min_grad_norm=1e-07,
   n_components=10, n_iter=1000, n_iter_without_progress=30,
   perplexity=30.0, random_state=None, verbose=0)

In [155]:
tweets_tsne = tsne.fit_transform(keyword_corpus_lsa)