## Load Data Using Spark

In [4]:
import numpy as np
import pandas as pd
import json
import re

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
from lib.spark_mongo_interface import load_tweets_from_mongo

In [7]:
# Fetch the two tweet collections by name via load_tweets_from_mongo.
video_game_tweets = load_tweets_from_mongo(collection="video_game_tweets")
unfiltered_tweets = load_tweets_from_mongo(collection="unfiltered_tweets")

# Get Video Game Keyword List

In [None]:
from keyword_handling import restrict_by_keywords
from keyword_handling import keywords_in_tweet

# Read the keyword file, one keyword per line.
# The context manager closes the file handle as soon as the text is read.
# Filtering on non-empty entries drops every blank line (not just the first)
# and does not raise when the file has no blank line at all, which
# `list.remove('')` would do.
with open("Video_Game_Keyword_List.txt", "r") as keyword_file:
    video_game_keyword_list = [
        keyword for keyword in keyword_file.read().split("\n") if keyword != ''
    ]

# Apply the keyword restriction to both corpora.
# NOTE(review): the second positional argument (1 for the video-game set, 0 for
# the unfiltered set) is defined by restrict_by_keywords — presumably a
# keep/exclude or minimum-match setting; confirm against keyword_handling.
video_game_tweets = restrict_by_keywords(video_game_tweets, 1, video_game_keyword_list)
unfiltered_tweets = restrict_by_keywords(unfiltered_tweets, 0, video_game_keyword_list)

In [None]:
# Checkpoint the keyword-restricted frames before any cleaning is applied.
for frame, pickle_path in (
    (video_game_tweets, "saved_pickles/unprocessed_video_game_tweets.pkl"),
    (unfiltered_tweets, "saved_pickles/unprocessed_unfiltered_tweets.pkl"),
):
    frame.to_pickle(pickle_path)

# Get the Dictionary of Proper Names

In [None]:
from name_dictionary import get_proper_name_dict
# Build the proper-name dictionary that the cleaning cell later passes to
# clean_tweet_df as `name_dict`.
# NOTE(review): the meaning of the argument 500 is not visible from this
# notebook (presumably a size or frequency cut-off) — confirm against
# name_dictionary.get_proper_name_dict.
proper_names_dict = get_proper_name_dict(500)

# Tweet Cleaning from Script Functions

In [None]:
from tweet_cleaning import clean_tweet_df

# Both corpora are cleaned with exactly the same options, so the options are
# spelled out once and unpacked into each call.
shared_cleaning_options = dict(
    keyword_list=video_game_keyword_list,
    name_dict=proper_names_dict,
    remove=True,
)

video_game_tweets = clean_tweet_df(video_game_tweets, **shared_cleaning_options)
unfiltered_tweets = clean_tweet_df(unfiltered_tweets, **shared_cleaning_options)

In [None]:
# Checkpoint the cleaned frames so later stages can start from disk.
for frame, pickle_path in (
    (video_game_tweets, "saved_pickles/cleaned_video_game_tweets.pkl"),
    (unfiltered_tweets, "saved_pickles/cleaned_unfiltered_tweets.pkl"),
):
    frame.to_pickle(pickle_path)

# Add Classes and Merge Dataframes into Training and Validation Sets 

In [None]:
# Attach the binary class label: 1 for the video-game corpus, 0 for the
# unfiltered corpus.
# The column was previously named "saved_pickles/game_related_tweet"; the
# directory prefix belongs to the pickle paths, not to a column name, and looks
# like a search-and-replace artifact.
# NOTE(review): any consumer that reads the old column name must be updated.
video_game_tweets["game_related_tweet"] = 1
unfiltered_tweets["game_related_tweet"] = 0

In [None]:
# Number of leading rows taken from each corpus for the training split; every
# remaining row goes to the validation split.
TRAINING_ROWS_PER_CLASS = 25000

# Positional slicing keeps the two splits disjoint for any frame length.
# The previous `.tail(n_rows - 25000)` received a negative count whenever a
# frame held fewer than 25000 rows; pandas then returns all rows except the
# first |n|, which overlap the training head and leak training rows into
# validation. With `iloc` the validation split is simply empty in that case.
video_game_tweets_training = video_game_tweets.iloc[:TRAINING_ROWS_PER_CLASS]
unfiltered_tweets_training = unfiltered_tweets.iloc[:TRAINING_ROWS_PER_CLASS]
video_game_tweets_validation = video_game_tweets.iloc[TRAINING_ROWS_PER_CLASS:]
unfiltered_tweets_validation = unfiltered_tweets.iloc[TRAINING_ROWS_PER_CLASS:]

# Persist each per-class split.
video_game_tweets_training.to_pickle("saved_pickles/video_game_tweets_training.pkl")
unfiltered_tweets_training.to_pickle("saved_pickles/unfiltered_tweets_training.pkl")
video_game_tweets_validation.to_pickle("saved_pickles/video_game_tweets_validation.pkl")
unfiltered_tweets_validation.to_pickle("saved_pickles/unfiltered_tweets_validation.pkl")

In [None]:
# Stack the two classes row-wise for each split; the original row index of
# every frame is kept as-is.
training_parts = [video_game_tweets_training, unfiltered_tweets_training]
validation_parts = [video_game_tweets_validation, unfiltered_tweets_validation]

all_tweets_training = pd.concat(training_parts)
all_tweets_validation = pd.concat(validation_parts)

# Persist the merged training and validation sets.
all_tweets_training.to_pickle("saved_pickles/all_tweets_training.pkl")
all_tweets_validation.to_pickle("saved_pickles/all_tweets_validation.pkl")