In [1]:
import pandas as pd
import h2o
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Load the per-candidate tweet dumps written by the earlier "TweetGetter" notebook.
trump_tweets = pd.read_csv('realdonaldtrump_tweets.csv')
clinton_tweets = pd.read_csv('hillaryclinton_tweets.csv')

# Tag every row with its author so the label survives the merge below.
trump_tweets["tweets_author"] = "Trump"
clinton_tweets["tweets_author"] = "Clinton"


def _as_unicode(value):
    """Return `value` as text, decoding UTF-8 only when it is raw bytes.

    Calling ``.decode`` unconditionally only works on Python 2 byte strings;
    on Python 3 ``read_csv`` already yields ``str`` and ``.decode`` raises
    AttributeError. Non-bytes values are passed through unchanged.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


# Stack both frames; ignore_index=True yields a fresh 0..n-1 index directly,
# so no reset_index()/drop("index") round-trip is needed.
combined_tweets = pd.concat([trump_tweets, clinton_tweets], ignore_index=True)

# Build the cleaned frame from an explicit copy (avoids SettingWithCopyWarning)
# and keep only the three columns used downstream, in their original order.
tweets = combined_tweets[["tweets_author"]].copy()
tweets["tweets_id"] = combined_tweets["id"].astype("str")  # ids as strings, not numbers
tweets["tweets_text"] = combined_tweets["text"].apply(_as_unicode)
tweets.head()

Unnamed: 0,tweets_author,tweets_id,tweets_text
0,Trump,783436108176629760,Thank you ARIZONA! This is a MOVEMENT like nob...
1,Trump,783393314309484544,My childcare plan makes a difference for worki...
2,Trump,783391423663964160,I will be watching the great Governor @Mike_Pe...
3,Trump,783390310969651200,"Join me in Reno, Nevada tomorrow at 3:30pm! #A..."
4,Trump,783149570721144832,"Join me in Reno, Nevada on Wednesday at 3:30pm..."


In [14]:
# Take just the tweet text as the corpus to vectorize.
tfidf_tweets = tweets["tweets_text"].values

# Fit the TF-IDF vectorizer and build the (sparse) document-term matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_tweets)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Densify into a DataFrame with one column per vocabulary term. Reusing
# tweets.index guarantees each TF-IDF row lines up with its source tweet
# when the frames are joined column-wise below.
dense = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=tweets.index)

# Re-join with the original dataset, dropping the raw text first. `axis` is
# passed by keyword because pandas 2.0 no longer accepts it positionally.
output_tweets = pd.concat([tweets.drop("tweets_text", axis=1), dense], axis=1)
output_tweets.to_csv("tfidf_tweets.csv", encoding='utf-8') # Takes a long time -- very large dataset!

In [26]:
# Modeling is done with H2O from this point on.
# The same workflow can also be run in H2O Flow: once H2O is up,
# open and run the file "NLPTweets.flow".

# Connect to an H2O server (a local one is started if none is reachable).
# nthreads=-1: per the H2O docs this requests all available CPU cores.
h2o.init(nthreads=-1)

Connecting to H2O server at http://localhost:54321....... failed.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_101"; Java(TM) SE Runtime Environment (build 1.8.0_101-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.101-b13, mixed mode)
  Starting server from /Users/jaymahabal/anaconda/h2o_jar/h2o.jar
  Ice root: /var/folders/0p/j0cz67k54b51w765vbyvq69m0000gn/T/tmpBQTMOe
  JVM stdout: /var/folders/0p/j0cz67k54b51w765vbyvq69m0000gn/T/tmpBQTMOe/h2o_jaymahabal_started_from_python.out
  JVM stderr: /var/folders/0p/j0cz67k54b51w765vbyvq69m0000gn/T/tmpBQTMOe/h2o_jaymahabal_started_from_python.err
Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful!
