In [1]:
import findspark
# my local spark install
# findspark.init() is what makes the pyspark package from this Spark install
# importable, so it has to run before the pyspark imports below.
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine with Spark unpacked at exactly this location.
findspark.init('/Users/dreyco676/spark-1.6.0-bin-hadoop2.6/')

import pyspark
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
# NOTE(review): wildcard import; none of the cells visible here use a name
# from pyspark.sql.types (or `udf` above) -- confirm before pruning.
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Entry points shared by every later cell: `sc` for RDD operations and
# `sqlContext` for building DataFrames from those RDDs.
# NOTE(review): this cell constructs a new SparkContext each time it runs;
# presumably re-running it on a live kernel fails while the first context is
# still active -- restart the kernel (or sc.stop()) first.
sc = pyspark.SparkContext(appName="tweet_classifier")
sqlContext = SQLContext(sc)

In [2]:
# Read the training file as an RDD of raw text lines and split every line on
# tab characters.
lines = sc.textFile("/Users/dreyco676/nlp_spark/data/cleaned_training.txt")
parts = lines.map(lambda line: line.split("\t"))
# Discard malformed records: only lines that split into exactly two fields
# are kept.
garantee_col = parts.filter(lambda fields: len(fields) == 2)
# Turn each two-element list into a 2-tuple so it can back a DataFrame row.
training = garantee_col.map(lambda pair: tuple(pair))
# Name the two tuple positions when building the DataFrame.
training_df = sqlContext.createDataFrame(
    training,
    ["tweet", "classification"],
)

In [4]:
# Stage 1 -- split each tweet string into a list of word tokens.
tokenizer = Tokenizer(
    inputCol="tweet",
    outputCol="words",
)
wordsData = tokenizer.transform(training_df)

# Stage 2 -- hash the tokens into a fixed-length term-frequency vector.
# With only 20 buckets, distinct words will frequently share an index.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)
featurizedData = hashingTF.transform(wordsData)

# Stage 3 -- fit inverse-document-frequency weights on the training term
# frequencies, then apply them to produce the final "features" column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Print the first three (features, classification) rows as a sanity check.
preview_rows = rescaledData.select("features", "classification").take(3)
for features_label in preview_rows:
    print(features_label)

Row(features=SparseVector(20, {8: 1.0795, 13: 1.4519, 19: 1.4843}), classification='python')
Row(features=SparseVector(20, {1: 1.3161, 5: 1.1959, 7: 1.2182, 8: 1.0795, 9: 1.3429, 13: 1.4519, 17: 1.3048}), classification='python')
Row(features=SparseVector(20, {0: 2.2837, 1: 1.3161, 4: 1.3906, 7: 2.4364, 8: 1.0795, 9: 1.3429, 10: 0.9202, 16: 1.364, 18: 1.2026, 19: 1.4843}), classification='python')


In [5]:
# Load the unclassified tweets: read the file as an RDD of text lines and
# split every line on tab characters.
lines = sc.textFile("/Users/dreyco676/nlp_spark/data/cleaned_unclassified.txt")
parts = lines.map(lambda l: l.split("\t"))
# Filter bad rows out: keep only lines that split into exactly two fields.
garantee_col = parts.filter(lambda l: len(l) == 2)
unclassified = garantee_col.map(lambda p: (p[0], p[1]))
# Create the DataFrame from the `unclassified` RDD built above. (This used to
# pass `training`, which silently produced a copy of the training data with
# its class labels sitting in a column named "tweet_id".)
unclassified_df = sqlContext.createDataFrame(unclassified, ["tweet", "tweet_id"])

In [None]:
# Featurize the unclassified tweets with the same `tokenizer` and `hashingTF`
# stages and -- importantly -- the same fitted `idfModel` that the training
# cell produced. Fitting a fresh IDF on this data set would yield different
# per-term weights, so the resulting "features" vectors would not be
# comparable with the ones a classifier is trained on.
# Requires the training featurization cell to have been run first.
wordsData = tokenizer.transform(unclassified_df)
featurizedData = hashingTF.transform(wordsData)
# No idf.fit() here on purpose: only apply the training-set IDF weights.
rescaledData = idfModel.transform(featurizedData)
# Print the first three (features, tweet_id) rows as a sanity check.
for features_label in rescaledData.select("features", "tweet_id").take(3):
    print(features_label)