In [1]:
import os

import findspark
# Locate the Spark install: honor the standard SPARK_HOME environment variable
# when it is set, otherwise fall back to the original author's local install.
findspark.init(os.environ.get('SPARK_HOME', '/Users/dreyco676/spark-1.6.0-bin-hadoop2.6/'))

import pyspark
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import HashingTF, Tokenizer

# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same kernel raises a ValueError.
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [4]:
# Read the raw training file; each line is split on tabs into its fields.
lines = sc.textFile("/Users/dreyco676/nlp_spark/data/cleaned_training.txt")
parts = lines.map(lambda raw_line: raw_line.split("\t"))
# Drop malformed rows: only lines with exactly three fields are kept.
garantee_col = parts.filter(lambda fields: len(fields) == 3)
# Turn each three-element list into a tuple so it can back a DataFrame row.
training = garantee_col.map(lambda fields: tuple(fields))
# Build the DataFrame, naming the three fields in file order.
text_label_df = sqlContext.createDataFrame(
    training,
    ["text", "id", "text_label"],
)

In [5]:
# convert the text label into a numeric one
def numeric_label(data_str):
    """Map a text class label to the numeric label used for training.

    The comparison is case-insensitive and ignores surrounding whitespace.

    Args:
        data_str: Text label such as 'python', 'hadoop' or 'datasci'.
            May be None.

    Returns:
        float: 0.0 for 'python', 1.0 for 'hadoop', 2.0 for 'datasci', and
        3.0 for anything else (including None and the empty string).
    """
    known_labels = {'python': 0.0, 'hadoop': 1.0, 'datasci': 2.0}
    if data_str is None:
        # A null column value would otherwise raise AttributeError in the UDF.
        return 3.0
    # strip() also removes a stray carriage return left on the last
    # tab-separated field when the input file uses CRLF line endings.
    return known_labels.get(data_str.strip().lower(), 3.0)

# Wrap the Python mapper as a Spark SQL UDF that returns doubles.
numeric_label_udf = udf(numeric_label, DoubleType())

# Append the numeric "label" column, then keep only id, text and label.
num_label_df = text_label_df.withColumn(
    "label",
    numeric_label_udf(text_label_df.text_label),
)
data_set = num_label_df.select("id", "text", "label")

In [9]:
# split training & validation sets with 60% to training and use a seed value of 1987
splits = data_set.randomSplit([0.6, 0.4], 1987)
training_df = splits[0]
test_df = splits[1]


def _to_tsv_line(row):
    """Format one (id, text, label) row as a single tab-separated line."""
    # No outer str(): on Python 2 it would ASCII-encode the unicode text and
    # raise UnicodeEncodeError for any non-ASCII character.
    return row[0] + '\t' + row[1] + '\t' + str(row[2])


# Go through .rdd explicitly: DataFrame.map was removed in Spark 2.0, while
# DataFrame.rdd.map is available on both 1.x and 2.x.
save_format = training_df.rdd.map(_to_tsv_line)
save_format.saveAsTextFile('data/clean_training_split')
save_format = test_df.rdd.map(_to_tsv_line)
save_format.saveAsTextFile('data/clean_test_split')

In [None]:
# Configure an ML pipeline with three stages: tokenizer, hashingTF, and nb.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),  # feed on whatever the tokenizer emits
    outputCol="features",
)
nb = NaiveBayes(modelType="multinomial", smoothing=1.0)
pipeline_stages = [tokenizer, hashingTF, nb]
pipeline = Pipeline(stages=pipeline_stages)

# Train all stages on the training split.
model = pipeline.fit(training_df)

In [None]:
# Make predictions on test documents and print columns of interest.
# Besides "prediction", the transformed frame also exposes the "probability"
# and "rawPrediction" columns.
result = model.transform(test_df)

selected = result.select("id", "text", "prediction")
# NOTE(review): collect() pulls every selected row back to the driver.
for row in selected.collect():
    print(row)