In [1]:
import os

import findspark

# Locate the Spark installation. SPARK_HOME from the environment wins so the
# notebook runs on any machine; the original author's local install is kept
# as the fallback so existing behavior is unchanged when the variable is unset.
spark_home = os.environ.get("SPARK_HOME", '/Users/dreyco676/spark-1.6.0-bin-hadoop2.6/')
findspark.init(spark_home)

In [3]:
import pyspark
from pyspark.sql import SQLContext

# create spark contexts
# getOrCreate() reuses the already-running SparkContext when this cell is
# executed again in the same kernel; calling the constructor a second time
# raises because only one SparkContext may be active per process.
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, DoubleType
import preproc as pp
# Wrap every preproc function as a Spark SQL UDF so it can be applied to
# DataFrame columns. Each wrapper declares the Spark type of its result.

# text-in / text-out cleaning steps
check_lang_udf = udf(pp.check_lang, returnType=StringType())
remove_stops_udf = udf(pp.remove_stops, returnType=StringType())
remove_features_udf = udf(pp.remove_features, returnType=StringType())
tag_and_remove_udf = udf(pp.tag_and_remove, returnType=StringType())
lemmatize_udf = udf(pp.lemmatize, returnType=StringType())

# blank check: result is declared as a string, not a boolean
check_blanks_udf = udf(pp.check_blanks, returnType=StringType())

# label conversion: result is declared as a double
numeric_label_udf = udf(pp.numeric_label, returnType=DoubleType())

In [5]:
# Read the raw file as lines of text and split every line on tabs.
data_rdd = sc.textFile("data/raw_classified.txt")
parts_rdd = data_rdd.map(lambda line: line.split("\t"))
# Discard malformed rows: only lines with exactly three fields are kept.
garantee_col_rdd = parts_rdd.filter(lambda fields: len(fields) == 3)
# Build the DataFrame, naming the three fields in file order.
data_df = sqlContext.createDataFrame(
    garantee_col_rdd,
    ["text", "id", "text_label"],
)

In [None]:
# Tag each row with the language returned by pp.check_lang and keep only the
# rows tagged "en". (The confidence cut-off, described as 90%, lives inside
# preproc.check_lang and is not visible here.)
lang_df = data_df.withColumn(
    "lang",
    check_lang_udf(data_df["text"]),
)
en_df = lang_df.where(lang_df["lang"] == "en")

In [None]:
# Strip stop words from the tweet text (fewer distinct tokens downstream);
# the result is stored in a new "stop_text" column.
rm_stops_df = en_df.withColumn(
    "stop_text",
    remove_stops_udf(en_df["text"]),
)

In [None]:
# Apply the project-specific removal step (a personal stop word list) to the
# stop-word-free text; the result is stored in a new "feat_text" column.
rm_features_df = rm_stops_df.withColumn(
    "feat_text",
    remove_features_udf(rm_stops_df["stop_text"]),
)

In [None]:
# Part-of-speech tag what is left and keep only nouns, verbs and adjectives;
# the result is stored in a new "tagged_text" column.
tagged_df = rm_features_df.withColumn(
    "tagged_text",
    tag_and_remove_udf(rm_features_df["feat_text"]),
)

In [None]:
# Lemmatize the remaining words (reduces dimensionality and boosts measures);
# the result is stored in a new "lemm_text" column.
lemm_df = tagged_df.withColumn(
    "lemm_text",
    lemmatize_udf(tagged_df["tagged_text"]),
)

In [None]:
# Flag rows whose cleaned text is blank, then keep only the non-blank ones.
# The UDF is declared with a string return type, so the flag is compared
# against the string "False" rather than a boolean.
check_blanks_df = lemm_df.withColumn(
    "is_blank",
    check_blanks_udf(lemm_df["lemm_text"]),
)
no_blanks_df = check_blanks_df.where(check_blanks_df["is_blank"] == "False")

In [None]:
# Convert the textual class label into a double-typed "label" column.
num_label_df = no_blanks_df.withColumn(
    "label",
    numeric_label_udf(no_blanks_df["text_label"]),
)

In [None]:
# rename columns: the cleaned (lemmatized) text replaces the raw tweet text.
# withColumnRenamed() takes column *names* (strings) and returns a new
# DataFrame, so the result has to be captured. The raw "text" column is
# dropped first so the frame does not end up with two columns named "text".
# The guard keeps this cell safe to re-run after the rename has been applied.
if "lemm_text" in num_label_df.columns:
    num_label_df = num_label_df.drop("text").withColumnRenamed("lemm_text", "text")

In [None]:
# Deduplicate on (text, label). This matters because a lot of the tweets
# differed only by URLs and RT mentions.
dedup_keys = ['text', 'label']
dedup_df = num_label_df.dropDuplicates(dedup_keys)

In [None]:
# select only the columns we care about
# Columns are picked by name from dedup_df itself. Column objects taken from
# num_label_df are bound to that frame and are not guaranteed to resolve
# against the deduplicated frame.
data_set = dedup_df.select("id", "text", "label")

In [None]:
# split training & validation sets with 60% to training and use a seed value of 1987
# The seed is passed explicitly so the split is the same on every run.
splits = data_set.randomSplit([0.6, 0.4], seed=1987)
training_df = splits[0]
test_df = splits[1]

In [None]:
##################################################################
#
#   Spark ML Section
#   
#   Skip Preprocessing and use cleaned files by running next cell
#
##################################################################

In [4]:
# Load already cleaned data
def reload_checkpoint(data_rdd):
    """Rebuild a DataFrame from an RDD of tab-separated checkpoint lines.

    Each line is split on tab characters. Lines that do not yield exactly
    three fields are discarded, and the third field of the remaining lines
    is converted to float.

    :param data_rdd: RDD of text lines, e.g. the result of sc.textFile().
    :return: DataFrame with columns "id", "text" and "label", created
        through the notebook-level sqlContext.
    """
    def to_typed_row(fields):
        # first two fields stay strings, the third becomes a float
        return (fields[0], fields[1], float(fields[2]))

    typed_rows = (
        data_rdd
        .map(lambda line: line.split("\t"))
        # Filter bad rows out
        .filter(lambda fields: len(fields) == 3)
        .map(to_typed_row)
    )
    return sqlContext.createDataFrame(typed_rows, ["id", "text", "label"])


# Read both precleaned checkpoint files first ...
training_rdd = sc.textFile("data/clean_training.txt")
test_rdd = sc.textFile("data/clean_test.txt")

# ... then turn each of them into a typed DataFrame.
training_df = reload_checkpoint(training_rdd)
test_df = reload_checkpoint(test_rdd)

In [5]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

# Configure an ML pipeline made of three stages: tokenizer, hashingTF, and nb.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),  # consume whatever the tokenizer emits
    outputCol="features",
)
nb = NaiveBayes()
pipeline = Pipeline(stages=[tokenizer, hashingTF, nb])

# Candidate settings for cross-validation: Naive Bayes smoothing of 0.0 or 1.0.
paramGrid = (
    ParamGridBuilder()
    .addGrid(nb.smoothing, [0.0, 1.0])
    .build()
)

# 4-fold cross-validation over the grid, scored with the evaluator's default metric.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=4,
)

# Fit on the training set.
cvModel = cv.fit(training_df)

In [8]:
# Score the held-out test set with the cross-validated model and keep only
# the columns needed to compare the true label with the prediction.
result = cvModel.transform(test_df)
prediction_df = result.select(
    "text",
    "label",
    "prediction",
)

In [9]:
# Predictions for the rows whose true label is 0.0.
datasci_df = prediction_df.where(prediction_df['label'] == 0.0)
datasci_df.show()

+--------------------+-----+----------+
|                text|label|prediction|
+--------------------+-----+----------+
|big boy toy drone...|  0.0|       1.0|
|big data result d...|  0.0|       1.0|
|big data service ...|  0.0|       0.0|
|       big data wild|  0.0|       0.0|
|bigdata algorithm...|  0.0|       0.0|
|bigdata analytics...|  0.0|       0.0|
|cognitive technol...|  0.0|       0.0|
|company fight rec...|  0.0|       1.0|
|data asset inconv...|  0.0|       0.0|
|data lover kirk b...|  0.0|       0.0|
|data science hunc...|  0.0|       0.0|
|datascientists ne...|  0.0|       0.0|
|facebook data sci...|  0.0|       1.0|
|foodindustry plan...|  0.0|       1.0|
|future datascienc...|  0.0|       0.0|
|gonzalezcarmen th...|  0.0|       0.0|
|important editori...|  0.0|       0.0|
|intel doubling co...|  0.0|       0.0|
|iot analytics edg...|  0.0|       0.0|
|irish start ups m...|  0.0|       1.0|
+--------------------+-----+----------+
only showing top 20 rows



In [10]:
# Predictions for the rows whose true label is 1.0.
ao_df = prediction_df.where(prediction_df['label'] == 1.0)
ao_df.show()

+--------------------+-----+----------+
|                text|label|prediction|
+--------------------+-----+----------+
|acolyte warmachin...|  1.0|       1.0|
|alfred producthun...|  1.0|       0.0|
|alternative thank...|  1.0|       1.0|
|america greatest ...|  1.0|       1.0|
|animatic hell lot...|  1.0|       1.0|
|annual spring cle...|  1.0|       1.0|
|anyone think secu...|  1.0|       1.0|
|are looking someo...|  1.0|       1.0|
|avoid fine penalt...|  1.0|       1.0|
|bad news toe brin...|  1.0|       1.0|
|barton armed scho...|  1.0|       1.0|
|basic classificat...|  1.0|       1.0|
|bedfordshire poli...|  1.0|       1.0|
|beginning week ja...|  1.0|       1.0|
|being new company...|  1.0|       1.0|
|best part econ pa...|  1.0|       1.0|
|best photo human ...|  1.0|       1.0|
|better bulldog bu...|  1.0|       1.0|
|better throw same...|  1.0|       1.0|
|big idea disrupt ...|  1.0|       1.0|
+--------------------+-----+----------+
only showing top 20 rows



In [None]:
# TODO Add join back to original text
# TODO fix raw_classification labels
# TODO show accuracy measures