## Import Packages

In [95]:
#spark sql imports
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#create spark
spark = SparkSession.builder.appName('RedditData').config("spark.jars.packages").enableHiveSupport().getOrCreate()

In [3]:
#connect to dataset

df = spark.read \
    .option("quote", "\"")  \
    .option("escape", "\"") \
    .option("ignoreLeadingWhiteSpace",True) \
    .csv("hdfs:///user/yizhou/data/group_project/data_cleaned1.csv",inferSchema=True, header=True )

In [4]:
df_nlp = df.select('subreddit', 'clean_comment', 'ups', 'downs', 'score').dropna().drop_duplicates()

In [19]:
rc_t = df_nlp.sample(False, .1)
rc_t.write.format('json').save('hdfs:///user/yizhou/data/group_project/rc_t')

In [20]:
rc_t = spark.read.format('json').load('hdfs:///user/yizhou/data/group_project/rc_t/*')
rc_t.cache()
print(rc_t.count())

3740


In [6]:
df_nlp.show(10)

+-------------+--------------------+---+-----+-----+
|    subreddit|       clean_comment|ups|downs|score|
+-------------+--------------------+---+-----+-----+
|    AskReddit|definitely not sa...|  2|    0|    2|
|          nba|most pathetic soc...|217|    0|  217|
|         news|  ill take that bet | 46|    0|   46|
|          nfl|domo arigoto mari...|  3|    0|    3|
|          nba|    lol tony parker |  1|    0|    1|
|    AskReddit|knew group people...|  1|    0|    1|
|    AskReddit|              latex |  2|    0|    2|
|todayilearned|issue people beli...|  3|    0|    3|
|    AskReddit|then how come pos...|  1|    0|    1|
|    AskReddit|sitting the floor...|  2|    0|    2|
+-------------+--------------------+---+-----+-----+
only showing top 10 rows



In [96]:
## Exploring TF-IDF features

In [101]:
from collections import Counter
import string
import nltk

def term_freq_mapper(comment):
    body = comment['clean_comment']
    #tokens = nltk.tokenize.word_tokenize(body.lower())
    tokens = [word.strip(string.punctuation) for word in body.lower().split()]
    counter = Counter(tokens)
    return (comment['subreddit'], counter)

term_freq = rc_t.rdd \
    .map(term_freq_mapper) \
    .reduceByKey(lambda a,b: a+b)
term_freq.cache()

PythonRDD[1718] at RDD at PythonRDD.scala:53

In [102]:
sub_term_freq_res_0 = term_freq.take(1)[0]
sub_0 = sub_term_freq_res_0[0]
term_freq_res_0 = sub_term_freq_res_0[1]
print(sub_0)
print(sorted(list(term_freq_res_0.items()), key=lambda t_f:t_f[1], reverse=True)[0:50])

todayilearned
[('the', 151), ('that', 75), ('you', 65), ('and', 56), ('they', 33), ('not', 31), ('was', 30), ('this', 28), ('just', 28), ('for', 25), ('all', 24), ('have', 23), ('but', 23), ('like', 21), ('people', 20), ('there', 18), ('about', 18), ('are', 18), ('what', 16), ('with', 16), ('can', 15), ('because', 15), ('would', 15), ('one', 13), ('has', 13), ('those', 12), ('think', 12), ('some', 12), ('them', 11), ('get', 11), ('more', 11), ('how', 11), ('from', 10), ('had', 10), ('his', 10), ('your', 9), ('than', 9), ('even', 9), ('when', 9), ('why', 9), ('over', 9), ('which', 9), ('she', 9), ('sure', 9), ('any', 8), ('good', 8), ('yeah', 8), ('who', 8), ('don', 8), ('make', 8)]


In [103]:
# document frequency
doc_freq = term_freq \
    .flatMap(lambda sub_counter: list(sub_counter[1])) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda word_docfreq: word_docfreq[1], False)
doc_freq.cache()

PythonRDD[1730] at RDD at PythonRDD.scala:53

In [104]:
print(doc_freq.collect()[0:50])

[('and', 21), ('the', 19), ('you', 18), ('that', 17), ('way', 17), ('are', 17), ('for', 17), ('not', 17), ('but', 17), ('really', 16), ('was', 15), ('they', 15), ('have', 15), ('from', 15), ('good', 15), ('with', 15), ('when', 15), ('there', 14), ('what', 14), ('like', 14), ('all', 14), ('don', 14), ('people', 14), ('this', 14), ('being', 14), ('even', 13), ('over', 13), ('well', 13), ('right', 13), ('very', 13), ('just', 13), ('who', 13), ('then', 13), ('other', 13), ('because', 13), ('said', 13), ('never', 13), ('about', 13), ('more', 13), ('your', 13), ('against', 13), ('best', 13), ('first', 12), ('into', 12), ('which', 12), ('any', 12), ('their', 12), ('those', 12), ('know', 12), ('can', 12)]


In [105]:
#number of document
num_docs = term_freq.count()
print(num_docs)

48


In [106]:
#Inverse Document Frequency
import math

inv_doc_freq = doc_freq \
    .map(lambda t_df: (t_df[0], math.log(num_docs / t_df[1]))) \
    .sortBy(lambda t_idf: t_idf[1], True)
inv_doc_freq.cache()

PythonRDD[1738] at RDD at PythonRDD.scala:53

In [107]:
inv_doc_freq_res = inv_doc_freq.collect();
print(inv_doc_freq_res[0:50])

[('and', 0.8266785731844679), ('the', 0.9267620317414504), ('you', 0.9808292530117262), ('that', 1.037987666851675), ('way', 1.037987666851675), ('are', 1.037987666851675), ('for', 1.037987666851675), ('not', 1.037987666851675), ('but', 1.037987666851675), ('really', 1.0986122886681098), ('was', 1.1631508098056809), ('they', 1.1631508098056809), ('have', 1.1631508098056809), ('from', 1.1631508098056809), ('good', 1.1631508098056809), ('with', 1.1631508098056809), ('when', 1.1631508098056809), ('there', 1.2321436812926323), ('what', 1.2321436812926323), ('like', 1.2321436812926323), ('all', 1.2321436812926323), ('don', 1.2321436812926323), ('people', 1.2321436812926323), ('this', 1.2321436812926323), ('being', 1.2321436812926323), ('even', 1.3062516534463542), ('over', 1.3062516534463542), ('well', 1.3062516534463542), ('right', 1.3062516534463542), ('very', 1.3062516534463542), ('just', 1.3062516534463542), ('who', 1.3062516534463542), ('then', 1.3062516534463542), ('other', 1.30625165

In [108]:
sub_1 = 'nba'
term_freq_res_1 = term_freq.sortByKey().lookup(sub_1)[0]
print(sub_1)
print(sorted(list(term_freq_res_1.items()), key=lambda t_f:t_f[1], reverse=True)[0:50])

nba
[('the', 216), ('and', 80), ('that', 74), ('you', 48), ('this', 46), ('game', 40), ('was', 39), ('for', 39), ('just', 32), ('like', 31), ('but', 30), ('not', 27), ('his', 27), ('have', 25), ('can', 25), ('they', 24), ('him', 23), ('with', 22), ('are', 22), ('when', 21), ('team', 18), ('good', 16), ('think', 15), ('some', 15), ('lebron', 15), ('would', 15), ('what', 14), ('curry', 14), ('has', 14), ('don', 14), ('out', 14), ('year', 13), ('one', 13), ('been', 13), ('back', 13), ('better', 13), ('only', 13), ('see', 12), ('them', 12), ('how', 12), ('more', 12), ('could', 11), ('harden', 11), ('get', 11), ('from', 11), ('play', 11), ('lol', 10), ('got', 10), ('really', 10), ('their', 10)]


In [120]:
sub_2 = 'funny'
term_freq_res_2 = term_freq.sortByKey().lookup(sub_2)[0]
print(sub_2)
print(sorted(list(term_freq_res_2.items()), key=lambda t_f:t_f[1], reverse=True)[0:50])

funny
[('the', 150), ('you', 76), ('and', 69), ('that', 67), ('this', 38), ('but', 33), ('are', 32), ('just', 31), ('for', 29), ('was', 27), ('like', 25), ('have', 25), ('not', 22), ('can', 20), ('all', 20), ('about', 18), ('what', 18), ('don', 17), ('people', 17), ('with', 17), ('think', 16), ('your', 15), ('from', 15), ('them', 15), ('really', 14), ('they', 14), ('would', 13), ('see', 13), ('didn', 12), ('get', 11), ('his', 10), ('same', 10), ('way', 10), ('when', 10), ('had', 10), ('who', 10), ('out', 10), ('funny', 9), ('actually', 9), ('has', 9), ('there', 9), ('know', 8), ('why', 8), ('how', 8), ('stuff', 8), ('here', 8), ('some', 8), ('thing', 8), ('their', 8), ('did', 7)]


In [114]:
inv_doc_freq_map_res = inv_doc_freq.collectAsMap()
inv_doc_freq_map_res

{'and': 0.8266785731844679,
 'the': 0.9267620317414504,
 'you': 0.9808292530117262,
 'that': 1.037987666851675,
 'way': 1.037987666851675,
 'are': 1.037987666851675,
 'for': 1.037987666851675,
 'not': 1.037987666851675,
 'but': 1.037987666851675,
 'really': 1.0986122886681098,
 'was': 1.1631508098056809,
 'they': 1.1631508098056809,
 'have': 1.1631508098056809,
 'from': 1.1631508098056809,
 'good': 1.1631508098056809,
 'with': 1.1631508098056809,
 'when': 1.1631508098056809,
 'there': 1.2321436812926323,
 'what': 1.2321436812926323,
 'like': 1.2321436812926323,
 'all': 1.2321436812926323,
 'don': 1.2321436812926323,
 'people': 1.2321436812926323,
 'this': 1.2321436812926323,
 'being': 1.2321436812926323,
 'even': 1.3062516534463542,
 'over': 1.3062516534463542,
 'well': 1.3062516534463542,
 'right': 1.3062516534463542,
 'very': 1.3062516534463542,
 'just': 1.3062516534463542,
 'who': 1.3062516534463542,
 'then': 1.3062516534463542,
 'other': 1.3062516534463542,
 'because': 1.3062516534

## NLP Pipeline

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, Tokenizer, CountVectorizer, StopWordsRemover, IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [44]:
#tokenization
regexTokenizer = RegexTokenizer().setInputCol("clean_comment").setOutputCol("comment_tokenized")

#remove stop words
#StopWordsRemover.loadDefaultStopWords("english")
remover = StopWordsRemover().setInputCol("comment_tokenized").setOutputCol("filtered")

#TF-IDF
countVector = CountVectorizer(inputCol="filtered", outputCol="features")
#hashingTF = HashingTF().setInputCol("filtered").setOutputCol("features").setNumFeatures(100)

#label indexer
indexer = StringIndexer(inputCol = "score", outputCol = "label").setHandleInvalid("keep")

#build pipeline
pipeline = Pipeline(stages=[regexTokenizer, remover, countVector, indexer])
df_new = pipeline.fit(df_nlp).transform(df_nlp)

In [52]:
df_sub = pipeline.fit(rc_t).transform(rc_t)
df_sub = df_sub.drop('comment_tokenized','filtered')

In [53]:
train_sub, test_sub = df_sub.randomSplit([0.8, 0.2])

In [47]:
train_sub.count(), test_sub.count()

(2965, 775)

In [48]:
df_new = df_new.drop('comment_tokenized','filtered')
df_new.show(10)

+---------------+--------------------+---+-----+-----+--------------------+-----+
|      subreddit|       clean_comment|ups|downs|score|            features|label|
+---------------+--------------------+---+-----+-----+--------------------+-----+
|          funny|kids don think li...|  1|    0|    1|(32868,[0,4,145],...|  0.0|
|         videos|most our politici...|  1|    0|    1|(32868,[0,78,3587...|  0.0|
|      AskReddit|yknow friends pla...|  2|    0|    2|(32868,[11,12,39,...|  1.0|
|      AskReddit|read that many ti...|  6|    0|    6|(32868,[72,121,12...|  7.0|
|           news|those are only tw...|  3|    0|    3|(32868,[73,411,19...|  2.0|
|      AskReddit|silly person seed...|  5|    0|    5|(32868,[64,90,141...|  5.0|
|leagueoflegends|foxdrop great but...|  1|    0|    1|(32868,[0,2,6,10,...|  0.0|
|           pics|that because terr...|  3|    0|    3|(32868,[417,1418]...|  2.0|
|      AskReddit|last time had one...|  3|    0|    3|(32868,[3,6,70,64...|  2.0|
|            nba

In [49]:
df_new.dtypes

[('subreddit', 'string'),
 ('clean_comment', 'string'),
 ('ups', 'string'),
 ('downs', 'string'),
 ('score', 'string'),
 ('features', 'vector'),
 ('label', 'double')]

In [50]:
train_df, test_df = df_new.randomSplit([0.8, 0.2])

In [51]:
train_df.count(), test_df.count()

(30685, 7689)

## Logistic Regression

In [55]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, featuresCol="features", labelCol="label")

# Fit the model
lrModel = lr.fit(train_sub)

# Print the coefficients and intercept for logistic regression
#print("Coefficients: " + str(lrModel.coefficients))
#print("Intercept: " + str(lrModel.intercept))

In [35]:
# trainingSummary = lrModel.summary
# trainingSummary.roc.show()
# print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

In [None]:
# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

In [69]:
lr_pred = lrModel.transform(test_sub)
# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred,target_names=my_tags))

In [57]:
lr_pred.printSchema()

root
 |-- clean_comment: string (nullable = true)
 |-- downs: string (nullable = true)
 |-- score: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- ups: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [70]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

print("accuracy:",evaluator.evaluate(lr_pred, {evaluator.metricName: "accuracy"}))
print("f1:",evaluator.evaluate(lr_pred, {evaluator.metricName: "f1"}))

accuracy: 0.398110661268556
f1: 0.2267232530390425


In [75]:
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_label = evaluator.evaluate(lr_pred)
print("Root Mean Squared Error (RMSE) on test data for 'score' = %g" % (rmse_label))

Root Mean Squared Error (RMSE) on test data for 'score' = 28.3704


## Linear Regression

In [79]:
from pyspark.ml.regression import LinearRegression

lnr = LinearRegression(featuresCol="features", labelCol="label").setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

lnrModel = lnr.fit(train_sub)

In [72]:
lnr_pred = lnrModel.transform(test_sub)

In [77]:
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_label = evaluator.evaluate(lnr_pred)
print("Root Mean Squared Error (RMSE) on test data for 'score' = %g" % (rmse_label))

Root Mean Squared Error (RMSE) on test data for 'score' = 27.87


## Decision Tree

In [81]:
from pyspark.ml.regression import DecisionTreeRegressor

tree = DecisionTreeRegressor().setLabelCol("label").setFeaturesCol("features")

treeModel = tree.fit(train_sub)

In [82]:
tree_pred = treeModel.transform(test_sub)

In [83]:
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_label = evaluator.evaluate(tree_pred)
print("Root Mean Squared Error (RMSE) on test data for 'score' = %g" % (rmse_label))

Root Mean Squared Error (RMSE) on test data for 'score' = 26.0653


## Gradient-boosted Regression Tree

In [84]:
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor().setLabelCol("label").setFeaturesCol("features").setMaxIter(10)

gbtModel = gbt.fit(train_sub)

In [88]:
gbt_pred = gbtModel.transform(test_sub)

In [89]:
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_label = evaluator.evaluate(gbt_pred)
print("Root Mean Squared Error (RMSE) on test data for 'score' = %g" % (rmse_label))

Root Mean Squared Error (RMSE) on test data for 'score' = 26.0882


## Random Forest

In [87]:
from pyspark.ml.regression import RandomForestRegressor

forest = RandomForestRegressor().setLabelCol("label").setFeaturesCol("features")

forestModel = forest.fit(train_sub)

In [90]:
forest_pred = forestModel.transform(test_sub)

In [91]:
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_label = evaluator.evaluate(forest_pred)
print("Root Mean Squared Error (RMSE) on test data for 'score' = %g" % (rmse_label))

Root Mean Squared Error (RMSE) on test data for 'score' = 26.7029
