In [1]:
# Shell checks: print which pip executable is found on PATH and the current working directory.
!which pip
!pwd

/global/software/jupyterhub-spark/anaconda3/bin/pip
/home/gregory.slowski


In [None]:
import os
import atexit
import sys
import time
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark job.

    Calls ``sc.stop()`` first and ``sj.stop()`` second. A failure in one call
    is reported and never prevents the other call from being attempted; no
    ordinary exception is propagated to the caller.

    Args:
        sj: Object exposing ``stop()`` (the Spark job handle).
        sc: Object exposing ``stop()`` (the Spark context).
    """
    def _say(message):
        # stdout can already be closed while the interpreter shuts down;
        # reporting must never block the actual cleanup.
        try:
            print(message)
        except Exception:
            pass

    for label, resource in (('Spark Context', sc), ('Spark Job', sj)):
        _say('Trapped Exit cleaning up ' + label)
        try:
            resource.stop()
        except Exception as err:
            # Catch Exception (not a bare except) so SystemExit and
            # KeyboardInterrupt are not swallowed, and say what went wrong.
            _say('Failed to stop {}: {!r}'.format(label, err))

# Start a Spark cluster as a batch job through sparkhpc, then create the
# SparkContext and SQLContext that the rest of the notebook uses.
findspark.init()

#Parameters for the Spark cluster
nodes=1
tasks_per_node=4
memory_per_task=4096 #4 gig per process, adjust accordingly
# Please estimate walltime carefully to keep unused Spark clusters from sitting
# idle so that others may use the resources.
# NOTE(review): the value is "3:00" but the old comment said "1 hours"; this is
# presumably 3 hours if sparkhpc reads HH:MM - TODO confirm the format.
walltime="3:00" # requested walltime for the cluster job
#os.environ['SBATCH_PARTITION']='cpu2019' #Set the appropriate ARC partition

# Total cores = nodes * tasks_per_node; one executor gets tasks_per_node cores.
sj = sparkjob.sparkjob(
     ncores=nodes*tasks_per_node,
     cores_per_executor=tasks_per_node,
     memory_per_core=memory_per_task,
     walltime=walltime
    )

sj.wait_to_start()
# Fixed 60 s pause between the job starting and connecting to it; presumably
# gives the cluster time to come up - TODO confirm it is still needed.
time.sleep(60)
sc = sj.start_spark()

#Register the exit handler so sc and sj are stopped when the interpreter exits
atexit.register(exitHandler,sj,sc)

#You need this line if you want to use SparkSQL
sqlCtx=SQLContext(sc)


INFO:sparkhpc.sparkjob:Submitted batch job 12453

INFO:sparkhpc.sparkjob:Submitted cluster 0


In [None]:
# Conventional aliases for the single SQLContext created during cluster setup.
# Note that `spark` is therefore a SQLContext here, not a SparkSession.
sqlContext = spark = sqlCtx

In [None]:
# Install NLTK, fetch the 'punkt' tokenizer data used by the tokenizing UDFs,
# and import the Spark SQL helpers used to declare the UDFs below.
# NOTE(review): the install is unpinned and runs through the shell (`!pip`);
# consider `%pip install nltk==<version>` so it targets the kernel environment.
#!pip install --upgrade pip
!pip install nltk
import nltk
nltk.download('punkt')
import pyspark.sql.functions as F
# Wildcard import supplies ArrayType, StringType, FloatType and IntegerType used below.
from pyspark.sql.types import *
from pyspark.sql.functions import udf

In [None]:
def tokenize1(text):
    """Split a piece of text into word tokens with ``nltk.word_tokenize``.

    Args:
        text: The text to tokenize, or ``None`` (a null cell in Spark).

    Returns:
        list: The word tokens; an empty list when ``text`` is ``None``, so a
        null row does not make the whole Spark job fail.
    """
    if text is None:
        return []
    return nltk.word_tokenize(text)
# Spark UDF wrapper around tokenize1; yields an array<string> column.
tokenize_word = udf(lambda text: tokenize1(text), returnType=ArrayType(StringType()))

In [None]:
def tokenize2(text):
    """Split a piece of text into sentences with ``nltk.sent_tokenize``.

    Args:
        text: The text to split, or ``None`` (a null cell in Spark).

    Returns:
        list: The sentences; an empty list when ``text`` is ``None``, so a
        null row does not make the whole Spark job fail.
    """
    if text is None:
        return []
    return nltk.sent_tokenize(text)
# Spark UDF wrapper around tokenize2; yields an array<string> column.
tokenize_sent = udf(lambda text: tokenize2(text), returnType=ArrayType(StringType()))

In [None]:
# Fetch the NLTK stop-word corpus and keep the English list in `stop_en`,
# which remove_stopwords1 reads as a global.
from nltk.corpus import stopwords
nltk.download('stopwords')
# 'punkt' is also downloaded in the NLTK setup cell; repeated here.
nltk.download('punkt')
stop_en = stopwords.words('english')
def remove_stopwords1(word_list):
    """Return the tokens of ``word_list`` that do not appear in ``stop_en``.

    Matching is an exact string comparison against the global ``stop_en``
    list, so it is case-sensitive. Token order is preserved.
    """
    return list(filter(lambda token: token not in stop_en, word_list))
# Spark UDF wrapper around remove_stopwords1; yields an array<string> column.
remove_stopwords = udf(lambda tokens: remove_stopwords1(tokens), returnType=ArrayType(StringType()))

In [None]:
def remove_noise1(word_list):
    """Keep only tokens that are purely alphanumeric and longer than two characters.

    Token order is preserved; punctuation and very short tokens are dropped.
    """
    kept = []
    for token in word_list:
        if token.isalnum() and len(token) > 2:
            kept.append(token)
    return kept
# Spark UDF wrapper around remove_noise1; yields an array<string> column.
remove_noise = udf(lambda tokens: remove_noise1(tokens), returnType=ArrayType(StringType()))

In [None]:
from nltk.stem import SnowballStemmer
def stem1(word_list):
    """Stem every token with NLTK's English Snowball stemmer.

    Args:
        word_list: List of tokens, possibly empty, or ``None`` (null in Spark).

    Returns:
        list: The stemmed tokens in the original order; an empty list for
        empty or ``None`` input.
    """
    # Nothing to stem: skip building a stemmer and tolerate null rows.
    if not word_list:
        return []
    stemmer = SnowballStemmer(language='english')
    return [stemmer.stem(token) for token in word_list]
# Spark UDF wrapper around stem1; yields an array<string> column.
stem = udf(lambda tokens: stem1(tokens), returnType=ArrayType(StringType()))

In [None]:
# Import the sentiment analyzer used by sentiment1 and download the
# 'vader_lexicon' NLTK resource.
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
# Holds the analyzer once it has been built, so it is reused across calls
# instead of being constructed again for every row.
_SENTIMENT_CACHE = {}

def sentiment1(text):
  """Return the ``'compound'`` polarity score of ``text``.

  Args:
    text: The review text, or ``None`` (a null cell in Spark).

  Returns:
    float: ``polarity_scores(text)['compound']`` from a
    ``SentimentIntensityAnalyzer``; ``0.0`` when ``text`` is ``None`` so a
    null row does not make the whole Spark job fail.
  """
  if text is None:
    return 0.0
  analyzer = _SENTIMENT_CACHE.get('analyzer')
  if analyzer is None:
    analyzer = _SENTIMENT_CACHE['analyzer'] = SentimentIntensityAnalyzer()
  return analyzer.polarity_scores(text)['compound']
# Spark UDF wrapper around sentiment1; yields a float column.
sentiment = udf(lambda text: sentiment1(text), returnType=FloatType())

In [None]:
#area for testing functions:

# Scratch check of the punctuation-ratio logic on a small hand-made token list.
mylist = ['GeeksforGeeks', ",", 'is', 'a', 'portal', 'for', 'geeks']

# Partition the tokens in one pass: purely alphanumeric vs. everything else.
alphanum_words, punctuation_words = [], []
for token in mylist:
    (alphanum_words if token.isalnum() else punctuation_words).append(token)

N = len(alphanum_words)
num_punctuation_words = len(punctuation_words)
ratio_punctuation_to_total = num_punctuation_words / N
ratio_punctuation_to_total

In [None]:
#greg
#create udf for adding length of reviews by word count for all alphanumeric "words"

def get_length1(word_list):
  """Count the tokens in ``word_list`` that are purely alphanumeric."""
  return sum(1 for token in word_list if token.isalnum())

# Spark UDF wrapper around get_length1; yields an integer column.
get_length = udf(lambda tokens: get_length1(tokens), returnType=IntegerType())

In [None]:
#greg
#create udf for adding average word length by review

def get_average_word_length1(word_list):
  """Mean character length of the alphanumeric tokens in ``word_list``.

  Args:
    word_list: List of tokens, possibly empty, or ``None`` (null in Spark).

  Returns:
    float: The mean length, or ``0.0`` when there is no alphanumeric token
    (previously this raised ZeroDivisionError).
  """
  lengths = [len(token) for token in (word_list or []) if token.isalnum()]
  if not lengths:
    return 0.0
  return sum(lengths) / len(lengths)

# Spark UDF wrapper around get_average_word_length1; yields a float column.
get_average_word_length = udf(lambda tokens: get_average_word_length1(tokens), returnType=FloatType())

In [None]:
#greg
#create udf for ratio of capitalized characters to total characters (n), with no white spaces

def capital_ratio1(word_list):
  """Share of upper-case characters among all characters of the tokens.

  The tokens are concatenated without separators, so whitespace is not counted.

  Args:
    word_list: List of tokens, possibly empty, or ``None`` (null in Spark).

  Returns:
    float: upper-case characters / total characters, or ``0.0`` when there
    are no characters at all (previously this raised ZeroDivisionError).
  """
  joined = "".join(word_list or [])
  if not joined:
    return 0.0
  uppercase_count = sum(1 for char in joined if char.isupper())
  return uppercase_count / len(joined)

# Spark UDF wrapper around capital_ratio1; yields a float column.
capital_ratio = udf(lambda tokens: capital_ratio1(tokens), returnType=FloatType())


In [None]:
#greg
#create udf for ratio of long words (>5 letters) to total words (N)

def long_word_ratio1(word_list):
  """Share of long words (more than 5 characters) among the alphanumeric tokens.

  Args:
    word_list: List of tokens, possibly empty, or ``None`` (null in Spark).

  Returns:
    float: long words / alphanumeric words, or ``0.0`` when there is no
    alphanumeric token (previously this raised ZeroDivisionError).
  """
  alphanum_words = [token for token in (word_list or []) if token.isalnum()]
  if not alphanum_words:
    return 0.0
  long_words_count = sum(1 for token in alphanum_words if len(token) > 5)
  return long_words_count / len(alphanum_words)

# Spark UDF wrapper around long_word_ratio1; yields a float column.
long_word_ratio = udf(lambda tokens: long_word_ratio1(tokens), returnType=FloatType())

In [None]:
#greg
#create udf for ratio of words after stop word removal vs total words

# English stop-word list read by filtered_word_ratio1 as a global. The same
# assignment already appears in the cell that defines remove_stopwords1.
stop_en = stopwords.words('english')

def filtered_word_ratio1(word_list):
  """Share of alphanumeric tokens that are not stop words.

  Stop words are matched by exact string comparison against the global
  ``stop_en`` list, so the check is case-sensitive.

  Args:
    word_list: List of tokens, possibly empty, or ``None`` (null in Spark).

  Returns:
    float: non-stop-word tokens / alphanumeric tokens, or ``0.0`` when there
    is no alphanumeric token (previously this raised ZeroDivisionError).
  """
  alphanum_words = [token for token in (word_list or []) if token.isalnum()]
  if not alphanum_words:
    return 0.0
  kept_count = sum(1 for token in alphanum_words if token not in stop_en)
  return kept_count / len(alphanum_words)

# Spark UDF wrapper around filtered_word_ratio1; yields a float column.
filtered_word_ratio = udf(lambda tokens: filtered_word_ratio1(tokens), returnType=FloatType())


In [None]:
#greg
#create udf for ratio of punctuation "words" vs total alphanumeric words

def punctuation_word_ratio1(word_list):
  """Ratio of non-alphanumeric tokens to alphanumeric tokens.

  Args:
    word_list: List of tokens, possibly empty, or ``None`` (null in Spark).

  Returns:
    float: non-alphanumeric tokens / alphanumeric tokens. When there is no
    alphanumeric token the ratio is undefined and ``0.0`` is returned
    (previously this raised ZeroDivisionError).
  """
  tokens = word_list or []
  alphanum_count = sum(1 for token in tokens if token.isalnum())
  if alphanum_count == 0:
    return 0.0
  punctuation_count = len(tokens) - alphanum_count
  return punctuation_count / alphanum_count

# Spark UDF wrapper around punctuation_word_ratio1; yields a float column.
punctuation_word_ratio = udf(lambda tokens: punctuation_word_ratio1(tokens), returnType=FloatType())

In [None]:
from pyspark.sql.functions import lower
def process_data(df):
  """Add the tokenised, cleaned and hand-crafted feature columns to a reviews frame.

  Keeps ``Index``, ``Review``, ``polarity`` and ``real_fake`` from ``df``, adds
  ``tokenized_words``, and then appends one column per entry of
  ``derived_columns`` in the order listed.

  Args:
    df: DataFrame with at least the columns Index, Review, polarity, real_fake.

  Returns:
    The DataFrame with all derived columns appended.
  """
  # (new column, UDF, input column), applied top to bottom because the
  # cleaning steps feed each other.
  # A combined real_fake/polarity "target" column was tried earlier and is disabled.
  derived_columns = [
      ("sentiment", sentiment, "Review"),
      ("tokenized_sents", tokenize_sent, "Review"),
      ("no_stopwords", remove_stopwords, "tokenized_words"),
      ("no_noise", remove_noise, "no_stopwords"),
      ("stemmed", stem, "no_noise"),
      ("length_in_words", get_length, "tokenized_words"),
      ("average_word_length", get_average_word_length, "tokenized_words"),
      ("capital_char_ratio", capital_ratio, "tokenized_words"),
      ("long_word_ratio", long_word_ratio, "tokenized_words"),
      ("non_stop_word_ratio", filtered_word_ratio, "tokenized_words"),
      ("punctuation_ratio", punctuation_word_ratio, "tokenized_words"),
  ]

  result = df.select(
      "Index", "Review", "polarity", "real_fake",
      tokenize_word("Review").alias("tokenized_words"),
  )
  for column_name, transform, source_column in derived_columns:
    result = result.withColumn(column_name, transform(source_column))
  return result

In [None]:
# Columns kept for modelling; the same list is used for both datasets so the
# two frames can be unioned.
model_columns = [
    "Review", "stemmed", "sentiment", "polarity", "length_in_words",
    "average_word_length", "capital_char_ratio", "long_word_ratio",
    "non_stop_word_ratio", "punctuation_ratio", "real_fake",
]

# Hotel_Reviews_Calgary.csv becomes df_test (never used for fitting).
df_raw_test = spark.read.option("escape", "\"").option("header", True).csv("Hotel_Reviews_Calgary.csv")
df_test_all = process_data(df_raw_test)
df_test = df_test_all.select(*model_columns)

# Original_data.csv becomes df_train (split into train/validation later).
df_raw_train = spark.read.option("escape", "\"").option("header", True).csv("Original_data.csv")
df_train = process_data(df_raw_train).select(*model_columns)

combined = df_train.union(df_test)

In [None]:
# Shape of df_test as (row count, column count); count() triggers a Spark job.
print((df_test.count(), len(df_test.columns)))

In [None]:
# Mark both processed frames for caching so later cells can reuse them.
df_train.cache()
df_test.cache()

In [None]:
# Preview a single processed row of df_train.
df_train.show(1)

In [53]:
from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Seeded 80/20 split of df_train into a fitting set and a held-out set.
train_split, test_split = df_train.randomSplit(weights = [0.80, 0.20], seed = 1)
train_split.cache()

# Scale each hand-crafted numeric feature separately: every column is wrapped
# in a one-element vector (<col>_vec) and passed to its own StandardScaler
# (<col>_scaled).
columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
scaling_pipeline = Pipeline(stages=assemblers + scalers)

# Text features: hashed term frequencies of the stemmed tokens, weighted by IDF.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
# polarity is indexed and used as a model feature; real_fake is indexed into the label.
label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx', 'length_in_words_scaled', 'average_word_length_scaled', 'capital_char_ratio_scaled', 'long_word_ratio_scaled', 'non_stop_word_ratio_scaled', 'punctuation_ratio_scaled'],outputCol="features")
label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
lr = LogisticRegression(regParam = 0.3)
# NOTE(review): inputCol is "label", so article_class is the string form of the
# true label, not of the model's prediction - confirm this is intended.
label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

pipeline = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1, assembles, label_strIdx2, lr ,label_idxStr])


In [54]:
# Fit the logistic-regression pipeline on the 80% training split.
model = pipeline.fit(train_split)

In [55]:
# Score both evaluation sets with the fitted pipeline:
#   pred          -> held-out 20% split of df_train (read from Original_data.csv)
#   pred_original -> df_test (read from Hotel_Reviews_Calgary.csv), despite its name
pred = model.transform(test_split)
pred_original = model.transform(df_test)
pred.cache()
pred_original.cache()

DataFrame[Review: string, stemmed: array<string>, sentiment: float, polarity: string, length_in_words: int, average_word_length: float, capital_char_ratio: float, long_word_ratio: float, non_stop_word_ratio: float, punctuation_ratio: float, real_fake: string, length_in_words_vec: vector, average_word_length_vec: vector, capital_char_ratio_vec: vector, long_word_ratio_vec: vector, non_stop_word_ratio_vec: vector, punctuation_ratio_vec: vector, length_in_words_scaled: vector, average_word_length_scaled: vector, capital_char_ratio_scaled: vector, long_word_ratio_scaled: vector, non_stop_word_ratio_scaled: vector, punctuation_ratio_scaled: vector, rawFeatures: vector, TF_IDF: vector, polarity_idx: double, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double, article_class: string]

In [56]:
# Compare true label and prediction side by side. "review" is resolved to the
# "Review" column here (the saved output shows the lower-case header).
pred.select("review","polarity", "label", "prediction","article_class").show()

+--------------------+--------+-----+----------+-------------+
|              review|polarity|label|prediction|article_class|
+--------------------+--------+-----+----------+-------------+
| My wife and I's ...|positive|  0.0|       0.0|         fake|
|A bunch of us got...|positive|  1.0|       0.0|         real|
|A lovely hotel in...|negative|  1.0|       1.0|         real|
|A recent stay at ...|negative|  0.0|       0.0|         fake|
|Affinia hotel in ...|negative|  0.0|       0.0|         fake|
|After considering...|negative|  0.0|       0.0|         fake|
|After reading so ...|positive|  1.0|       1.0|         real|
|After reading the...|negative|  1.0|       1.0|         real|
|After reading the...|positive|  1.0|       1.0|         real|
|After some delibe...|positive|  1.0|       1.0|         real|
|All I can say is ...|negative|  1.0|       0.0|         real|
|Amalifa Hotel in ...|positive|  0.0|       0.0|         fake|
|Awesome hotel, ou...|positive|  1.0|       1.0|       

In [57]:
# Show the first rows of pred with every column (very wide output).
pred.show()

+--------------------+--------------------+---------+--------+---------------+-------------------+------------------+---------------+-------------------+-----------------+---------+-------------------+-----------------------+----------------------+--------------------+-----------------------+---------------------+----------------------+--------------------------+-------------------------+----------------------+--------------------------+------------------------+--------------------+--------------------+------------+--------------------+-----+--------------------+--------------------+----------+-------------+
|              Review|             stemmed|sentiment|polarity|length_in_words|average_word_length|capital_char_ratio|long_word_ratio|non_stop_word_ratio|punctuation_ratio|real_fake|length_in_words_vec|average_word_length_vec|capital_char_ratio_vec| long_word_ratio_vec|non_stop_word_ratio_vec|punctuation_ratio_vec|length_in_words_scaled|average_word_length_scaled|capital_char_ratio_

In [58]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Accuracy of the logistic-regression pipeline on both evaluation sets.
# NOTE: `eval` shadows the Python builtin; the name is kept because later
# cells call eval.evaluate(...).
eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")

# `pred` scores the held-out split of df_train (Original_data.csv) and
# `pred_original` scores df_test (Hotel_Reviews_Calgary.csv). The printed
# labels name the actual source, because the old "our data"/"original data"
# labels did not match where the rows came from. Variable names are unchanged.
acc_our_data = eval.evaluate(pred)
print("Original_data.csv held-out split: ", acc_our_data)
acc_original_data = eval.evaluate(pred_original)
print("Hotel_Reviews_Calgary.csv: ", acc_original_data)


our data:  0.8621700879765396
original data:  0.609


In [59]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Grid search for the logistic-regression pipeline: 3 numFeatures x 3 regParam
# values = 9 candidates, each scored with 4-fold cross-validation.
# NOTE(review): the evaluator is built with defaults (the saved results table
# reports its metric as f1) while the final report uses accuracy; confirm the
# tuning metric is the one intended.
# NOTE(review): parallelism = 100 exceeds the 9 candidates and the 4 cores
# requested for the cluster; a much smaller value is presumably enough.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10000,20000,50000]) \
    .addGrid(lr.regParam, [0.1, 0.3 ,0.5]) \
    .build()
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=4,parallelism = 100 )  # use 3+ folds in practice

### TODO: on refinement, use 5 folds? Maybe only for the best preliminary model? ###

In [60]:
# Run the grid search on the training split (fits every candidate on every fold).
cvModel = crossval.fit(train_split)

In [61]:
# Score the held-out split with the cross-validated model.
p = cvModel.transform(test_split)

In [62]:
# Accuracy of the cross-validated model on the held-out split; relies on the
# `eval` evaluator created in the accuracy-report cell.
acc = eval.evaluate(p)

In [63]:
# Display the held-out accuracy of the cross-validated model.
acc

0.8709677419354839

In [64]:
import pandas as pd

# One dict per grid point, keyed by the bare parameter name.
params = [
    {param.name: value for param, value in param_map.items()}
    for param_map in cvModel.getEstimatorParamMaps()
]

# Table of the average cross-validation metric next to each grid point.
metric_name = cvModel.getEvaluator().getMetricName()
pd.DataFrame.from_dict(
    [dict({metric_name: score}, **grid_point) for grid_point, score in zip(params, cvModel.avgMetrics)]
)

Unnamed: 0,f1,numFeatures,regParam
0,0.835169,10000,0.1
1,0.83913,10000,0.3
2,0.841498,10000,0.5
3,0.834417,20000,0.1
4,0.840727,20000,0.3
5,0.847895,20000,0.5
6,0.840545,50000,0.1
7,0.845282,50000,0.3
8,0.846079,50000,0.5


In [70]:
from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Same seeded 80/20 split of df_train as in the logistic-regression cell.
train_split, test_split = df_train.randomSplit(weights = [0.80, 0.20], seed = 1)
train_split.cache()

# Per-column scaling: each numeric feature is wrapped in a one-element vector
# (<col>_vec) and passed to its own StandardScaler (<col>_scaled).
columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
scaling_pipeline = Pipeline(stages=assemblers + scalers)

# numFeatures is left at its default here; the grid search sets it.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
# NOTE(review): only TF_IDF, sentiment and polarity_idx are assembled, so the
# *_scaled columns computed by scaling_pipeline never reach `features`, unlike
# in the logistic-regression pipeline - confirm this is intended.
assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx'],outputCol="features")
label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
svc = LinearSVC(featuresCol="features",labelCol="label")
# NOTE(review): inputCol is "label", so article_class is the string form of the
# true label, not of the prediction - confirm this is intended.
label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

pipeline_svc = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1 , assembles, label_strIdx2, svc ,label_idxStr])

In [71]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Grid search for the LinearSVC pipeline: 3 numFeatures x 5 regParam values =
# 15 candidates, scored with 2-fold cross-validation using the evaluator's
# default metric.
# NOTE(review): numFolds=2 contradicts the "3+ folds" remark on the line below,
# and parallelism = 100 exceeds the number of candidates.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10000,20000,50000]) \
    .addGrid(svc.regParam, [0.0001, 0.001, 0.01, 0.1, 1]) \
    .build()
crossval_svc = CrossValidator(estimator=pipeline_svc, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=2,parallelism = 100 )  # use 3+ folds in practice

In [72]:
# Run the LinearSVC grid search on the training split.
cvModel_svc = crossval_svc.fit(train_split)

In [73]:
# Score both evaluation sets with the cross-validated LinearSVC model:
#   pred_svc          -> held-out 20% split of df_train (Original_data.csv)
#   pred_original_svc -> df_test (Hotel_Reviews_Calgary.csv), despite its name
pred_svc = cvModel_svc.transform(test_split)
pred_original_svc = cvModel_svc.transform(df_test)
pred_svc.cache()
pred_original_svc.cache()

DataFrame[Review: string, stemmed: array<string>, sentiment: float, polarity: string, length_in_words: int, average_word_length: float, capital_char_ratio: float, long_word_ratio: float, non_stop_word_ratio: float, punctuation_ratio: float, real_fake: string, length_in_words_vec: vector, average_word_length_vec: vector, capital_char_ratio_vec: vector, long_word_ratio_vec: vector, non_stop_word_ratio_vec: vector, punctuation_ratio_vec: vector, length_in_words_scaled: vector, average_word_length_scaled: vector, capital_char_ratio_scaled: vector, long_word_ratio_scaled: vector, non_stop_word_ratio_scaled: vector, punctuation_ratio_scaled: vector, rawFeatures: vector, TF_IDF: vector, polarity_idx: double, features: vector, label: double, rawPrediction: vector, prediction: double, article_class: string]

In [74]:
# Accuracy of the cross-validated LinearSVC model on both evaluation sets.
# NOTE: `eval` shadows the Python builtin; the name is kept for the other cells.
eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")

# `pred_svc` scores the held-out split of df_train (Original_data.csv) and
# `pred_original_svc` scores df_test (Hotel_Reviews_Calgary.csv). The printed
# labels name the actual source, because the old "our data"/"original data"
# labels did not match where the rows came from. Variable names are unchanged.
acc_our_data_svc = eval.evaluate(pred_svc)
print("Original_data.csv held-out split: ", acc_our_data_svc)
acc_original_data_svc = eval.evaluate(pred_original_svc)
print("Hotel_Reviews_Calgary.csv: ", acc_original_data_svc)

our data:  0.8621700879765396
original data:  0.625


In [75]:
import pandas as pd

# One dict per grid point, keyed by the bare parameter name.
params = [
    {param.name: value for param, value in param_map.items()}
    for param_map in cvModel_svc.getEstimatorParamMaps()
]

# Table of the average cross-validation metric next to each grid point.
metric_name = cvModel_svc.getEvaluator().getMetricName()
pd.DataFrame.from_dict(
    [dict({metric_name: score}, **grid_point) for grid_point, score in zip(params, cvModel_svc.avgMetrics)]
)

Unnamed: 0,f1,numFeatures,regParam
0,0.835169,10000,0.0001
1,0.83913,10000,0.001
2,0.841498,10000,0.01
3,0.834417,10000,0.1
4,0.840727,10000,1.0
5,0.847895,20000,0.0001
6,0.840545,20000,0.001
7,0.845282,20000,0.01
8,0.846079,20000,0.1


In [76]:
# List the tunable parameters of the LinearSVC estimator.
svc.params

[Param(parent='LinearSVC_19c554e505fc', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'),
 Param(parent='LinearSVC_19c554e505fc', name='featuresCol', doc='features column name.'),
 Param(parent='LinearSVC_19c554e505fc', name='fitIntercept', doc='whether to fit an intercept term.'),
 Param(parent='LinearSVC_19c554e505fc', name='labelCol', doc='label column name.'),
 Param(parent='LinearSVC_19c554e505fc', name='maxIter', doc='max number of iterations (>= 0).'),
 Param(parent='LinearSVC_19c554e505fc', name='predictionCol', doc='prediction column name.'),
 Param(parent='LinearSVC_19c554e505fc', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'),
 Param(parent='LinearSVC_19c554e505fc', name='regParam', doc='regularization parameter (>= 0).'),
 Param(parent='LinearSVC_19c554e505fc', name='standardization', doc='whether to standardize the training features before fitting the model.'),
 Param(parent='LinearSVC_19c554e505fc', name='threshold

In [26]:
#greg

from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Same seeded 80/20 split of df_train as in the other model cells.
train_split, test_split = df_train.randomSplit(weights = [0.80, 0.20], seed = 1)
train_split.cache()

# Per-column scaling: each numeric feature is wrapped in a one-element vector
# (<col>_vec) and passed to its own StandardScaler (<col>_scaled).
columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
scaling_pipeline = Pipeline(stages=assemblers + scalers)

# numFeatures is left at its default here; the grid search sets it.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
# NOTE(review): only TF_IDF, sentiment and polarity_idx are assembled, so the
# *_scaled columns computed by scaling_pipeline never reach `features`, unlike
# in the logistic-regression pipeline - confirm this is intended.
assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx'],outputCol="features")
label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
rfc = RandomForestClassifier(featuresCol="features",labelCol="label")
# NOTE(review): inputCol is "label", so article_class is the string form of the
# true label, not of the prediction - confirm this is intended.
label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

pipeline_rfc = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1 , assembles, label_strIdx2, rfc ,label_idxStr])

In [27]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Grid search for the random-forest pipeline: 3 numFeatures x 4 maxDepth x
# 4 numTrees = 48 candidates, scored with 2-fold cross-validation using the
# evaluator's default metric.
# NOTE(review): numFolds=2 contradicts the "3+ folds" remark on the line below,
# and parallelism = 100 exceeds the number of candidates.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10000,20000,50000]) \
    .addGrid(rfc.maxDepth, [3, 5, 9, 15]) \
    .addGrid(rfc.numTrees, [20, 50, 100, 200]) \
    .build()
crossval_rfc = CrossValidator(estimator=pipeline_rfc, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=2,parallelism = 100 )  # use 3+ folds in practice

In [28]:
# Run the random-forest grid search on the training split.
cvModel_rfc = crossval_rfc.fit(train_split)

In [29]:
# Score both evaluation sets with the cross-validated random-forest model:
#   pred_rfc          -> held-out 20% split of df_train (Original_data.csv)
#   pred_original_rfc -> df_test (Hotel_Reviews_Calgary.csv), despite its name
pred_rfc = cvModel_rfc.transform(test_split)
pred_original_rfc = cvModel_rfc.transform(df_test)
pred_rfc.cache()
pred_original_rfc.cache()

DataFrame[Review: string, stemmed: array<string>, sentiment: float, polarity: string, length_in_words: int, average_word_length: float, capital_char_ratio: float, long_word_ratio: float, non_stop_word_ratio: float, punctuation_ratio: float, real_fake: string, length_in_words_vec: vector, average_word_length_vec: vector, capital_char_ratio_vec: vector, long_word_ratio_vec: vector, non_stop_word_ratio_vec: vector, punctuation_ratio_vec: vector, length_in_words_scaled: vector, average_word_length_scaled: vector, capital_char_ratio_scaled: vector, long_word_ratio_scaled: vector, non_stop_word_ratio_scaled: vector, punctuation_ratio_scaled: vector, rawFeatures: vector, TF_IDF: vector, polarity_idx: double, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double, article_class: string]

In [37]:
# Accuracy of the cross-validated random-forest model on both evaluation sets.
# NOTE: `eval` shadows the Python builtin; the name is kept for the other cells.
eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")

# `pred_rfc` scores the held-out split of df_train (Original_data.csv) and
# `pred_original_rfc` scores df_test (Hotel_Reviews_Calgary.csv). The printed
# labels name the actual source, because the old "our data"/"original data"
# labels did not match where the rows came from. Variable names are unchanged.
acc_our_data_rfc = eval.evaluate(pred_rfc)
print("Original_data.csv held-out split: ", acc_our_data_rfc)
acc_original_data_rfc = eval.evaluate(pred_original_rfc)
print("Hotel_Reviews_Calgary.csv: ", acc_original_data_rfc)

our data:  0.8123167155425219
original data:  0.588


In [40]:
import pandas as pd

# One dict per grid point, keyed by the bare parameter name.
params = [
    {param.name: value for param, value in param_map.items()}
    for param_map in cvModel_rfc.getEstimatorParamMaps()
]

# Table of the average cross-validation metric next to each grid point.
metric_name = cvModel_rfc.getEvaluator().getMetricName()
pd.DataFrame.from_dict(
    [dict({metric_name: score}, **grid_point) for grid_point, score in zip(params, cvModel_rfc.avgMetrics)]
)

Unnamed: 0,f1,maxDepth,numFeatures,numTrees
0,0.579049,3,10000,5
1,0.670132,3,10000,10
2,0.702024,3,10000,20
3,0.769181,3,10000,50
4,0.610245,5,10000,5
5,0.682026,5,10000,10
6,0.73405,5,10000,20
7,0.79375,5,10000,50
8,0.648336,9,10000,5
9,0.678103,9,10000,10


In [44]:
#greg

from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Same seeded 80/20 split of df_train as in the other model cells.
train_split, test_split = df_train.randomSplit(weights = [0.80, 0.20], seed = 1)
train_split.cache()

# Per-column scaling: each numeric feature is wrapped in a one-element vector
# (<col>_vec) and passed to its own StandardScaler (<col>_scaled).
columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
scaling_pipeline = Pipeline(stages=assemblers + scalers)

# numFeatures is left at its default here; the grid search sets it.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
# NOTE(review): only TF_IDF, sentiment and polarity_idx are assembled, so the
# *_scaled columns computed by scaling_pipeline never reach `features`, unlike
# in the logistic-regression pipeline - confirm this is intended.
assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx'],outputCol="features")
label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
gbtc = GBTClassifier(featuresCol="features",labelCol="label")
# NOTE(review): inputCol is "label", so article_class is the string form of the
# true label, not of the prediction - confirm this is intended.
label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

pipeline_gbtc = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1 , assembles, label_strIdx2, gbtc ,label_idxStr])

In [45]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Grid search for the gradient-boosted-trees pipeline: 3 numFeatures x
# 4 maxDepth x 4 stepSize = 48 candidates, scored with 2-fold cross-validation
# using the evaluator's default metric.
# NOTE(review): numFolds=2 contradicts the "3+ folds" remark on the line below,
# and parallelism = 100 exceeds the number of candidates.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10000,20000,50000]) \
    .addGrid(gbtc.maxDepth, [3, 5, 9, 15]) \
    .addGrid(gbtc.stepSize, [0.01, 0.1, 0.5, 1]) \
    .build()
crossval_gbtc = CrossValidator(estimator=pipeline_gbtc, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=2,parallelism = 100 )  # use 3+ folds in practice

In [None]:
# Run the gradient-boosted-trees grid search on the training split.
cvModel_gbtc = crossval_gbtc.fit(train_split)

In [None]:
# Score both evaluation sets with the cross-validated GBT model:
#   pred_gbtc          -> held-out 20% split of df_train (Original_data.csv)
#   pred_original_gbtc -> df_test (Hotel_Reviews_Calgary.csv), despite its name
pred_gbtc = cvModel_gbtc.transform(test_split)
pred_original_gbtc = cvModel_gbtc.transform(df_test)
pred_gbtc.cache()
pred_original_gbtc.cache()

In [None]:
# Accuracy of the cross-validated GBT model on both evaluation sets.
# NOTE: `eval` shadows the Python builtin; the name is kept for the other cells.
eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")

# `pred_gbtc` scores the held-out split of df_train (Original_data.csv) and
# `pred_original_gbtc` scores df_test (Hotel_Reviews_Calgary.csv). The printed
# labels name the actual source, because the old "our data"/"original data"
# labels did not match where the rows came from. Variable names are unchanged.
acc_our_data_gbtc = eval.evaluate(pred_gbtc)
print("Original_data.csv held-out split: ", acc_our_data_gbtc)
acc_original_data_gbtc = eval.evaluate(pred_original_gbtc)
print("Hotel_Reviews_Calgary.csv: ", acc_original_data_gbtc)

In [None]:
import pandas as pd

# One dict per grid point, keyed by the bare parameter name.
params = [
    {param.name: value for param, value in param_map.items()}
    for param_map in cvModel_gbtc.getEstimatorParamMaps()
]

# Table of the average cross-validation metric next to each grid point.
metric_name = cvModel_gbtc.getEvaluator().getMetricName()
pd.DataFrame.from_dict(
    [dict({metric_name: score}, **grid_point) for grid_point, score in zip(params, cvModel_gbtc.avgMetrics)]
)