In [0]:
#!pip install --upgrade pip
!pip install nltk
import nltk
nltk.download('punkt')
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import udf

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
def tokenize1(text):
    words = nltk.word_tokenize(text)
    return words  
tokenize_word = udf(lambda x: tokenize1(x)  , ArrayType(StringType()))

In [0]:
def tokenize2(text):
    sents = nltk.sent_tokenize(text)
    return sents  
tokenize_sent = udf(lambda x: tokenize2(x)  , ArrayType(StringType()))

In [0]:
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_en = stopwords.words('english')
def remove_stopwords1(word_list):
    filtered_words = [word for word in word_list if word not in stop_en]
    return filtered_words
remove_stopwords = udf(lambda x: remove_stopwords1(x) , ArrayType(StringType()))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
def remove_noise1(word_list):
    filtered_words = [word for word in word_list if word.isalnum() and len(word)>2]
    return filtered_words
remove_noise = udf(lambda x: remove_noise1(x) , ArrayType(StringType()))

In [0]:
from nltk.stem import SnowballStemmer
def stem1(word_list):
    snowball = SnowballStemmer(language='english')
    stemmed_words = [snowball.stem(word) for word in word_list]
    return stemmed_words
stem = udf(lambda x: stem1(x) , ArrayType(StringType()))

In [0]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
def sentiment1(text):
  sia = SentimentIntensityAnalyzer()
  return sia.polarity_scores(text)['compound']
sentiment = udf(lambda x: sentiment1(x) , FloatType())

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [0]:
#greg
#create udf for adding length of reviews by word count for all alphanumeric "words"

def get_length1(word_list):
  alphanum_words = [word for word in word_list if word.isalnum()]
  N = len(alphanum_words)
  return N

get_length = udf(lambda x: get_length1(x), IntegerType())

In [0]:
#greg
#create udf for adding average word length by review

def get_average_word_length1(word_list):
  alphanum_word_lengths = [len(word) for word in word_list if word.isalnum()]
  avg_word_len = sum(alphanum_word_lengths)/len(alphanum_word_lengths)
  return avg_word_len

get_average_word_length = udf(lambda x: get_average_word_length1(x), FloatType())

In [0]:
#greg
#create udf for ratio of capitalized characters to total characters (n), with no white spaces

def capital_ratio1(word_list):
  s = "".join(word_list)
  n = len(s)
  cap_n = len([char for char in s if char.isupper()])
  ratio_cap_to_total = cap_n / n
  return ratio_cap_to_total

capital_ratio = udf(lambda x: capital_ratio1(x), FloatType())

In [0]:
#greg
#create udf for ratio of long words (>5 letters) to total words (N)

def long_word_ratio1(word_list):
  alphanum_words = [word for word in word_list if word.isalnum()]
  N = len(alphanum_words)
  long_words = [word for word in alphanum_words if len(word) > 5]
  long_words_count = len(long_words)
  ratio_long_to_total = long_words_count / N
  return ratio_long_to_total

long_word_ratio = udf(lambda x: long_word_ratio1(x), FloatType())

In [0]:
#greg
#create udf for ratio of words after stop word removal vs total words

stop_en = stopwords.words('english')

def filtered_word_ratio1(word_list):
  alphanum_words = [word for word in word_list if word.isalnum()]
  N = len(alphanum_words)
  non_filtered_words = [word for word in alphanum_words if word not in stop_en]
  num_non_filtered_words = len(non_filtered_words)
  ratio_non_filtered_to_total = num_non_filtered_words / N
  return ratio_non_filtered_to_total

filtered_word_ratio = udf(lambda x: filtered_word_ratio1(x), FloatType())

In [0]:
#greg
#create udf for ratio of punctuation "words" vs total alphanumeric words

def punctuation_word_ratio1(word_list):
  alphanum_words = [word for word in word_list if word.isalnum()]
  N = len(alphanum_words)
  punctuation_words = [word for word in word_list if not(word.isalnum())]
  num_punctuation_words = len(punctuation_words)
  ratio_punctuation_to_total = num_punctuation_words / N
  return ratio_punctuation_to_total

punctuation_word_ratio = udf(lambda x: punctuation_word_ratio1(x), FloatType())

In [0]:
from pyspark.sql.functions import lower
def process_data(df):
  dfText = df.select("Index","Review" ,"polarity","real_fake", tokenize_word("Review").alias("tokenized_words"))
  dfText = dfText.withColumn("sentiment" ,sentiment("Review"))
  dfText = dfText.withColumn("tokenized_sents" ,tokenize_sent("Review"))
  dfText = dfText.withColumn("no_stopwords", remove_stopwords("tokenized_words"))
  dfText = dfText.withColumn("no_noise", remove_noise("no_stopwords"))
  dfText = dfText.withColumn("stemmed", stem("no_noise"))
  #dfText = dfText.select('*', F.concat_ws("_","real_fake","polarity").alias("target"))
  dfText = dfText.withColumn("length_in_words", get_length("tokenized_words"))
  dfText = dfText.withColumn("average_word_length", get_average_word_length("tokenized_words"))
  dfText = dfText.withColumn("capital_char_ratio", capital_ratio("tokenized_words"))
  dfText = dfText.withColumn("long_word_ratio", long_word_ratio("tokenized_words"))
  dfText = dfText.withColumn("non_stop_word_ratio", filtered_word_ratio("tokenized_words"))
  dfText = dfText.withColumn("punctuation_ratio", punctuation_word_ratio("tokenized_words"))

  return dfText

In [0]:
df_raw_old = spark.read.option("escape","\"").option("header",True).csv("/FileStore/tables/Original_data.csv")
df_old_all = process_data(df_raw_old)
#df_test = df_test_all.select("Review","stemmed","sentiment","target","real_fake")
df_old = df_old_all.select("Review","stemmed","sentiment","polarity", "length_in_words", "average_word_length", \
                             "capital_char_ratio", "long_word_ratio", "non_stop_word_ratio", "punctuation_ratio", "real_fake")

old_train_split, old_test_split = df_old.randomSplit(weights = [0.80, 0.20], seed = 1)
old_train_split.cache()
old_test_split.cache()

df_raw_new = spark.read.option("escape","\"").option("header",True).csv("/FileStore/tables/Hotel_Reviews_Calgary.csv")
df_new_all = process_data(df_raw_new)
#df_train = df_train.select("Review","stemmed","sentiment","target","real_fake")
df_new = df_new_all.select("Review","stemmed","sentiment","polarity", "length_in_words", "average_word_length", \
                             "capital_char_ratio", "long_word_ratio", "non_stop_word_ratio", "punctuation_ratio", "real_fake")

new_train_split, new_test_split = df_new.randomSplit(weights = [0.80, 0.20], seed = 1)
new_train_split.cache()
new_test_split.cache()

#df_train.display()
combined_train = old_train_split.union(new_train_split)
combined_test = old_test_split.union(new_test_split)

combined_train.cache()
combined_test.cache()

Out[17]: DataFrame[Review: string, stemmed: array<string>, sentiment: float, polarity: string, length_in_words: int, average_word_length: float, capital_char_ratio: float, long_word_ratio: float, non_stop_word_ratio: float, punctuation_ratio: float, real_fake: string]

In [0]:
print((combined_train.count(), len(combined_train.columns)))

(2119, 11)


In [0]:
combined_train.show(1)

+--------------------+--------------------+---------+--------+---------------+-------------------+------------------+---------------+-------------------+-----------------+---------+
|              Review|             stemmed|sentiment|polarity|length_in_words|average_word_length|capital_char_ratio|long_word_ratio|non_stop_word_ratio|punctuation_ratio|real_fake|
+--------------------+--------------------+---------+--------+---------------+-------------------+------------------+---------------+-------------------+-----------------+---------+
| Barely Average H...|[bare, averag, ho...|   0.1901|negative|            187|           4.390374|         0.0391924|     0.26737967|          0.5828877|      0.112299465|     real|
+--------------------+--------------------+---------+--------+---------------+-------------------+------------------+---------------+-------------------+-----------------+---------+
only showing top 1 row



# Logistic Regression

In [0]:
from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
scaling_pipeline = Pipeline(stages=assemblers + scalers)

hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx', 'length_in_words_scaled', 'average_word_length_scaled', 'capital_char_ratio_scaled', 'long_word_ratio_scaled', 'non_stop_word_ratio_scaled', 'punctuation_ratio_scaled'],outputCol="features")
label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
lr = LogisticRegression(regParam = 0.3)
label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

pipeline = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1, assembles, label_strIdx2, lr ,label_idxStr])

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10000,20000,50000]) \
    .addGrid(lr.regParam, [0.1, 0.3 ,0.5]) \
    .build()
crossval_lr = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=3,parallelism = 100 )  # use 3+ folds in practice

### ON REFINEMENT DO 5 FOLD? MAYBE JUST FOR BEST PRELIMINARY MODEL?###

In [0]:
cvModel_lr = crossval_lr.fit(combined_train)

In [0]:
pred_old_lr = cvModel_lr.transform(old_test_split)
pred_new_lr = cvModel_lr.transform(new_test_split)
pred_combined_lr = cvModel_lr.transform(combined_test)

pred_old_lr.cache()
pred_new_lr.cache()
pred_combined_lr.cache()

Out[39]: DataFrame[Review: string, stemmed: array<string>, sentiment: float, polarity: string, length_in_words: int, average_word_length: float, capital_char_ratio: float, long_word_ratio: float, non_stop_word_ratio: float, punctuation_ratio: float, real_fake: string, length_in_words_vec: vector, average_word_length_vec: vector, capital_char_ratio_vec: vector, long_word_ratio_vec: vector, non_stop_word_ratio_vec: vector, punctuation_ratio_vec: vector, length_in_words_scaled: vector, average_word_length_scaled: vector, capital_char_ratio_scaled: vector, long_word_ratio_scaled: vector, non_stop_word_ratio_scaled: vector, punctuation_ratio_scaled: vector, rawFeatures: vector, TF_IDF: vector, polarity_idx: double, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double, article_class: string]

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
acc_our_data = eval.evaluate(pred_new_lr)
print("our data: ", acc_our_data)
acc_original_data = eval.evaluate(pred_old_lr)
print("original data: ", acc_original_data)
acc_original_data = eval.evaluate(pred_combined_lr)
print("combined data: ", acc_original_data)

our data:  0.7597765363128491
original data:  0.8178807947019867
combined data:  0.7962577962577962


In [0]:
params = [{
      p.name: v
      for p,
      v in m.items()
   }
   for m in cvModel_lr.getEstimatorParamMaps()
]
import pandas as pd

pd.DataFrame.from_dict([{
      cvModel_lr.getEvaluator().getMetricName(): metric,
      ** ps
   }
   for ps, metric in zip(params, cvModel_lr.avgMetrics)
])

Unnamed: 0,f1,numFeatures,regParam
0,0.7756,10000,0.1
1,0.78716,10000,0.3
2,0.785208,10000,0.5
3,0.784374,20000,0.1
4,0.793844,20000,0.3
5,0.790853,20000,0.5
6,0.781026,50000,0.1
7,0.794987,50000,0.3
8,0.789899,50000,0.5


In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import IndexToString

mislabeled_lr = pred_combined_lr.select('Review', 'polarity', 'real_fake', 'prediction').filter(col("label") != col("prediction"))

i2s = IndexToString(inputCol="prediction", outputCol="prediction string", labels=["real", "fake"])

mislabeled_lr_string = i2s.transform(mislabeled_lr).drop("prediction").withColumnRenamed("real_fake","True Label").withColumnRenamed("prediction string","Predicted Label")
mislabeled_lr_string.cache()
display(mislabeled_lr_string)

Review,polarity,True Label,Predicted Label
"After considering several hotels in the area, my family and I finally decided, unfortunately, on Homewood Suites, for our long weekend vacation. Overall, the experience left much to be desired. The breakfast buffet consisted of cold scrambled eggs and greasy pastries. The linens on one of our beds seemed dirty, and when I told the hotel staff and asked them to be replaced, the woman rolled her eyes and had a very unprofessional attitude. I would not come back here ever.",negative,fake,real
"Ambassador East is an awesome hotel!!! Understand that this hotel is full of history and is a boutique hotel. My family stayed at the Ambassador for two nights in a suite. Very clean room with great space. The best part of the Ambassador is not the rooms though. The staff was extremely courteous, especially the concierge. If you stay here you have to go to the Pump Room. The restaurant is full of nostalgia, the staff is great, and the food is excellent. Great atmosphere and music.",positive,real,fake
"As I walked into the hotel I was greeted warm heartedly. They took my information from me in an efficient manner and got me to my room quickly. I was very pleased to see that my bed was laid out perfectly and that there was plenty of towels in the bathroom. Also, I was happy to see that the shampoo and soap provided was sufficient. Overall, I was very happy with my stay at this hotel.",positive,fake,real
Best hotel in an excellent location I stayed at the Sofitel Chicago Water Tower with my husband and kid as he came there for a Business purpose. My 1 year old kid was jumping on the bed and he really enjoyed the atmosphere of the room. The front desk people were too welcoming and the staffs were very friendly . The room service was also good. The room was very clean and the bathrooms are so good . I like the decor of the room. The French restaurant in Sofitel is so good and we both enjoyed the dining . I will surely prefer to stay in Sofitel on a next time travel to Chicago.,positive,fake,real
"Everything you would would expect from a top-notch hotel and exceeded by the service. The trendy decor and easy to access dining gave us a 'second-wind' pick up after a day of sight-seeing. This hotel is definitely the place that I want to come back to. By the time we left after 4 days and 3 nights, my wife and I wished we had more time to enjoy the restaurant a few more times and the delightful Swiss flare. One of the things we enjoyed the most was seeing the water from our window. The feeling of being pampered can only come from the very gracious hotel staff and this hotel is number 1 in my mind.",positive,fake,real
"For the amount of money per night that the Millennium Knickerbocker Hotel charges, one would at least expect a room with a working bathroom. The toilet wouldn't flush, the sink dripped and the towels were rough and cheap looking. When I called the front desk to ask for maintenance on the sink and toilet, the staff were indifferent. It was over an hour before anyone arrived to fix the bathroom fixtures, and when they did arrive they were unable to stop the drip in the sink. All in all, I was disappointed with my stay.",negative,fake,real
"Friendly staff, clean building and rooms. The hotel was located nearby everything we were interested in seeing. Staff was willing to help give directions and very knowledgable of city. Rooms were beautifully designed and well lit, spacious and comfortable.",positive,fake,real
"Great hotel! Went to see the Museum of Contemporary Art which was great, but this hotel almost had it beat! The rooms (and even halls) are very beautifully done. Great service too. A+",positive,fake,real
"Hi, I had very bad experience with this hotel, as the rooms had bad smell and very dim light. the bed sheets were not properly cleaned. Also the carpet was dirty though AC was working properly. But i would nor recommend this hotel to my friends as it did not satisfied me. There was a problem with hot and cold water in the bathroom. Please do not go to this hotel, it seems very nice from the pictures and all but in actual its different.",negative,fake,real
"Hotel Monaco is simply amazing. I travel quite a bit, and am use to generic hotels, with horribly unimpressive rooms, and ""5 star amenities"" which usually mean a packet of instant coffee, and room service that won't serve after 9pm. Hotel Monaco on the other hand was very impressive. The suite i stayed in was stylish and cozy. This hotel has such personality. My absolute favorite part of my stay, aside from the amazing meal i had down stairs, was the in-room spa treatment. Hotel Monaco just raised the bar on hotels for me. Generic just won't do anymore.",positive,fake,real


# SVC

In [0]:
from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
scaling_pipeline = Pipeline(stages=assemblers + scalers)

hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx', 'length_in_words_scaled', 'average_word_length_scaled', 'capital_char_ratio_scaled', 'long_word_ratio_scaled', 'non_stop_word_ratio_scaled', 'punctuation_ratio_scaled'],outputCol="features")
label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
svc = LinearSVC(featuresCol="features",labelCol="label")
label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

pipeline_svc = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1, assembles, label_strIdx2, svc ,label_idxStr])

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10000,20000,50000]) \
    .addGrid(svc.regParam, [0.0001, 0.001, 0.01, 0.1, 1]) \
    .build()
crossval_svc = CrossValidator(estimator=pipeline_svc, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=2,parallelism = 100 )  # use 3+ folds in practice

In [0]:
cvModel_svc = crossval_svc.fit(combined_train)



In [0]:
pred_old_svc = cvModel_svc.transform(old_test_split)
pred_new_svc = cvModel_svc.transform(new_test_split)
pred_combined_svc = cvModel_svc.transform(combined_test)

pred_old_svc.cache()
pred_new_svc.cache()
pred_combined_svc.cache()

In [0]:
eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
acc_new_data = eval.evaluate(pred_new_svc)
print("new data: ", acc_new_data)
acc_old_data = eval.evaluate(pred_old_svc)
print("old data: ", acc_old_data)
acc_combined_data = eval.evaluate(pred_combined_svc)
print("combined data: ", acc_combined_data)

In [0]:
params = [{
      p.name: v
      for p,
      v in m.items()
   }
   for m in cvModel_svc.getEstimatorParamMaps()
]
import pandas as pd

pd.DataFrame.from_dict([{
      cvModel_svc.getEvaluator().getMetricName(): metric,
      ** ps
   }
   for ps, metric in zip(params, cvModel_svc.avgMetrics)
])

# Random Forest Classifier

In [0]:
#greg

from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
scaling_pipeline = Pipeline(stages=assemblers + scalers)

hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx', 'length_in_words_scaled', 'average_word_length_scaled', 'capital_char_ratio_scaled', 'long_word_ratio_scaled', 'non_stop_word_ratio_scaled', 'punctuation_ratio_scaled'],outputCol="features")
label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
rfc = RandomForestClassifier(featuresCol="features",labelCol="label")
label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

pipeline_rfc = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1 , assembles, label_strIdx2, rfc ,label_idxStr])

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10000,20000,50000]) \
    .addGrid(rfc.maxDepth, [3, 5, 9]) \
    .addGrid(rfc.numTrees, [10, 20, 50]) \
    .build()
crossval_rfc = CrossValidator(estimator=pipeline_rfc, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=2,parallelism = 100 )  # use 3+ folds in practice

In [0]:
cvModel_rfc = crossval_rfc.fit(combined_train)



In [0]:
pred_old_rfc = cvModel_rfc.transform(old_test_split)
pred_new_rfc = cvModel_rfc.transform(new_test_split)
pred_combined_rfc = cvModel_rfc.transform(combined_test)

pred_old_rfc.cache()
pred_new_rfc.cache()
pred_combined_rfc.cache()

Out[66]: DataFrame[Review: string, stemmed: array<string>, sentiment: float, polarity: string, length_in_words: int, average_word_length: float, capital_char_ratio: float, long_word_ratio: float, non_stop_word_ratio: float, punctuation_ratio: float, real_fake: string, length_in_words_vec: vector, average_word_length_vec: vector, capital_char_ratio_vec: vector, long_word_ratio_vec: vector, non_stop_word_ratio_vec: vector, punctuation_ratio_vec: vector, length_in_words_scaled: vector, average_word_length_scaled: vector, capital_char_ratio_scaled: vector, long_word_ratio_scaled: vector, non_stop_word_ratio_scaled: vector, punctuation_ratio_scaled: vector, rawFeatures: vector, TF_IDF: vector, polarity_idx: double, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double, article_class: string]

In [0]:
eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
acc_new_data = eval.evaluate(pred_new_rfc)
print("new data: ", acc_new_data)
acc_old_data = eval.evaluate(pred_old_rfc)
print("old data: ", acc_old_data)
acc_combined_data = eval.evaluate(pred_combined_rfc)
print("combined data: ", acc_combined_data)

new data:  0.7988826815642458
old data:  0.5596026490066225
combined data:  0.6486486486486487


In [0]:
params = [{
      p.name: v
      for p,
      v in m.items()
   }
   for m in cvModel_rfc.getEstimatorParamMaps()
]
import pandas as pd

pd.DataFrame.from_dict([{
      cvModel_rfc.getEvaluator().getMetricName(): metric,
      ** ps
   } for ps, metric in zip(params, cvModel_rfc.avgMetrics)
])

Unnamed: 0,f1,numFeatures,maxDepth,numTrees
0,0.491892,10000,3,10
1,0.48398,10000,3,20
2,0.48631,10000,3,50
3,0.541674,10000,5,10
4,0.504835,10000,5,20
5,0.50508,10000,5,50
6,0.612534,10000,9,10
7,0.606266,10000,9,20
8,0.586217,10000,9,50
9,0.499995,20000,3,10


In [0]:
from pyspark.sql.functions import col

mislabeled_rfc = pred_combined_rfc.select('Review', 'polarity', 'label', 'prediction').filter(col("label") != col("prediction"))
mislabeled_rfc.cache()
display(mislabeled_rfc)

Review,polarity,label,prediction
"A few nights ago, I stayed at the Hotel Allegro in the Theater District of Chicago. I'd heard some amazing things about the great food, and how friendly the staff were. Upon arrival, the service at the front desk and the bellman was fantastic! They treated me as if I stayed there all the time! The rooms were cozy, with very comfortable beds that helped me sleep through my jet lag. I ordered breakfast the next morning, and had room service bring it up. The meal was brought up only a few minutes after ordering, and was delicious! I would recommend these accommodations for anyone traveling, or any families staying in Chicago, as the rooms were very reasonably priced, and is a short distance to lots of great sites. Next time I come through Chicago, I will definitely be staying here!",positive,1.0,0.0
"A friend and I stayed at the Hyatt Regency in Chicago for a weekend while visiting a mutual friend of ours for her Birthday. The hotel at first glance was very nice and charming. But, We quickly found out that was not the case. Our room reeked of smoke when we first stepped in. The bathroom was filthy, it looked as though it had not been cleaned in weeks!! There was mold around the toilet and a film over the mirror. There were also very large stains all over the carpet through the room. We immediately called the front desk to tell about our smelly, unkempt room. The women at the front desk was rude and assured us that what we were smelling ""was not smoke"" because that is not permitted in the hotel. We were also told that room was cleaned that day. After several minutes of arguing with her and a manager, we were moved to a another room which took 2 hours because there were no other clean rooms available at that time. WE had to sit in the lobby with our baggage until one was made available for us. The food we ordered from the restaurant was horrible and WAY over priced. Our waiter was rude and only stopped by our table twice the whole time we were there. All the staff at this hotel seemed unhappy and barley even acknowledged any of the guests. I will never stay here again and I would never recommend this hotel to any one.",negative,1.0,0.0
"A hotel made for royalty. Decked out to the top with luxuries you could only dream of, it is an amazing experience! It's located right in the middle of everything in Chicago, and pulls in all the best attributes of the city into one fun filled experience. The customer service was fantastic, as they were very responsive and polite to my requests and fixed any complaints without a complaint of their own. A beautiful experience that I would certainly do again!",positive,1.0,0.0
"After arriving at the Sofitel Chicago Water Tower hotel I was greeted with rudeness and snubery. My room reservations had been double booked, and instead of the suite I expected, I was given a much smaller guest room. Instead of having a concierge I had cockroaches. The dining left me wishing for fast food. The bar left a bad taste in my mouth with the cheap liquor and hot beer.",negative,1.0,0.0
"After considering several hotels in the area, my family and I finally decided, unfortunately, on Homewood Suites, for our long weekend vacation. Overall, the experience left much to be desired. The breakfast buffet consisted of cold scrambled eggs and greasy pastries. The linens on one of our beds seemed dirty, and when I told the hotel staff and asked them to be replaced, the woman rolled her eyes and had a very unprofessional attitude. I would not come back here ever.",negative,1.0,0.0
After staying at the James Chicago last weekend I can honestly say I will be going for affordability on my next trip. The James Chicago is terribly overpriced and the staff was very cold and disconnected. The chairs in the room looked like something out of Ikea. I love Chicago but this hotel is definitely not one of its better places to stay. I believe I would have been better off and happier staying at the Holiday Inn. You don't always get what you pay for.,negative,1.0,0.0
"Although the InterContinental Chicago Hotel is located on ""The Magnificent Mile,"" our entire experience as guests there was anything but ....believe me, the ""magnificence"" ends at the hotel entrance! From check-in to check-out, our impressions ranged from lackluster to horrific. First, there was an error with our reservation, and once we got to our room, we found that neither our phone nor our internet connection worked, so down I trudged to the registration desk--where I had to wait in line for at least twenty minutes--to ask for help. The first night we spent was nearly totally sleepless due to TV noise coming from the next room; we weren't sure whether it was due to excessively inconsiderate neighbors or thin walls. Then, dog tired the next morning--even though we had hung a ""Do Not Disturb"" sign on our door handle--in barged the maid all bright-eyed and bushy tailed with scrubber in hand. Following that rude awakening, I was jarred again by a blood-curdling shriek from my wife, who, upon entering the bathroom to take her shower, was greeted by an incredibly large roach! Breakfast--actually all meals of which we partook at the hotel--were served cold and were somewhat tasteless. We were routinely ignored by the concierge, who always seems to be on a personal call whenever we needed him. Trying to leave the hotel was equally as difficult as checking in. Of course, there was a gross error in our bill which caused us to be delayed in our effort to depart. Getting out of the InterContinental Chicago was, believe me, the best part of our experience as guests there.",negative,1.0,0.0
"Amalfi Hotel Chicago is just minutes from Navy Pier, Michigan Avenue, the Chicago Opera Theater, The House of Blues, and the hottest night clubs. At $200.00 per night, you can expect the very best from your staff, and your accomodations. When I arrived for my weekend stay, I was greeted by a pleasant surprise of wall art, and eclectic style. After reaching my room, I was able to relax in peace on the numerous pillows. I didn't choose to use the spa this trip, but when I come back, it will be the first item on my list! The business office was great, because I had a last minute emergency with work- thank goodness for cocktail hour. Breakfast was delicious. I have already told a friend from Naperville that the next time I'm in town, she'll have to stay at Amalfi with me, instead of commuting. She really missed out! -Atlanta Girl-",positive,1.0,0.0
"Ambassador East is an awesome hotel!!! Understand that this hotel is full of history and is a boutique hotel. My family stayed at the Ambassador for two nights in a suite. Very clean room with great space. The best part of the Ambassador is not the rooms though. The staff was extremely courteous, especially the concierge. If you stay here you have to go to the Pump Room. The restaurant is full of nostalgia, the staff is great, and the food is excellent. Great atmosphere and music.",positive,0.0,1.0
"As I walked into the hotel I was greeted warm heartedly. They took my information from me in an efficient manner and got me to my room quickly. I was very pleased to see that my bed was laid out perfectly and that there was plenty of towels in the bathroom. Also, I was happy to see that the shampoo and soap provided was sufficient. Overall, I was very happy with my stay at this hotel.",positive,1.0,0.0


In [0]:
print("Best regularization parameter: ", cvModel_rfc.bestModel.stages[6]._java_obj.getRegParam())
print("Best hashing number of features: ", cvModel_rfc.bestModel.stages[1]._java_obj.getNumFeatures())

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JError[0m                                 Traceback (most recent call last)
[0;32m<command-3521022214098944>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m [0mprint[0m[0;34m([0m[0;34m"Best regularization parameter: "[0m[0;34m,[0m [0mcvModel_rfc[0m[0;34m.[0m[0mbestModel[0m[0;34m.[0m[0mstages[0m[0;34m[[0m[0;36m6[0m[0;34m][0m[0;34m.[0m[0m_java_obj[0m[0;34m.[0m[0mgetRegParam[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0mprint[0m[0;34m([0m[0;34m"Best hashing number of features: "[0m[0;34m,[0m [0mcvModel_rfc[0m[0;34m.[0m[0mbestModel[0m[0;34m.[0m[0mstages[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m.[0m[0m_java_obj[0m[0;34m.[0m[0mgetNumFeatures[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py

# Gradient Boosting Classifier

In [0]:
# #greg

# from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import StandardScaler
# from pyspark.ml.classification import LogisticRegression
# from pyspark.ml.classification import RandomForestClassifier
# from pyspark.ml.classification import LinearSVC
# from pyspark.ml.classification import NaiveBayes
# from pyspark.ml.classification import GBTClassifier
# from pyspark.ml.feature import VectorAssembler
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
# assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
# scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
# scaling_pipeline = Pipeline(stages=assemblers + scalers)

# hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures")
# idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
# label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
# assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx', 'length_in_words_scaled', 'average_word_length_scaled', 'capital_char_ratio_scaled', 'long_word_ratio_scaled', 'non_stop_word_ratio_scaled', 'punctuation_ratio_scaled'],outputCol="features")
# label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
# gbtc = GBTClassifier(featuresCol="features",labelCol="label")
# label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

# pipeline_gbtc = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1 , assembles, label_strIdx2, gbtc ,label_idxStr])

In [0]:
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# paramGrid = ParamGridBuilder() \
#     .addGrid(hashingTF.numFeatures, [10000,16384,32768]) \
#     .addGrid(gbtc.maxDepth, [3, 5, 7]) \
#     .addGrid(gbtc.stepSize, [0.1, 0.5, 1.0]) \
#     .build()
# crossval_gbtc = CrossValidator(estimator=pipeline_gbtc, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=2,parallelism = 100 )  # use 3+ folds in practice

In [0]:
# cvModel_gbtc = crossval_gbtc.fit(combined_train)

In [0]:
# pred_old_gbtc = cvModel_gbtc.transform(old_test_split)
# pred_new_gbtc = cvModel_gbtc.transform(new_test_split)
# pred_combined_gbtc = cvModel_gbtc.transform(combined_test)

# pred_old_gbtc.cache()
# pred_new_gbtc.cache()
# pred_combined_gbtc.cache()

In [0]:
# eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
# acc_new_data = eval.evaluate(pred_new_gbtc)
# print("new data: ", acc_new_data)
# acc_old_data = eval.evaluate(pred_old_gbtc)
# print("old data: ", acc_old_data)
# acc_combined_data = eval.evaluate(pred_combined_gbtc)
# print("combined data: ", acc_combined_data)

In [0]:
# params = [{
#       p.name: v
#       for p,
#       v in m.items()
#    }
#    for m in cvModel_gbtc.getEstimatorParamMaps()
# ]
# import pandas as pd

# pd.DataFrame.from_dict([{
#       cvModel_gbtc.getEvaluator().getMetricName(): metric,
#       ** ps
#    } for ps, metric in zip(params, cvModel_gbtc.avgMetrics)
# ])

In [0]:
# print("Best regularization parameter: ", cvModel_gbtc.bestModel.stages[6]._java_obj.getRegParam())
# print("Best hashing number of features: ", cvModel_gbtc.bestModel.stages[1]._java_obj.getNumFeatures())

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-3521022214098952>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m [0mprint[0m[0;34m([0m[0;34m"Best regularization parameter: "[0m[0;34m,[0m [0mcvModel_gbtc[0m[0;34m.[0m[0mbestModel[0m[0;34m.[0m[0mstages[0m[0;34m[[0m[0;36m6[0m[0;34m][0m[0;34m.[0m[0m_java_obj[0m[0;34m.[0m[0mgetRegParam[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0mprint[0m[0;34m([0m[0;34m"Best hashing number of features: "[0m[0;34m,[0m [0mcvModel_gbtc[0m[0;34m.[0m[0mbestModel[0m[0;34m.[0m[0mstages[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m.[0m[0m_java_obj[0m[0;34m.[0m[0mgetNumFeatures[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;31mNameError[0m: name 'cvModel_gbtc' is not defined