### Create SPARK_HOME and PYLIB env var and update PATH env var

In [1]:
import os
import sys
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.4-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

### Initializing Spark

In [2]:
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark = SparkSession.builder\
    .appName("SparkML")\
    .master('local[*]')\
    .enableHiveSupport()\
    .config('spark.sql.warehouse.dir', 'hdfs://bigdata:8020/user/2105B44/spark-warehouse')\
    .getOrCreate()
sc = spark.sparkContext

### Loading the dependent libraries

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.sql.functions import isnan, when, count, col, countDistinct

## Sqoop commands


### importing data using Sqoop
sqoop job \
--create importDataall \
-- import-all-tables \
--connect jdbc:mysql://172.16.0.240/insofe_b44_phd_data \
--username insofeadmin \
--P \
--warehouse-dir '/user/2105B44/B44/PHD_DATASET/' \
-m 1

In [4]:
! hdfs dfs -ls 

Found 14 items
drwx------   - 2105B44 2105B44          0 2018-11-17 11:30 .Trash
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-04 10:11 .hiveJars
drwx------   - 2105B44 2105B44          0 2018-11-17 10:55 .staging
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-17 10:54 B44
drwxr-xr-x   - 2105B44 2105B44          0 2018-09-23 16:56 Batch44
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-04 16:37 _sqoop
-rw-r--r--   3 2105B44 2105B44        263 2018-11-10 11:54 bulkload.txt
-rw-r--r--   3 2105B44 2105B44   27595240 2018-11-11 11:52 cute_dataset_final1.csv
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-04 17:27 employeesDB
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-04 16:13 insofe_empdb
drwxr-xr-x   - 2105B44 2105B44          0 2018-09-30 15:16 mapreduce-input
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-04 16:25 new_emp
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-04 12:28 pig_data
drwxr-xr-x   - 2105B44 2105B44          0 2018-10-07 12:03 spark

# Viewing DATA

In [5]:
! hdfs dfs -cat /user/2105B44/B44

cat: `/user/2105B44/B44': Is a directory


In [6]:
! hdfs dfs -cat /user/2105B44/B44/PHD_DATASET/b44_phd_train

cat: `/user/2105B44/B44/PHD_DATASET/b44_phd_train': Is a directory


In [7]:
! hdfs dfs -ls /user/2105B44/B44/PHD_DATASET/

Found 2 items
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-17 10:54 /user/2105B44/B44/PHD_DATASET/b44_phd_test
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-17 10:55 /user/2105B44/B44/PHD_DATASET/b44_phd_train


In [8]:
! hdfs dfs -cat /user/2105B44/B44/PHD_DATASET/b44_phd_train/part-m-00000 | head

3,Not sure why there are such bad reviews for this location. As far as Starbucks go, it is pretty average (not especially bad). They get points knocked off due to the small size and lack of sitting/lounging space. You can only walk up or drive thru at this location :( But the drive thru is quick enough. I've never had any issues with my order getting mixed up or attitude from the baristas. I mostly visit in evenings or afternoon and it is never busy.  I've also been during morning hours, but not super early... so I can't comment on early morning rush.Seems like this low rating is partially skewed from people who don't like Starbucks coffee in general... understood, but I wish we could just filter out those reviews.  After all, we all know what Starbucks IS and ISN'T by now. Some of us need an accurate review of the location without coffee snobbery polluting the rating :P (see there ^^^ I included the smiley so my snobby coffee brethren can't take offense)
5,This is Jersey Boys as in F

it doesnt looks as csv file

In [9]:
! hdfs dfs -ls /user/2105B44/B44/PHD_DATASET/

Found 2 items
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-17 10:54 /user/2105B44/B44/PHD_DATASET/b44_phd_test
drwxr-xr-x   - 2105B44 2105B44          0 2018-11-17 10:55 /user/2105B44/B44/PHD_DATASET/b44_phd_train


In [10]:
! hdfs dfs -ls /user/2105B44/B44/PHD_DATASET/b44_phd_train

Found 2 items
-rw-r--r--   3 2105B44 2105B44          0 2018-11-17 10:55 /user/2105B44/B44/PHD_DATASET/b44_phd_train/_SUCCESS
-rw-r--r--   3 2105B44 2105B44   71552229 2018-11-17 10:55 /user/2105B44/B44/PHD_DATASET/b44_phd_train/part-m-00000


In [11]:
! hdfs dfs -ls /user/2105B44/B44/PHD_DATASET/b44_phd_test

Found 2 items
-rw-r--r--   3 2105B44 2105B44          0 2018-11-17 10:54 /user/2105B44/B44/PHD_DATASET/b44_phd_test/_SUCCESS
-rw-r--r--   3 2105B44 2105B44   36411941 2018-11-17 10:54 /user/2105B44/B44/PHD_DATASET/b44_phd_test/part-m-00000


### Reading the data and creating a dataframe

In [12]:
df_Schema = StructType([
   StructField("Rating", IntegerType(), True),
   StructField("Review", StringType(), True)])

In [13]:
trainDF = spark.read.csv(header=False,
                         inferSchema=False,
                         schema = df_Schema,
                         path="/user/2105B44/B44/PHD_DATASET/b44_phd_train",
                         ignoreLeadingWhiteSpace = True, 
                         ignoreTrailingWhiteSpace = True,
                         nullValue =True
                          )        


In [14]:
trainDF.show(2)

+------+--------------------+
|Rating|              Review|
+------+--------------------+
|     3|Not sure why ther...|
|     5|This is Jersey Bo...|
+------+--------------------+
only showing top 2 rows



In [15]:
trainDF.head(2)

[Row(Rating=3, Review=u'Not sure why there are such bad reviews for this location. As far as Starbucks go'),
 Row(Rating=5, Review=u"This is Jersey Boys as in Frankie Valli and the 4 Seasons.  If you're looking for gym")]

## In trainDF  the review text is not complete. its not reading the complete review text. 

In [16]:
trainDF.dtypes

[('Rating', 'int'), ('Review', 'string')]

In [17]:
type(trainDF)

pyspark.sql.dataframe.DataFrame

In [18]:
trainDF.count()

100097

 ## The data is read into new dataframe trainDF2 for complete review text

In [19]:
trainDF2 = spark.read.csv(header=False,
                         inferSchema=True,
                         path="/user/2105B44/B44/PHD_DATASET/b44_phd_train",
                         ignoreLeadingWhiteSpace = False, 
                         ignoreTrailingWhiteSpace = False, 
                          nanValue = "", 
                          nullValue = ""  
                          )        


In [20]:
trainDF2.show(2)

+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|                 _c1|                 _c2|                 _c3|                 _c4|                 _c5|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  3|Not sure why ther...| it is pretty ave...| but not super ea...| but I wish we co...| we all know what...|
|  5|This is Jersey Bo...|                 tan| laundry and fist...| magic or boobie ...| the beginning is...|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



### review text is split into 5 coloum

In [21]:
# checking Datatypes of coloums
trainDF2.dtypes

[('_c0', 'string'),
 ('_c1', 'string'),
 ('_c2', 'string'),
 ('_c3', 'string'),
 ('_c4', 'string'),
 ('_c5', 'string')]

In [22]:
trainDF2.count()

100097

## check for na and null values

In [23]:
# check nan values
from pyspark.sql.functions import isnan, when, count, col

trainDF2.select([count(when(isnan(c), c)).alias(c) for c in trainDF2.columns]).show()

+---+---+---+---+---+---+
|_c0|_c1|_c2|_c3|_c4|_c5|
+---+---+---+---+---+---+
|  0|  0|  0|  0|  0|  0|
+---+---+---+---+---+---+



### the data doesnt has any nan values

In [24]:
# check null values
trainDF2.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in trainDF2.columns]).show()

+---+---+-----+-----+-----+-----+
|_c0|_c1|  _c2|  _c3|  _c4|  _c5|
+---+---+-----+-----+-----+-----+
|  0| 30|23937|38763|50917|60350|
+---+---+-----+-----+-----+-----+



### the review text has many null value as seen in coloum _c1 to _c5

## if coloum _c1 is null that means complete review is null for that rating.

In [25]:
trainDF2.filter(trainDF2._c1.isNull()).count()

30

## There are 30 null in coloum _c1. we have to drop null only in coloum _c1 as that corresponds to null reviews

In [26]:
# droping rows having null values only in coloum _c1
trainDF2 = trainDF2.filter(trainDF2._c1.isNotNull())

In [27]:
# check null values after droping null from coloum _c1
trainDF2.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in trainDF2.columns]).show()

+---+---+-----+-----+-----+-----+
|_c0|_c1|  _c2|  _c3|  _c4|  _c5|
+---+---+-----+-----+-----+-----+
|  0|  0|23907|38733|50887|60320|
+---+---+-----+-----+-----+-----+



## 30 rows are droped 

In [28]:
trainDF2.show(3)

+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|                 _c1|                 _c2|                 _c3|                 _c4|                 _c5|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  3|Not sure why ther...| it is pretty ave...| but not super ea...| but I wish we co...| we all know what...|
|  5|This is Jersey Bo...|                 tan| laundry and fist...| magic or boobie ...| the beginning is...|
|  1|"I am curious kno...|                null|                null|                null|                null|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [29]:
# check null values
trainDF2.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in trainDF2.columns]).show()

+---+---+-----+-----+-----+-----+
|_c0|_c1|  _c2|  _c3|  _c4|  _c5|
+---+---+-----+-----+-----+-----+
|  0|  0|23907|38733|50887|60320|
+---+---+-----+-----+-----+-----+



## fill null values to avoid getting empty reviews after concatinate

In [30]:
# fill "_" to null cell
trainDF3 = trainDF2.na.fill("_")

In [31]:
# check null values
trainDF3.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in trainDF3.columns]).show()

+---+---+---+---+---+---+
|_c0|_c1|_c2|_c3|_c4|_c5|
+---+---+---+---+---+---+
|  0|  0|  0|  0|  0|  0|
+---+---+---+---+---+---+



In [32]:
trainDF3.count()

100067

## A new dataframe is created with filling "_"  to empty cells

## We have to concatnate all 5 coloums to single colum of text

In [33]:
import pyspark
from pyspark.sql import functions as sf

# concatinating _c1 and _c2 to new coloum reviews

trainDF3 = trainDF3.withColumn('reviews', 
                    sf.concat(sf.col('_c1'),sf.lit('_'), sf.col('_c2')))
trainDF3.head(1)

[Row(_c0=u'3', _c1=u'Not sure why there are such bad reviews for this location. As far as Starbucks go', _c2=u" it is pretty average (not especially bad). They get points knocked off due to the small size and lack of sitting/lounging space. You can only walk up or drive thru at this location :( But the drive thru is quick enough. I've never had any issues with my order getting mixed up or attitude from the baristas. I mostly visit in evenings or afternoon and it is never busy.  I've also been during morning hours", _c3=u" but not super early... so I can't comment on early morning rush.Seems like this low rating is partially skewed from people who don't like Starbucks coffee in general... understood", _c4=u' but I wish we could just filter out those reviews.  After all', _c5=u" we all know what Starbucks IS and ISN'T by now. Some of us need an accurate review of the location without coffee snobbery polluting the rating :P (see there ^^^ I included the smiley so my snobby coffee brethren

In [34]:
# concanitaing all coloums 5 coloums to reviews coloum

trainDF3 = trainDF3.withColumn('reviews', 
                    sf.concat(sf.col('reviews'),sf.lit('_'), sf.col('_c3')))

trainDF3 = trainDF3.withColumn('reviews', 
                    sf.concat(sf.col('reviews'),sf.lit('_'), sf.col('_c4')))

trainDF3 = trainDF3.withColumn('reviews', 
                    sf.concat(sf.col('reviews'),sf.lit('_'), sf.col('_c5')))



In [35]:
trainDF3.head(1)

[Row(_c0=u'3', _c1=u'Not sure why there are such bad reviews for this location. As far as Starbucks go', _c2=u" it is pretty average (not especially bad). They get points knocked off due to the small size and lack of sitting/lounging space. You can only walk up or drive thru at this location :( But the drive thru is quick enough. I've never had any issues with my order getting mixed up or attitude from the baristas. I mostly visit in evenings or afternoon and it is never busy.  I've also been during morning hours", _c3=u" but not super early... so I can't comment on early morning rush.Seems like this low rating is partially skewed from people who don't like Starbucks coffee in general... understood", _c4=u' but I wish we could just filter out those reviews.  After all', _c5=u" we all know what Starbucks IS and ISN'T by now. Some of us need an accurate review of the location without coffee snobbery polluting the rating :P (see there ^^^ I included the smiley so my snobby coffee brethren

In [36]:
trainDF3.show(10)

+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|                 _c1|                 _c2|                 _c3|                 _c4|                 _c5|             reviews|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  3|Not sure why ther...| it is pretty ave...| but not super ea...| but I wish we co...| we all know what...|Not sure why ther...|
|  5|This is Jersey Bo...|                 tan| laundry and fist...| magic or boobie ...| the beginning is...|This is Jersey Bo...|
|  1|"I am curious kno...|                   _|                   _|                   _|                   _|"I am curious kno...|
|  3|Wynn oh how I wan...|           hip decor|        mosaic tiles| and high fashion...| but what can you...|Wynn oh how I wan...|
|  2|I took my kid in ...| she has VERY CUR...| Sasha complained...|        

In [37]:
# drop 5 coloums '_c1', '_c2', '_c3', '_c4', '_c5'
trainDF3 = trainDF3.drop('_c1', '_c2', '_c3', '_c4', '_c5')

In [38]:
trainDF3.head(1)

[Row(_c0=u'3', reviews=u"Not sure why there are such bad reviews for this location. As far as Starbucks go_ it is pretty average (not especially bad). They get points knocked off due to the small size and lack of sitting/lounging space. You can only walk up or drive thru at this location :( But the drive thru is quick enough. I've never had any issues with my order getting mixed up or attitude from the baristas. I mostly visit in evenings or afternoon and it is never busy.  I've also been during morning hours_ but not super early... so I can't comment on early morning rush.Seems like this low rating is partially skewed from people who don't like Starbucks coffee in general... understood_ but I wish we could just filter out those reviews.  After all_ we all know what Starbucks IS and ISN'T by now. Some of us need an accurate review of the location without coffee snobbery polluting the rating :P (see there ^^^ I included the smiley so my snobby coffee brethren can't take offense)")]

In [39]:
trainDF3.show(10)

+---+--------------------+
|_c0|             reviews|
+---+--------------------+
|  3|Not sure why ther...|
|  5|This is Jersey Bo...|
|  1|"I am curious kno...|
|  3|Wynn oh how I wan...|
|  2|I took my kid in ...|
|  5|There is not a si...|
|  2|Not that authenti...|
|  3|So_ the BF and I ...|
|  1|"Really, really p...|
|  4|This is little mo...|
+---+--------------------+
only showing top 10 rows



In [40]:
trainDF3.dtypes

[('_c0', 'string'), ('reviews', 'string')]

In [41]:
trainDF3.show(2)

+---+--------------------+
|_c0|             reviews|
+---+--------------------+
|  3|Not sure why ther...|
|  5|This is Jersey Bo...|
+---+--------------------+
only showing top 2 rows



In [42]:
# importing packages
from pyspark.ml.feature import HashingTF, IDF, Tokenizer,RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline


from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer

In [43]:
# the review text is tokenized
reg_tokenizer = RegexTokenizer(inputCol="reviews", outputCol="words", pattern="[^a-zA-Z]")

In [44]:
# stop words are removed
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered")

In [45]:
# bag of words count is created
count_vectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [46]:
label_stringIdx = StringIndexer(inputCol = "_c0", outputCol = "label")


In [47]:
pipeline = Pipeline(stages=[label_stringIdx,reg_tokenizer, stop_words_remover, count_vectors])

In [48]:
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(trainDF3)
dataset = pipelineFit.transform(trainDF3)
dataset.show(5)

+---+--------------------+-----+--------------------+--------------------+--------------------+
|_c0|             reviews|label|               words|            filtered|            features|
+---+--------------------+-----+--------------------+--------------------+--------------------+
|  3|Not sure why ther...|  2.0|[not, sure, why, ...|[sure, bad, revie...|(10000,[3,5,9,12,...|
|  5|This is Jersey Bo...|  4.0|[this, is, jersey...|[jersey, boys, fr...|(10000,[2,3,4,9,1...|
|  1|"I am curious kno...|  0.0|[i, am, curious, ...|[curious, know, m...|(10000,[25,29,42,...|
|  3|Wynn oh how I wan...|  2.0|[wynn, oh, how, i...|[wynn, oh, want, ...|(10000,[2,8,11,12...|
|  2|I took my kid in ...|  1.0|[i, took, my, kid...|[took, kid, wash,...|(10000,[11,58,72,...|
+---+--------------------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [49]:
dataset.head(1)

[Row(_c0=u'3', reviews=u"Not sure why there are such bad reviews for this location. As far as Starbucks go_ it is pretty average (not especially bad). They get points knocked off due to the small size and lack of sitting/lounging space. You can only walk up or drive thru at this location :( But the drive thru is quick enough. I've never had any issues with my order getting mixed up or attitude from the baristas. I mostly visit in evenings or afternoon and it is never busy.  I've also been during morning hours_ but not super early... so I can't comment on early morning rush.Seems like this low rating is partially skewed from people who don't like Starbucks coffee in general... understood_ but I wish we could just filter out those reviews.  After all_ we all know what Starbucks IS and ISN'T by now. Some of us need an accurate review of the location without coffee snobbery polluting the rating :P (see there ^^^ I included the smiley so my snobby coffee brethren can't take offense)", label

In [50]:
type(dataset)

pyspark.sql.dataframe.DataFrame

In [51]:
dataset.dtypes

[('_c0', 'string'),
 ('reviews', 'string'),
 ('label', 'double'),
 ('words', 'array<string>'),
 ('filtered', 'array<string>'),
 ('features', 'vector')]

# importing test data

In [52]:
testdf_Schema = StructType([
   StructField("rating", StringType(), True),
   StructField("reviews", StringType(), True)])
   

In [53]:
testDF = spark.read.csv(header=False,
                         inferSchema=False,
                         schema = testdf_Schema,
                         path="/user/2105B44/B44/PHD_DATASET/b44_phd_test",
                         ignoreLeadingWhiteSpace = True, 
                         ignoreTrailingWhiteSpace = True,
                         nullValue =True
                          )        


In [54]:
testDF.show(3)

+------+--------------------+
|rating|             reviews|
+------+--------------------+
|     1|"I got 'new' tire...|
|     1|Don't waste your ...|
|     1|All I can say is ...|
+------+--------------------+
only showing top 3 rows



In [55]:
testDF.dtypes

[('rating', 'string'), ('reviews', 'string')]

In [56]:
# check nan values
from pyspark.sql.functions import isnan, when, count, col

testDF.select([count(when(isnan(c), c)).alias(c) for c in testDF.columns]).show()


+------+-------+
|rating|reviews|
+------+-------+
|     0|      0|
+------+-------+



In [57]:

# check null values
testDF.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in testDF.columns]).show()



+------+-------+
|rating|reviews|
+------+-------+
|     0|     12|
+------+-------+



In [58]:
# droping rows having null values only in coloum reviews
testDF = testDF.filter(testDF.reviews.isNotNull())

In [59]:

# check null values
testDF.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in testDF.columns]).show()



+------+-------+
|rating|reviews|
+------+-------+
|     0|      0|
+------+-------+



In [60]:
# importing packages
from pyspark.ml.feature import HashingTF, IDF, Tokenizer,RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline


from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# the review text is tokenized
reg_tokenizer = RegexTokenizer(inputCol="reviews", outputCol="words", pattern="[^a-zA-Z]")

# stop words are removed
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# bag of words count is created
count_vectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "rating", outputCol = "label")

pipeline_test = Pipeline(stages=[label_stringIdx,reg_tokenizer, stop_words_remover, count_vectors])

# Fit the pipeline to test data.
pipelineFit_test = pipeline_test.fit(testDF)
test_dataset = pipelineFit_test.transform(testDF)
test_dataset.show(5)

+------+--------------------+-----+--------------------+--------------------+--------------------+
|rating|             reviews|label|               words|            filtered|            features|
+------+--------------------+-----+--------------------+--------------------+--------------------+
|     1|"I got 'new' tire...|  2.0|[i, got, new, tir...|[got, new, tires,...|(10000,[7,8,10,11...|
|     1|Don't waste your ...|  2.0|[don, t, waste, y...|[waste, time, two...|(10000,[5,7,16,25...|
|     1|All I can say is ...|  2.0|[all, i, can, say...|[say, worst, peop...|(10000,[0,25,47,6...|
|     1|I have been to th...|  2.0|[i, have, been, t...|[restaurant, twic...|(10000,[0,1,7,10,...|
|     1|Food was NOT GOOD...|  2.0|[food, was, not, ...|[food, good, husb...|(10000,[1,2,3,7,8...|
+------+--------------------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



# Building Model

In [61]:
# import package
from pyspark.ml.classification import LogisticRegression

## logistic model

In [62]:
# building logestic model
lr = LogisticRegression(featuresCol="features", labelCol='label')

In [63]:
# fitting model to test data
lrModel = lr.fit(test_dataset)


In [64]:
# predicting 
predictions = lrModel.transform(test_dataset)

In [65]:
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(testDF.count())
accuracy

0.778350927703135

In [66]:
print("the prediction accuracy is ",accuracy*100)

('the prediction accuracy is ', 77.8350927703135)


# ('the prediction accuracy is ', 77.8350927703135)

# clustering

In [67]:
%matplotlib inline
from copy import deepcopy
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

In [68]:
data_km = test_dataset

In [70]:
data_km.show(2)

+------+--------------------+-----+--------------------+--------------------+--------------------+
|rating|             reviews|label|               words|            filtered|            features|
+------+--------------------+-----+--------------------+--------------------+--------------------+
|     1|"I got 'new' tire...|  2.0|[i, got, new, tir...|[got, new, tires,...|(10000,[7,8,10,11...|
|     1|Don't waste your ...|  2.0|[don, t, waste, y...|[waste, time, two...|(10000,[5,7,16,25...|
+------+--------------------+-----+--------------------+--------------------+--------------------+
only showing top 2 rows

