In [1]:

#create spark session
from pyspark.sql import SQLContext

from pyspark.sql import SparkSession

# Start (or reuse) the SparkSession for this notebook; the application name is 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import HashingTF,IDF
# Load the reviews CSV: first row is the header, fields are comma-separated,
# and Spark infers the column types.
text_df = spark.read.csv(
    'Movie_reviews.csv',
    header=True,
    inferSchema=True,
    sep=',',
)


In [11]:
# Print the column names and types of the freshly loaded frame.
text_df.printSchema()


root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [12]:
# Number of rows in text_df at this point in the notebook.
text_df.count()

7087

In [6]:
# Keep only the rows whose Sentiment is the string '1' or '0'; all other rows are discarded.
text_df = text_df.filter(text_df['Sentiment'].isin('1', '0'))


In [7]:
# Class balance: number of rows per Sentiment value.
(
    text_df
    .groupBy('Sentiment')
    .count()
    .show()
)


+---------+-----+
|Sentiment|count|
+---------+-----+
|        0| 3081|
|        1| 3909|
+---------+-----+



In [8]:
# Print the schema again, this time for the frame left after the Sentiment filter.
text_df.printSchema()


root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [9]:
# Convert the string Sentiment column into a float 'Label' column and drop the original.
# The guard keeps the cell re-runnable: once 'Sentiment' has been dropped there is
# nothing left to convert, and touching text_df['Sentiment'] would raise.
if 'Sentiment' in text_df.columns:
    text_df = (
        text_df
        .withColumn("Label", text_df['Sentiment'].cast('float'))
        .drop('Sentiment')
    )


In [11]:
# Preview the first 10 rows without truncating long review text.
text_df.show(n=10, truncate=False)


+------------------------------------------------------------------------+-----+
|Review                                                                  |Label|
+------------------------------------------------------------------------+-----+
|The Da Vinci Code book is just awesome.                                 |1.0  |
|this was the first clive cussler i've ever read, but even books like Rel|1.0  |
|i liked the Da Vinci Code a lot.                                        |1.0  |
|i liked the Da Vinci Code a lot.                                        |1.0  |
|I liked the Da Vinci Code but it ultimatly didn't seem to hold it's own.|1.0  |
|that's not even an exaggeration ) and at midnight we went to Wal-Mart to|1.0  |
|I loved the Da Vinci Code, but now I want something better and different|1.0  |
|i thought da vinci code was great, same with kite runner.               |1.0  |
|The Da Vinci Code is actually a good movie...                           |1.0  |
|I thought the Da Vinci Code

In [16]:
# Rows per class after the cast. The column is referenced with the exact case it
# was created with ('Label'), so the lookup does not depend on Spark's
# case-insensitive column resolution.
text_df.groupBy('Label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  1.0| 3909|
|  0.0| 3081|
+-----+-----+



In [26]:
from pyspark.sql.functions import length
from pyspark.sql.functions import rand 

In [20]:
# Add a 'length' column holding the character count of each review.
review_length = length(text_df.Review)
text_df = text_df.withColumn('length', review_length)

In [28]:
# Show 10 rows in random order as a spot check of reviews, labels and lengths.
# A fixed seed is passed to rand() so repeated runs show the same sample.
text_df.orderBy(rand(seed=42)).show(n=10, truncate=False)


+------------------------------------------------------------------------+-----+------+
|Review                                                                  |Label|length|
+------------------------------------------------------------------------+-----+------+
|So Brokeback Mountain was really depressing.                            |0.0  |44    |
|yea, i saw Mission Impossible 3 too, that was awesome!!                 |1.0  |55    |
|I either LOVE Brokeback Mountain or think it's great that homosexuality |1.0  |71    |
|This quiz sucks and Harry Potter sucks ok bye..                         |0.0  |47    |
|I love the theme song of mission impossible * *.                        |1.0  |48    |
|I love Harry Potter..                                                   |1.0  |21    |
|Then snuck into Brokeback Mountain, which is the most depressing movie I|0.0  |72    |
|I love Harry Potter...                                                  |1.0  |22    |
|by the way, the Da Vinci Code s

In [25]:
# Mean review length per class. The aggregated column is referenced as 'length',
# the exact name it was created with, rather than relying on case-insensitive
# column resolution.
text_df.groupBy('Label').agg({'length': 'mean'}).show()

+-----+-----------------+
|Label|      avg(Length)|
+-----+-----------------+
|  1.0|47.61882834484523|
|  0.0|50.95845504706264|
+-----+-----------------+



### Data Cleaning

In [59]:

from pyspark.ml.feature import Tokenizer

In [60]:

# Tokenizer stage: reads the 'Review' text column and writes the result to 'tokens'.
tokenization = Tokenizer(
    inputCol='Review',
    outputCol='tokens',
)

In [61]:
# Apply the tokenizer to text_df; the result carries the extra 'tokens' column.
tokenized_df = tokenization.transform(text_df)

In [62]:
# Preview the first 15 tokenized rows.
tokenized_df.show(n=15)

+--------------------+-----+------+--------------------+
|              Review|Label|length|              tokens|
+--------------------+-----+------+--------------------+
|The Da Vinci Code...|  1.0|    39|[the, da, vinci, ...|
|this was the firs...|  1.0|    72|[this, was, the, ...|
|i liked the Da Vi...|  1.0|    32|[i, liked, the, d...|
|i liked the Da Vi...|  1.0|    32|[i, liked, the, d...|
|I liked the Da Vi...|  1.0|    72|[i, liked, the, d...|
|that's not even a...|  1.0|    72|[that's, not, eve...|
|I loved the Da Vi...|  1.0|    72|[i, loved, the, d...|
|i thought da vinc...|  1.0|    57|[i, thought, da, ...|
|The Da Vinci Code...|  1.0|    45|[the, da, vinci, ...|
|I thought the Da ...|  1.0|    51|[i, thought, the,...|
|The Da Vinci Code...|  1.0|    68|[the, da, vinci, ...|
|The Da Vinci Code...|  1.0|    62|[the, da, vinci, ...|
|then I turn on th...|  1.0|    66|[then, i, turn, o...|
|The Da Vinci Code...|  1.0|    34|[the, da, vinci, ...|
|i love da vinci c...|  1.0|   

In [63]:
from pyspark.ml.feature import StopWordsRemover


In [64]:
# Stop-word removal stage: reads 'tokens' and writes the filtered list to 'refined_tokens'.
stopword_removal = StopWordsRemover(
    inputCol='tokens',
    outputCol='refined_tokens',
)


In [65]:
# Apply stop-word removal to the tokenized frame; the result carries the extra 'refined_tokens' column.
refined_text_df = stopword_removal.transform(tokenized_df)

In [66]:
# Preview the frame with both 'tokens' and 'refined_tokens' to compare them side by side.
refined_text_df.show()

+--------------------+-----+------+--------------------+--------------------+
|              Review|Label|length|              tokens|      refined_tokens|
+--------------------+-----+------+--------------------+--------------------+
|The Da Vinci Code...|  1.0|    39|[the, da, vinci, ...|[da, vinci, code,...|
|this was the firs...|  1.0|    72|[this, was, the, ...|[first, clive, cu...|
|i liked the Da Vi...|  1.0|    32|[i, liked, the, d...|[liked, da, vinci...|
|i liked the Da Vi...|  1.0|    32|[i, liked, the, d...|[liked, da, vinci...|
|I liked the Da Vi...|  1.0|    72|[i, liked, the, d...|[liked, da, vinci...|
|that's not even a...|  1.0|    72|[that's, not, eve...|[even, exaggerati...|
|I loved the Da Vi...|  1.0|    72|[i, loved, the, d...|[loved, da, vinci...|
|i thought da vinc...|  1.0|    57|[i, thought, da, ...|[thought, da, vin...|
|The Da Vinci Code...|  1.0|    45|[the, da, vinci, ...|[da, vinci, code,...|
|I thought the Da ...|  1.0|    51|[i, thought, the,...|[thought