In [None]:
from google.colab import drive

drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

In [None]:
import findspark
findspark.init()

In [None]:
import numpy as np
from numpy.random import uniform as u
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql import *
from pyspark.sql.types import *
from zipfile import ZipFile
import itertools
import re
sc = pyspark.SparkContext('local[*]')

spark = pyspark.sql.session.SparkSession.builder.enableHiveSupport().getOrCreate()

In [None]:
##inserting data zip file
df = ZipFile("/content/gdrive/My Drive/moviereviews.zip")
df.extractall(r'output')

In [None]:
df

<zipfile.ZipFile filename='/content/gdrive/My Drive/moviereviews.zip' mode='r'>

In [None]:
#getting files from the folder
files = os.listdir('output/moviereviews')
files = [x for x in files if 'txt' in x]

In [None]:
print(files)
print(len(files))

['cv231_10425.txt', 'cv473_7367.txt', 'cv337_29181.txt', 'cv550_22211.txt', 'cv088_25274.txt', 'cv599_20988.txt', 'cv961_5578.txt', 'cv109_22599.txt', 'cv347_14722.txt', 'cv081_18241.txt', 'cv254_5870.txt', 'cv097_24970.txt', 'cv128_29627.txt', 'cv724_15265.txt', 'cv115_25396.txt', 'cv626_7907.txt', 'cv508_16006.txt', 'cv565_29403.txt', 'cv784_16077.txt', 'cv407_22637.txt', 'cv974_24303.txt', 'cv835_20531.txt', 'cv672_27988.txt', 'cv959_14611.txt', 'cv133_18065.txt', 'cv280_8267.txt', 'cv761_13769.txt', 'cv631_4782.txt', 'cv155_7308.txt', 'cv565_29572.txt', 'cv221_2695.txt', 'cv681_9692.txt', 'cv496_10530.txt', 'cv542_18980.txt', 'cv378_21982.txt', 'cv504_29243.txt', 'cv036_18385.txt', 'cv459_20319.txt', 'cv446_11353.txt', 'cv319_16459.txt', 'cv019_14482.txt', 'cv153_11607.txt', 'cv272_20313.txt', 'cv739_12179.txt', 'cv078_14730.txt', 'cv896_16071.txt', 'cv323_29633.txt', 'cv087_2145.txt', 'cv633_29837.txt', 'cv140_7963.txt', 'cv554_14678.txt', 'cv400_19220.txt', 'cv582_6678.txt', 'cv3

In [None]:
## getting positive & negative files content
positive_sentiment = sc.textFile('/content/gdrive/My Drive/pos.txt')
negative_sentiment = sc.textFile('/content/gdrive/My Drive/neg.txt')
positive = positive_sentiment.flatMap(lambda x:x.lower().split()).collect()
negative = negative_sentiment.flatMap(lambda x:x.lower().split()).collect()

In [None]:
print(positive)

['abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation', 'accolade', 'accolades', 'accommodative', 'accomodative', 'accomplish', 'accomplished', 'accomplishment', 'accomplishments', 'accurate', 'accurately', 'achievable', 'achievement', 'achievements', 'achievible', 'acumen', 'adaptable', 'adaptive', 'adequate', 'adjustable', 'admirable', 'admirably', 'admiration', 'admire', 'admirer', 'admiring', 'admiringly', 'adorable', 'adore', 'adored', 'adorer', 'adoring', 'adoringly', 'adroit', 'adroitly', 'adulate', 'adulation', 'adulatory', 'advanced', 'advantage', 'advantageous', 'advantageously', 'advantages', 'adventuresome', 'adventurous', 'advocate', 'advocated', 'advocates', 'affability', 'affable', 'affably', 'affectation', 'affection', 'affectionate', 'affinity', 'affirm', 'affirmation', 'affirmative', 'affluence', 'affluent', 'afford', 'affordable', 'affordably', 'afordable', 'agile', 'agilely', 'agility', 'agreeable', 'agreeabl

In [None]:
print(negative)



In [None]:
#sentiment analysis on movie reviews using Spark
sentiment_df = []
for i in files:
    textR = sc.textFile('output/moviereviews/' + i).map(lambda x :  re.sub('[^a-z0-9 ]', "", x.lower().strip()))
    positive_rdd = textR.flatMap(lambda x: x.split())
    negative_rdd = textR.flatMap(lambda x: x.split())
    positive_word_count = positive_rdd.filter(lambda x: x in positive).count()
    negative_word_count = negative_rdd.filter(lambda x: x in negative).count()
    if positive_word_count > negative_word_count:
      sentiment = 'positive'
    elif negative_word_count > positive_word_count:
      sentiment = 'negative'
    else:
      sentiment = 'neutral'
    sentiment_df.append((i, positive_word_count, negative_word_count, sentiment))

In [21]:
print(sentiment_df)

[('cv231_10425.txt', 49, 27, 'positive'), ('cv473_7367.txt', 9, 10, 'negative'), ('cv337_29181.txt', 23, 29, 'negative'), ('cv550_22211.txt', 37, 27, 'positive'), ('cv088_25274.txt', 15, 10, 'positive'), ('cv599_20988.txt', 30, 35, 'negative'), ('cv961_5578.txt', 10, 11, 'negative'), ('cv109_22599.txt', 15, 20, 'negative'), ('cv347_14722.txt', 7, 14, 'negative'), ('cv081_18241.txt', 9, 15, 'negative'), ('cv254_5870.txt', 19, 24, 'negative'), ('cv097_24970.txt', 25, 28, 'negative'), ('cv128_29627.txt', 19, 9, 'positive'), ('cv724_15265.txt', 19, 25, 'negative'), ('cv115_25396.txt', 57, 45, 'positive'), ('cv626_7907.txt', 30, 30, 'neutral'), ('cv508_16006.txt', 68, 50, 'positive'), ('cv565_29403.txt', 37, 12, 'positive'), ('cv784_16077.txt', 31, 28, 'positive'), ('cv407_22637.txt', 28, 28, 'neutral'), ('cv974_24303.txt', 13, 17, 'negative'), ('cv835_20531.txt', 19, 34, 'negative'), ('cv672_27988.txt', 22, 16, 'positive'), ('cv959_14611.txt', 24, 16, 'positive'), ('cv133_18065.txt', 9, 14

In [24]:
##defining schema to show required output
schema = StructType([
    StructField('Filename', StringType()),
    StructField('Positive score', IntegerType()),
    StructField('Negative score', IntegerType()),
    StructField('Final sentiment', StringType())
])

In [25]:
## generating output table
movie_review = spark.createDataFrame(sentiment_df, schema)
movie_review.show(movie_review.count(), False)

+---------------+--------------+--------------+---------------+
|Filename       |Positive score|Negative score|Final sentiment|
+---------------+--------------+--------------+---------------+
|cv231_10425.txt|49            |27            |positive       |
|cv473_7367.txt |9             |10            |negative       |
|cv337_29181.txt|23            |29            |negative       |
|cv550_22211.txt|37            |27            |positive       |
|cv088_25274.txt|15            |10            |positive       |
|cv599_20988.txt|30            |35            |negative       |
|cv961_5578.txt |10            |11            |negative       |
|cv109_22599.txt|15            |20            |negative       |
|cv347_14722.txt|7             |14            |negative       |
|cv081_18241.txt|9             |15            |negative       |
|cv254_5870.txt |19            |24            |negative       |
|cv097_24970.txt|25            |28            |negative       |
|cv128_29627.txt|19            |9       