In [1]:
import findspark
findspark.init('/home/gerardo-rodriguez/spark-4.0.0-bin-hadoop3')

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp').config('spark.executor.memory', '4g').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/27 10:54:57 WARN Utils: Your hostname, Lanz-Lenovo, resolves to a loopback address: 127.0.1.1; using 192.168.1.145 instead (on interface wlp2s0)
25/08/27 10:54:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/27 10:54:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.csv('IMDB_Dataset.csv', header=True, inferSchema=True, quote='"', escape='"')

                                                                                

In [4]:
df.printSchema()

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)



In [5]:
df.show(4)

+--------------------+---------+
|              review|sentiment|
+--------------------+---------+
|One of the other ...| positive|
|A wonderful littl...| positive|
|I thought this wa...| positive|
|Basically there's...| negative|
+--------------------+---------+
only showing top 4 rows


In [6]:
from pyspark.sql.functions import length, col, mean

In [7]:
df = df.withColumn('len_review', length(col('review')))

In [8]:
df.groupBy('sentiment').mean().show()



+---------+---------------+
|sentiment|avg(len_review)|
+---------+---------------+
| positive|     1324.79768|
| negative|     1294.06436|
+---------+---------------+



                                                                                

## token, reg_roke, stop_remove, count_vec, idf, Indexer, Assembler

In [9]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StringIndexer, StopWordsRemover, CountVectorizer, IDF
from pyspark.sql.functions import col, udf

In [10]:
reg_token = RegexTokenizer(inputCol='review', outputCol='words', pattern='\\W+')
stop_remove = StopWordsRemover(inputCol='words', outputCol='stop_token')
count_vec = CountVectorizer(inputCol='stop_token', outputCol='count_token')
idf = IDF(inputCol='count_token', outputCol='tk_idf')
pos_neg = StringIndexer(inputCol='sentiment', outputCol='sentiment_indexed')

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
clean = VectorAssembler(inputCols=['tk_idf', 'len_review'], outputCol='features')

# Use Classification Model

In [13]:
from pyspark.ml.classification import LogisticRegression

In [14]:
log_reg = LogisticRegression(featuresCol='features' ,labelCol='sentiment_indexed')

In [15]:
from pyspark.ml import Pipeline

In [17]:
pipeline = Pipeline(stages=[reg_token, stop_remove, count_vec, idf, pos_neg, clean, log_reg])

In [18]:
train_data, test_data = df.randomSplit([0.7, 0.3])

In [None]:
model = pipeline.fit(train_data)

In [25]:
prediction = model.transform(test_data)
prediction.select('sentiment_indexed', 'prediction').show(10)

25/08/27 10:59:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


+-----------------+----------+
|sentiment_indexed|prediction|
+-----------------+----------+
|              1.0|       0.0|
|              1.0|       1.0|
|              0.0|       0.0|
|              0.0|       0.0|
|              0.0|       0.0|
|              0.0|       0.0|
|              1.0|       1.0|
|              0.0|       0.0|
|              0.0|       0.0|
|              1.0|       1.0|
+-----------------+----------+
only showing top 10 rows


In [23]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [34]:
eva = BinaryClassificationEvaluator(labelCol='sentiment_indexed')

## NaiveBayes

In [27]:
from pyspark.ml.classification import NaiveBayes

In [28]:
nb = NaiveBayes(featuresCol='features', labelCol='sentiment_indexed', modelType='multinomial')

In [29]:
pipeline_nb = Pipeline(stages=[reg_token, stop_remove, count_vec, idf, pos_neg, clean, nb])

In [30]:
model_nb = pipeline_nb.fit(train_data)

25/08/27 11:04:11 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
25/08/27 11:04:15 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
                                                                                

In [33]:
prediction_nb = model_nb.transform(test_data)
prediction_nb.select('sentiment_indexed', 'prediction').show(10)

25/08/27 11:05:16 WARN DAGScheduler: Broadcasting large task binary with size 7.4 MiB


+-----------------+----------+
|sentiment_indexed|prediction|
+-----------------+----------+
|              1.0|       1.0|
|              1.0|       0.0|
|              0.0|       0.0|
|              0.0|       0.0|
|              0.0|       0.0|
|              0.0|       0.0|
|              1.0|       1.0|
|              0.0|       0.0|
|              0.0|       0.0|
|              1.0|       0.0|
+-----------------+----------+
only showing top 10 rows


                                                                                

## What is better?

In [35]:
print(f'Precision of Logistic Regression: {eva.evaluate(prediction)}')
print(f'Precision of Naive Bayes: {eva.evaluate(prediction)}')

25/08/27 11:06:37 WARN DAGScheduler: Broadcasting large task binary with size 6.8 MiB
                                                                                

Precision of Logistic Regression: 0.9248127071645409


25/08/27 11:06:41 WARN DAGScheduler: Broadcasting large task binary with size 6.8 MiB
                                                                                

Precision of Naive Bayes: 0.9248081839603246
