In [None]:
# Initialization: point this kernel at the local Spark install before pyspark is imported
import os
import sys

spark_home = "/home/talentum/spark"
pylib = spark_home + "/python/lib"

os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": pylib,
    # For Python 2, use /usr/bin/python2.7 in the two interpreter entries below
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Every insert goes to position 0, so pyspark.zip ends up ahead of the py4j archive
for archive in ("/py4j-0.10.7-src.zip", "/pyspark.zip"):
    sys.path.insert(0, os.environ["PYLIB"] + archive)

# NOTE: List whichever extra packages you need in PYSPARK_SUBMIT_ARGS (alternatives kept below).
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [None]:
# Spark 2.x entry point: one Hive-enabled SparkSession plus its SparkContext
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("Spark SQL basic example")
spark = session_builder.enableHiveSupport().getOrCreate()

# To run on YARN instead, add .master("yarn") to the builder chain:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

sc = spark.sparkContext

In [None]:
import re
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

file_path = "file:///home/talentum/project/train_trim.txt"

# Two nullable string columns: the label token and the remaining review text
unstSchema = StructType(
    [StructField(name, StringType(), True) for name in ('Label', 'Review')]
)

baseRDD = sc.textFile(file_path)


def _to_label_review(line):
    """Turn one raw line into [Row(label, review)], or [] when the line is unusable.

    Blank lines and lines that contain no space after stripping (and therefore
    cannot be split into exactly two parts) are discarded.
    """
    text = line.strip()
    if text == "":
        return []
    parts = re.split(r" ", text, maxsplit=1)
    if len(parts) != 2:
        return []
    label, review = parts
    return [Row(label.strip(), review.strip())]


# flatMap drops the unusable lines because the parser returns an empty list for them
rdd = baseRDD.flatMap(_to_label_review)

df1 = spark.createDataFrame(rdd, schema=unstSchema)

df1.printSchema()
df1.show(8, truncate=False)



root
 |-- Label: string (nullable = true)
 |-- Review: string (nullable = true)

+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Label     |Review                                                              

In [None]:
# English stop words to strip from the reviews, as a plain Python list of 125 lower-case
# words. The order matches the original one-word-per-line listing.
stop_words = (
    "i me my myself we our ours ourselves you your yours yourself yourselves "
    "he him his himself she her hers herself it its itself "
    "they them their theirs themselves what which who whom this that these those "
    "am is are was were be been being have has had having do does did doing "
    "a an the and but if or because as until while of at by for with "
    "about against between into through during before after above below "
    "to from up down in out on off over under again further then once "
    "here there when where why how all any both each few more most other some such "
    "no nor not only own same so than too very can will just don should now"
).split()

In [None]:
from pyspark.sql.functions import (
    col, lower, split, array_except, concat_ws, array, lit, expr
)

# Convert the Python list into a Spark array column of literals
stop_words_col = array(*[lit(w) for w in stop_words])

# Remove stop words from the lower-cased review.
# array_except() is not used here because it returns its result *without duplicates*:
# a review such as "good good product" would collapse to "good product" and distort
# the term frequencies computed later. The SQL higher-order function filter()
# (Spark 2.4+) keeps every remaining token, in its original order.
df_clean = (
    df1
    .withColumn("review_tokens", split(lower(col("Review")), " "))
    .withColumn("stop_words_arr", stop_words_col)
    .withColumn(
        "Review",
        concat_ws(
            " ",
            expr("filter(review_tokens, w -> NOT array_contains(stop_words_arr, w))")
        )
    )
    # helper columns are only needed while the expression above is evaluated
    .drop("review_tokens", "stop_words_arr")
)

df_clean.show(8, truncate=False)



+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Label     |Review                                                                                                                                                                                                                                                                                                                                                                                                                            

In [None]:
from pyspark.sql.functions import col, regexp_replace, trim

# 1. Keep only lower-case letters and whitespace (digits and punctuation are removed).
letters_only = regexp_replace(col("Review"), r"[^a-z\s]", "")

# 2. Deleting characters leaves runs of spaces and leading/trailing blanks behind
#    (e.g. a token that was pure punctuation). Collapse them so that later
#    whitespace-based tokenization does not produce empty tokens.
df_new = df_clean.withColumn(
    "Review",
    trim(regexp_replace(letters_only, r"\s+", " "))
)
df_new.show(8, truncate=False)

+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Label     |Review                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [None]:
# from pyspark.sql.functions import trim
# df_trim = df_new.withColumn("Review", trim(col("Review")))
# df_trim.show(1)

In [None]:
from pyspark.sql.functions import col, when

# Map the text label to a numeric rating:
#   __label__2 -> 4, __label__1 -> 2, anything else -> 3
label_col = col("Label")
rating_expr = (
    when(label_col == "__label__2", 4)
    .when(label_col == "__label__1", 2)
    .otherwise(3)
)

df_transform = df_new.withColumn("Rating", rating_expr)

df_transform.show(8, truncate=False)



+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|Label     |Review                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [None]:
# The label is now encoded in Rating, so the Label column is dropped
df_processed_txt = df_transform.drop("Label")
df_processed_txt.show(8)

+--------------------+------+
|              Review|Rating|
+--------------------+------+
|stuning even nong...|     4|
|best soundtrack e...|     4|
|amazing soundtrac...|     4|
|excellent soundtr...|     4|
|remember pull jaw...|     4|
|absolute masterpi...|     4|
|buyer beware self...|     2|
|glorious story lo...|     4|
+--------------------+------+
only showing top 8 rows



In [None]:
# Structured reviews: CSV with a header row; column types are inferred by Spark
df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///home/talentum/project/structured.csv")
)
df_csv.show(8)

+----+--------------------+
|Rate|             Summary|
+----+--------------------+
|   5|great cooler exce...|
|   5|best budget 2 fit...|
|   3|the quality is go...|
|   1|very bad product ...|
|   3|       ok ok product|
|   5|the cooler is rea...|
|   5|   very good product|
|   3|           very nice|
+----+--------------------+
only showing top 8 rows



In [None]:

# Align the CSV column names with the text-derived DataFrame
df_rename = (
    df_csv
    .withColumnRenamed("Rate", "Rating")
    .withColumnRenamed("Summary", "Review")
)

# df_swap is the list of column names with the first two positions exchanged
renamed_columns = df_rename.columns
df_swap = [renamed_columns[1], renamed_columns[0]] + renamed_columns[2:]

df_neww = df_rename.select(df_swap)
df_neww.show(5)




+--------------------+------+
|              Review|Rating|
+--------------------+------+
|great cooler exce...|     5|
|best budget 2 fit...|     5|
|the quality is go...|     3|
|very bad product ...|     1|
|       ok ok product|     3|
+--------------------+------+
only showing top 5 rows



In [None]:


from pyspark.sql.functions import col, trim, regexp_replace

# Strip leading whitespace / non-breaking spaces first, then trim ordinary spaces at both ends
leading_blank_pattern = r"^[\s\u00A0]+"
review_without_leading_blanks = regexp_replace(col("Review"), leading_blank_pattern, "")

df_trim = df_neww.withColumn("Review", trim(review_without_leading_blanks))

df_trim.show(5, truncate=False)


+----------------------------------------------------------------------------------------------+------+
|Review                                                                                        |Rating|
+----------------------------------------------------------------------------------------------+------+
|great cooler excellent air flow and for this price its so amazing and unbelievablejust love it|5     |
|best budget 2 fit cooler nice cooling                                                         |5     |
|the quality is good but the power of air is decent                                            |3     |
|very bad product its a only a fan                                                             |1     |
|ok ok product                                                                                 |3     |
+----------------------------------------------------------------------------------------------+------+
only showing top 5 rows



In [None]:
# Stack the text-derived reviews on top of the CSV reviews, matching columns by name
df_merged = df_processed_txt.unionByName(df_trim)
df_merged.count()

405052

# Train/test split

In [None]:
# 80% training / 20% testing, with a fixed seed for the split
split_weights = [0.8, 0.2]
train_data, test_data = df_merged.randomSplit(split_weights, seed=42)



In [None]:
# Count both splits first, then report them
train_rows = train_data.count()
test_rows = test_data.count()
print("Training Count: " + str(train_rows))
print("Testing Count: " + str(test_rows))

Training Count: 324344
Testing Count: 80708




```
# This is formatted as code
```

# Logistic regression

In [None]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [None]:
# Stage 1: split each raw Review sentence into individual word tokens ("words" column)
tokenizer = Tokenizer(
    inputCol="Review",
    outputCol="words",
)

In [None]:
# Stage 2: turn the word tokens into numeric term-frequency vectors of size 10000
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=10000,
)

In [None]:
# Stage 3: re-weight the raw counts so that very common words count less
# and distinctive words count more
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [None]:
# Stage 4: the classifier that learns to predict Rating from the weighted features
lr = LogisticRegression(
    featuresCol="features",
    labelCol="Rating",
    maxIter=10,
)

In [None]:
from pyspark.sql.types import IntegerType

# Step 1: replace null reviews with an empty string
train_reviews_filled = train_data.na.fill("", subset=["Review"])

# Step 2: Rating must be numeric to serve as the label column, so cast it to integer
train_data = train_reviews_filled.withColumn(
    "Rating",
    train_reviews_filled["Rating"].cast(IntegerType())
)


In [None]:
# Discard training rows that have no Rating value
train_data = train_data.dropna(subset=["Rating"])



In [None]:
# Number of training rows remaining after the rows with a null Rating were dropped
train_data.count()

324342

In [None]:
# Preview the first 5 training rows (Review text and integer Rating)
train_data.show(5)

+--------------------+------+
|              Review|Rating|
+--------------------+------+
|   stars great hi...|     4|
|  brutality well ...|     2|
|  buying first  s...|     4|
|  cares movie big...|     2|
|  clips  mounting...|     2|
+--------------------+------+
only showing top 5 rows



In [None]:
# Connect the stages in execution order:
# Tokenizer -> HashingTF -> IDF -> LogisticRegression.
# Only the pipeline is assembled here. It is fitted exactly once, in the
# "Fit the pipeline" step that follows, so the training pass is not run twice.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])

In [None]:
# 2. Fit the pipeline
# Trains all pipeline stages on train_data; `model` is the fitted result that
# later cells call .transform() on to obtain predictions.
model = pipeline.fit(train_data)

# test data

Checking for null values

In [None]:
# Import the functions module under an alias instead of `from ... import sum`:
# importing `sum` by name would replace Python's builtin sum() for every later cell.
from pyspark.sql import functions as F

# One aggregate per column: isNull() cast to 0/1 and summed gives that column's null count
null_counts = [
    F.sum(F.col(c).isNull().cast("int")).alias(c)
    for c in test_data.columns
]
test_data.select(null_counts).show()

+------+------+
|Review|Rating|
+------+------+
|     3|     0|
+------+------+



In [None]:
# Drop the 3 rows where Review is null.
# The null check above was run on test_data, so test_data is the frame to clean.
# (train_data already had its null reviews filled before the model was fitted.)
test_data = test_data.na.drop(subset=["Review"])

In [None]:
# Inspect the column types of the test split before it is passed to the model
test_data.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Rating: string (nullable = true)



In [None]:
# Clean test_data in three steps:
#   1. remove rows with a null Review
#   2. cast Rating to integer
#   3. remove rows whose Rating is null after the cast
test_with_reviews = test_data.na.drop(subset=["Review"])
test_rating_as_int = test_with_reviews.withColumn(
    "Rating",
    test_with_reviews["Rating"].cast(IntegerType())
)
test_data = test_rating_as_int.na.drop(subset=["Rating"])

In [None]:
# Score the held-out rows with the fitted pipeline model
predictions = model.transform(test_data)

# Columns displayed:
#   Review     - the review text
#   Rating     - the correct answer
#   prediction - the model's guess
columns_to_show = ["Review", "Rating", "prediction"]
predictions.select(*columns_to_show).show(10)

+--------------------+------+----------+
|              Review|Rating|prediction|
+--------------------+------+----------+
|   unrated got mi...|     2|       4.0|
|   well started r...|     2|       2.0|
|   youre able sol...|     2|       2.0|
|  dont really kno...|     2|       2.0|
|  fascinating ter...|     4|       4.0|
|  stars picks ori...|     4|       2.0|
| absolutely excel...|     4|       4.0|
| actually cd star...|     2|       2.0|
| again book beati...|     4|       4.0|
| amount people kn...|     2|       4.0|
+--------------------+------+----------+
only showing top 10 rows



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the true Rating with the model's prediction and report plain accuracy
evaluator_params = {
    "labelCol": "Rating",
    "predictionCol": "prediction",
    "metricName": "accuracy",
}
evaluator = MulticlassClassificationEvaluator(**evaluator_params)

accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 74.66%


In [None]:
import re

from pyspark.sql import Row

# 1. Type your custom sentence here
my_review_text = "This product okay"

# 2. Apply the same cleaning the training reviews went through, so the model sees
#    text in the form it was trained on: lower-case, stop words removed,
#    then everything except letters and whitespace stripped.
kept_words = [w for w in my_review_text.lower().split(" ") if w not in stop_words]
my_review_clean = re.sub(r"[^a-z\s]", "", " ".join(kept_words))

# 3. Convert it into a Spark DataFrame (matching the training data format).
#    Rating is a dummy 0 because the true answer is not known.
#    Original keeps the sentence as typed, for display only.
my_data = spark.createDataFrame([
    Row(Original=my_review_text, Review=my_review_clean, Rating=0)
])

# 4. Ask the model to predict
result = model.transform(my_data)

# 5. Show the result
# 'prediction' is the model's guess (e.g., 1.0, 2.0, 5.0)
result.select("Original", "Review", "prediction").show(truncate=False)

+-----------------+----------+
|Review           |prediction|
+-----------------+----------+
|This product okay|5.0       |
+-----------------+----------+

