# **Step 1: Initialize Spark Session**
First, set up your Spark environment to handle large-scale data processing:

In [1]:
pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session that uses every available core.
# NOTE(review): with a local[*] master the executor-level settings below may have no
# effect, since work runs inside the driver JVM — confirm against the Spark docs.
spark = (
    SparkSession.builder
    .appName("Amazon Review Analysis")
    .master("local[*]")
    # JVM heap sizes
    .config("spark.executor.memory", "20g")
    .config("spark.driver.memory", "20g")
    # Off-heap storage
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "20g")
    # Default partition counts for RDD operations and SQL shuffles
    .config("spark.default.parallelism", "100")
    .config("spark.sql.shuffle.partitions", "200")
    # Garbage collector selection
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .getOrCreate()
)

# Step 2: Load the data
Read the dataset from a file, which could be stored on your Google Drive:

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path (Colab only).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Optimizing Load Time
To potentially reduce load time and improve performance:

Avoid Schema Inference: Specify the schema explicitly instead of using inferSchema=True. This avoids the need for Spark to read through the dataset to infer the schema.

In [32]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, LongType
def _common_review_fields():
    """Build the StructFields shared by the unlabelled and labelled review schemas.

    A fresh list is returned on every call so the two schemas never share
    StructField instances.
    """
    string_columns = (
        "customer_id",
        "product_id",
        "product_parent",
        "product_title",
        "product_category",
    )
    fields = [StructField(column, StringType(), True) for column in string_columns]
    fields.append(StructField("star_rating", LongType(), True))
    fields.append(StructField("full_text", StringType(), True))
    return fields


# Schema of the raw reviews (no label column).
schema = StructType(_common_review_fields())

# Schema of the training data: the raw columns plus the long-typed `sentiment` label.
training_schema = StructType(
    _common_review_fields() + [StructField("sentiment", LongType(), True)]
)

In [33]:
# Location of the labelled reviews on the mounted Drive
data_path = '/content/drive/MyDrive/Big Data Project/sentiment.parquet'

# Read the Parquet file with the explicit training schema
df = (
    spark.read
    .option("treatEmptyValuesAsNulls", "true")
    .schema(training_schema)
    .parquet(data_path)
)


In [34]:
from psutil import virtual_memory
# Total memory reported by psutil, expressed in decimal gigabytes
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes with at least 20 GB are reported as high-RAM
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


# Step 3: TF-IDF

In [35]:
# Redistribute the rows across 2000 partitions before feature extraction.
# NOTE(review): 2000 partitions looks high for a dataset of this size on a local master — confirm it is intentional.
df = df.repartition(2000)
# count() is an action: it triggers execution and displays the number of rows.
df.count()

120000

In [36]:
# `col` is imported in this cell because it is first used here; relying on the
# import in a later cell makes a fresh Restart & Run All fail with NameError.
from pyspark.sql.functions import col

# Print the schema as loaded; star_rating is still a long at this point.
df.printSchema()

# Re-type the rating column from long to string.
df = df.withColumn("star_rating", col("star_rating").cast("string"))

root
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: long (nullable = true)
 |-- full_text: string (nullable = true)
 |-- sentiment: long (nullable = true)



In [37]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.functions import col

# Build TF-IDF features from `full_text`.
# Each stage writes to its own intermediate name and `df` is reassigned only once,
# at the end, so a failure part-way through does not leave `df` half-transformed.

# Split the text on non-word characters.
# Named `regex_tokenizer` so it does not share a global name with the BERT
# tokenizer that a later cell binds to `tokenizer`.
regex_tokenizer = RegexTokenizer(inputCol="full_text", outputCol="words", pattern="\\W")
tokenized_df = regex_tokenizer.transform(df)

# Drop stop words from the token list
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filtered_df = remover.transform(tokenized_df)

# Hash the remaining tokens into a 5000-dimensional term-frequency vector
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=5000)
featurizedData = hashingTF.transform(filtered_df)
featurizedData.cache()
# NOTE(review): 20000 partitions looks very high for this dataset on a local master,
# and the cache above applies to the DataFrame before this repartition — confirm both are intended.
featurizedData = featurizedData.repartition(20000)

# Fit inverse-document-frequency weights and rescale the term frequencies
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
df = idfModel.transform(featurizedData)


# Step 5: Assign Model to Each Language
When applying BERT-based NLP to text in several languages, a model trained specifically for each language usually performs better than a generic one. For instance:

English: Utilize BERT for general NLP tasks including sentiment analysis.
Spanish: Use BETO, a BERT model trained on Spanish text.
French: CamemBERT is ideal, being specifically trained on French text.
German: German BERT caters to tasks involving German language.
Chinese: BERT-base Chinese is pre-trained on Chinese text.
Arabic: AraBERT is designed for Arabic text.
Russian: RuBERT is suitable for Russian language tasks.
Portuguese: BERTimbau focuses on Brazilian Portuguese.
Dutch: RobBERT, a RoBERTa-based model, is optimized for Dutch.
Italian: AlBERTo is trained on Italian social media text.


In [40]:
from transformers import BertTokenizer, BertModel
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
import torch

# Hugging Face checkpoint used for both the tokenizer and the encoder weights
BERT_CHECKPOINT = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def create_bert_embeddings(text):
    """Encode one text with the global BERT `tokenizer`/`model` and return a Spark dense vector.

    The text is tokenized with truncation enabled, passed through the model with
    gradient tracking disabled, and the hidden state at sequence position 0 is
    returned as a flat `Vectors.dense`.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state
    # Keep only the first token's hidden state and flatten it to a plain list
    first_token_state = hidden_states[:, 0, :]
    return Vectors.dense(first_token_state.squeeze().tolist())

# Wrap create_bert_embeddings as a Spark UDF whose declared return type is VectorUDT.
# NOTE(review): the function reads the global `tokenizer` and `model`, so they are
# presumably serialized along with the UDF — confirm this is acceptable for a model this size.
udf_bert_embeddings = udf(create_bert_embeddings, VectorUDT())

# Add a `bertFeatures` column computed from `full_text`, one embedding per row.
df = df.withColumn("bertFeatures", udf_bert_embeddings(col("full_text")))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [41]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors
import numpy as np

def to_vector(col):
    """Convert `col` to a Spark dense vector.

    A numpy array is flattened to one dimension first; any other value is
    handed to `Vectors.dense` unchanged.
    """
    values = col.flatten() if isinstance(col, np.ndarray) else col
    return Vectors.dense(values)

# Wrap to_vector as a Spark UDF whose declared return type is VectorUDT.
to_vector_udf = udf(to_vector, VectorUDT())

# Pass the `bertFeatures` column through to_vector and store the result as `bert_vector`.
# NOTE(review): `bertFeatures` is already produced by a VectorUDT-returning UDF, so this
# conversion may be redundant — confirm before removing.
df = df.withColumn("bert_vector", to_vector_udf("bertFeatures"))


In [42]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Concatenate the TF-IDF vector and the BERT vector into a single feature column
feature_columns = ["features", "bert_vector"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="combinedFeatures")
final_df = assembler.transform(df)

In [44]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression over the assembled feature vector, with `sentiment` as the label
lr = LogisticRegression(featuresCol='combinedFeatures', labelCol='sentiment', maxIter=10, regParam=0.01)

# Train on the full assembled DataFrame
lr_model = lr.fit(final_df)

# Score the same DataFrame the model was trained on
# NOTE(review): there is no held-out split here, so these predictions do not estimate generalisation
predictions = lr_model.transform(final_df)

# Display the first rows of text alongside the predicted class
predictions.select("full_text", "prediction").show()

+--------------------+----------+
|           full_text|prediction|
+--------------------+----------+
|Elizabeth Berg co...|       3.0|
|I was very please...|       4.0|
|I prefer an over-...|       3.0|
|Chris Elliot join...|       3.0|
|Five Stars. Was j...|       4.0|
|Works as expected...|       3.0|
|I like the produc...|       4.0|
|Stopped working w...|       0.0|
|American Psycho.....|       0.0|
|NOT a True Soundt...|       3.0|
|Racecar dads rejo...|       3.0|
|great camera, gre...|       2.0|
|zipper is awesome...|       4.0|
|A very impressive...|       4.0|
|Delicious perfect...|       4.0|
|Metal broke after...|       3.0|
|Almost everything...|       2.0|
|boxes need to be ...|       1.0|
|Just started taki...|       4.0|
|Four Stars. Like ...|       3.0|
+--------------------+----------+
only showing top 20 rows

