In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Start (or reuse) a Spark session named "SteamReviewLR" on the "local" master
spark = (
    SparkSession.builder
    .appName("SteamReviewLR")
    .master("local")
    .getOrCreate()
)

In [3]:
# Location of the reviews CSV. The STEAM_REVIEWS_CSV environment variable
# overrides the path so the notebook can run on another machine; when it is
# unset, the original local path is used unchanged.
dataset_path = os.environ.get(
    'STEAM_REVIEWS_CSV',
    'C:/Users/jensd/OneDrive/Documenten/Jens/Advanced Analytics/Assignment 3/dataset.csv',
)

# Load the CSV into a Spark DataFrame: the first row supplies the column names
# and the column types are inferred from the data.
df = spark.read.csv(dataset_path, header=True, inferSchema=True)

In [4]:
# Cap the DataFrame at 100000 rows so later steps work on a smaller subset
max_rows = 100000
df = df.limit(max_rows)

In [5]:
# Discard every row that has a null in any column
df = df.dropna()

In [6]:
# Substitute empty strings for any nulls left in string columns.
# NOTE(review): the preceding cell drops every row containing a null, so this
# step likely has nothing left to fill — confirm whether it is still needed.
df = df.fillna('')

In [7]:
# Keep only the review text and its score, exposing the score as 'label'
data = df.select('review_text', df['review_score'].alias('label'))

In [8]:
# Row-level mapping used by the UDF: 1 stays 1, every other value becomes 0
def _binarize_label(label):
    """Return 1 if ``label`` equals 1, otherwise 0."""
    if label == 1:
        return 1
    return 0


# Wrap the mapping as a Spark UDF whose declared return type is IntegerType
label_to_int = udf(_binarize_label, IntegerType())

In [9]:
# Add 'label_numeric' by running the label_to_int UDF over the 'label' column
data = data.withColumn('label_numeric', label_to_int(data['label']))

In [10]:
# Text-processing stages, chained by column name:
#   review_text -> tokens -> filtered_tokens -> features

# Split each review into tokens
tokenizer = Tokenizer(
    inputCol='review_text',
    outputCol='tokens',
)

# Drop stop words from the token lists
stopwords_remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered_tokens',
)

# Turn the remaining tokens into count-based feature vectors
count_vectorizer = CountVectorizer(
    inputCol='filtered_tokens',
    outputCol='features',
)

In [11]:
# Classifier reading 'features' and learning from the 'label_numeric' column
lr = LogisticRegression(labelCol='label_numeric', featuresCol='features')

# Full pipeline: tokenize -> remove stop words -> vectorize -> classify
pipeline = Pipeline(stages=[
    tokenizer,
    stopwords_remover,
    count_vectorizer,
    lr,
])

# Randomly split the rows with 0.8 / 0.2 weights; the fixed seed keeps the
# split repeatable
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [12]:
# Fit every pipeline stage on the training split
model = pipeline.fit(train_data)

# Run the fitted pipeline over the held-out split
predictions = model.transform(test_data)

# Evaluator that compares predictions against the 'label_numeric' column
evaluator = MulticlassClassificationEvaluator(labelCol="label_numeric")

# Ask for accuracy on this call through a param override, then report it
metric_override = {evaluator.metricName: "accuracy"}
accuracy = evaluator.evaluate(predictions, metric_override)
print("Accuracy:", accuracy)

Accuracy: 0.8507835113284758


In [13]:
# Persist the fitted pipeline model to the 'LRmodel' directory.
# A plain save() raises when the target path already exists, which breaks a
# second run of the notebook; overwrite() replaces the previous export instead.
model.write().overwrite().save('LRmodel')

In [14]:
# Stop the Spark session; this is the last step of the notebook
spark.stop()