In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext("local")
spark = SparkSession.builder.getOrCreate()

In [72]:
import re
import sys
import numpy as np
from pyspark.sql import functions as F
tFile="data\IMDB Dataset.csv.bz2"
df0 = spark.read.csv(tFile,header=True)
df0.show(3)

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|One of the other ...| positive|
|A wonderful littl...| positive|
|I thought this wa...| positive|
+--------------------+---------+
only showing top 3 rows



In [73]:
# Convert sentiment to numbers positive =1, negative =0
df0 = df0.withColumn("label", F.when(F.col("sentiment")=="positive",1).otherwise(0)).cache()

In [74]:
# Remove html tags from text
df0 = df0.withColumn("text_c", F.regexp_replace(F.col("text"), r'<[^>]+>', ""));

# Data 

In [75]:
#Sample the data for faster model training (use the full dataset in reality)
df0 = df0.sample(0.25, seed=200)
# Split the data in train and test (80%-20%)
df, test = df0.randomSplit(weights=[0.8,0.2], seed=200)

In [76]:
df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 4984|
|    0| 5070|
+-----+-----+



In [77]:
test.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 1239|
|    0| 1280|
+-----+-----+



In [78]:
# Create a weight of each class
from pyspark.sql import functions as F
p_weight = df.filter('label == 1').count()/ df.count()
n_weight = df.filter('label == 0').count()/ df.count()
print(n_weight, p_weight)

0.5042769047145415 0.49572309528545855


In [79]:
df = df.withColumn("weight", F.when(F.col("label")==1,n_weight).otherwise(p_weight))
df.show(5)

+--------------------+---------+-----+--------------------+-------------------+
|                text|sentiment|label|              text_c|             weight|
+--------------------+---------+-----+--------------------+-------------------+
| Domino  has been...| positive|    1| Domino  has been...| 0.5042769047145415|
| It had to be You...| negative|    0| It had to be You...|0.49572309528545855|
| Så som i himmele...| positive|    1| Så som i himmele...| 0.5042769047145415|
| While sporadical...| negative|    0| While sporadical...|0.49572309528545855|
|!!!!! POSSIBLE SP...| negative|    0|!!!!! POSSIBLE SP...|0.49572309528545855|
+--------------------+---------+-----+--------------------+-------------------+
only showing top 5 rows



In [80]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import PCA
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml import Pipeline

# Data transformation

In [55]:
# Preprocessing pipeline
tokenizer = Tokenizer(inputCol="text_c", outputCol="words",)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
# hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="rawFeatures",numFeatues=5000)
countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="rawFeatures", vocabSize=500)
idf = IDF(inputCol=countVectorizer.getOutputCol(), outputCol="featuresIDF")
pipeline_p = Pipeline(stages=[tokenizer,remover, countVectorizer, idf])

In [56]:
# Train the model
data_model = pipeline_p.fit(df)

In [57]:

transformed_data = data_model.transform(df)
transformed_data.show(5)

+--------------------+---------+-----+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|                text|sentiment|label|              text_c|             weight|               words|            filtered|         rawFeatures|         featuresIDF|
+--------------------+---------+-----+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
| Domino  has been...| positive|    1| Domino  has been...| 0.5042769047145415|[, domino, , has,...|[, domino, , wide...|(500,[0,3,6,11,12...|(500,[0,3,6,11,12...|
| It had to be You...| negative|    0| It had to be You...|0.49572309528545855|[, it, had, to, b...|[, another, sign,...|(500,[0,6,12,19,2...|(500,[0,6,12,19,2...|
| Så som i himmele...| positive|    1| Så som i himmele...| 0.5042769047145415|[, så, som, i, hi...|[, så, som, himme...|(500,[1,7,8,11,12...|(500,[1,7,8,11,12...|
| While sporadic

In [58]:
# Transfomr the test data
transformed_test = data_model.transform(test)
transformed_test.show(5)

+--------------------+---------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|sentiment|label|              text_c|               words|            filtered|         rawFeatures|         featuresIDF|
+--------------------+---------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|!!! Spoiler alert...| negative|    0|!!! Spoiler alert...|[!!!, spoiler, al...|[!!!, spoiler, al...|(500,[1,2,3,5,7,9...|(500,[1,2,3,5,7,9...|
|'Loulou' delights...| positive|    1|'Loulou' delights...|['loulou', deligh...|['loulou', deligh...|(500,[1,2,12,14,2...|(500,[1,2,12,14,2...|
|'Presque rien' is...| positive|    1|'Presque rien' is...|['presque, rien',...|['presque, rien',...|(500,[0,1,5,11,17...|(500,[0,1,5,11,17...|
|'The Luzhin Defen...| positive|    1|'The Luzhin Defen...|['the, luzhin, de...|['the, luzhin, de...|(500,[1,4,10,16,3...|(500,[1,4,10,1

In [59]:
# Print the sages of the pipeline
data_model.stages

[Tokenizer_ccf7c0193f78,
 StopWordsRemover_4d8575b368a4,
 CountVectorizerModel: uid=CountVectorizer_3f04897c75fb, vocabularySize=500,
 IDFModel: uid=IDF_8b57b0f40bb4, numDocs=10054, numFeatures=500]

In [60]:
# Get the vocabulary of the CountVectroizer
data_model.stages[2].vocabulary[:20]

['movie',
 'film',
 'one',
 'like',
 'good',
 'even',
 'really',
 'see',
 '-',
 'get',
 'much',
 'story',
 'also',
 'time',
 'make',
 'first',
 'great',
 'people',
 'bad',
 'made']

# Metics for the model

In [61]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MultilabelMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

def m_metrics(ml_model,test_data):
    predictions = ml_model.transform(test_data).cache()
    predictionAndLabels = predictions.select("label","prediction").rdd.map(lambda x: (float(x[0]), float(x[1]))).cache()
    
    # Print some predictions vs labels
    print(predictionAndLabels.take(10))
    metrics = MulticlassMetrics(predictionAndLabels)
    
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print(f"Precision = {precision:.3f} Recall = {recall:.3f} F1 Score = {f1Score:.3f}")

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    print("Confusion matrix \n", metrics.confusionMatrix().toArray().astype(int))

def m_metrics_l(ml_model,test_data):
    predictions = ml_model.transform(test_data).cache()
    predictionAndLabels = predictions.select("label","prediction").rdd.map(lambda x: (float(x[0]), float(x[1]))).cache()
    
    # Print some predictions vs labels
    # print(predictionAndLabels.take(10))
    metrics = MulticlassMetrics(predictionAndLabels)
    
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print(f"Precision = {precision:.4f} Recall = {recall:.4f} F1 Score = {f1Score:.4f}")
    print("Confusion matrix \n", metrics.confusionMatrix().toArray().astype(int))

# ML Model

In [62]:
import time
#selector = ChiSqSelector(numTopFeatures=200, featuresCol=idf.getOutputCol(), outputCol="features", labelCol="label")
cassifier = LogisticRegression(maxIter=5, featuresCol = "featuresIDF", weightCol="weight")
#cassifier = LinearSVC(maxIter=10,  weightCol="weight")
#cassifier = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), weightCol="weight")
start = time.time()
#cassifier = GBTClassifier(maxIter=50, featuresCol = "featuresIDF", weightCol="weight")
pipeline = Pipeline(stages=[cassifier])
print(f"Training started.")
model = pipeline.fit(transformed_data)
print(f"Model created in {time.time()-start:.2f}s.")
m_metrics_l(model,transformed_test)
print(f"Total time {time.time()-start:.2f}s.")

Training started.
Model created in 12.15s.
Precision = 0.8289 Recall = 0.7967 F1 Score = 0.8125
Confusion matrix 
 [[1018  212]
 [ 262 1027]]
Total time 23.80s.


In [68]:
cassifier = GBTClassifier(maxIter=10, featuresCol = "featuresIDF", maxDepth=10)
pipeline = Pipeline(stages=[cassifier])
start = time.time()
print(f"Training started.")
model = pipeline.fit(transformed_data)
print(f"Model created in {time.time()-start:.2f}s.")
m_metrics_l(model,transformed_test)
print(f"Total time {time.time()-start:.2f}s.")

Training started.
Model created in 56.40s.
Precision = 0.8144 Recall = 0.7007 F1 Score = 0.7533
Confusion matrix 
 [[ 849  230]
 [ 431 1009]]
Total time 64.66s.


In [64]:
cassifier = LinearSVC(maxIter=10, featuresCol = "featuresIDF", weightCol="weight")
pipeline = Pipeline(stages=[cassifier])
start = time.time()
print(f"Training started.")
model = pipeline.fit(transformed_data)
print(f"Model created in {time.time()-start:.2f}s.")
m_metrics_l(model,transformed_test)
print(f"Total time {time.time()-start:.2f}s.")

Training started.
Model created in 11.93s.
Precision = 0.8313 Recall = 0.7960 F1 Score = 0.8133
Confusion matrix 
 [[1016  209]
 [ 264 1030]]
Total time 20.13s.


In [69]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
layers = [500, 100, 20, 2]

# create the trainer and set its parameters
cassifier = MultilayerPerceptronClassifier(maxIter=10, layers=layers,featuresCol = "featuresIDF", blockSize=128, seed=1234)
pipeline = Pipeline(stages=[cassifier])
start = time.time()
print(f"Training started.")
model = pipeline.fit(transformed_data)
print(f"Model created in {time.time()-start:.2f}s.")
m_metrics_l(model,transformed_test)
print(f"Total time {time.time()-start:.2f}s.")

Training started.
Model created in 17.87s.
Precision = 0.7813 Recall = 0.8020 F1 Score = 0.7915
Confusion matrix 
 [[1041  271]
 [ 239  968]]
Total time 27.15s.
