In [1]:
import re
import numpy as np
from pyspark.sql import functions as F
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext("local")
spark = SparkSession.builder.getOrCreate()

# Load and preprocess data

In [2]:
import numpy as np
from pyspark.sql import functions as F
tFile="data\IMDB Dataset.csv"
df = spark.read.csv(tFile,header=True)
df.show(3)

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|One of the other ...| positive|
|A wonderful littl...| positive|
|I thought this wa...| positive|
+--------------------+---------+
only showing top 3 rows



In [3]:
df = df.sample(.3)
#df= df.where(F.col("sentiment")=="positive")

In [4]:
df.groupBy("sentiment").count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| positive| 7550|
| negative| 7425|
+---------+-----+



In [5]:
df = df.withColumn("text_h", F.regexp_replace(F.col("text"), r'<[^>]+>', ""));

In [6]:
# Remove the special chars. Only lettere will reamin.
df = df.withColumn("text_c", F.regexp_replace(F.regexp_replace(F.col("text_h"), "[^a-zA-Z ]", ""),' +'," "))

In [7]:
df.show(20)

+--------------------+---------+--------------------+--------------------+
|                text|sentiment|              text_h|              text_c|
+--------------------+---------+--------------------+--------------------+
|Encouraged by the...| negative|Encouraged by the...|Encouraged by the...|
|So im not a big f...| negative|So im not a big f...|So im not a big f...|
|The cast played S...| negative|The cast played S...|The cast played S...|
|Some films just s...| positive|Some films just s...|Some films just s...|
|This movie made i...| negative|This movie made i...|This movie made i...|
|What an absolutel...| positive|What an absolutel...|What an absolutel...|
|'War movie' is a ...| positive|'War movie' is a ...|War movie is a Ho...|
|I watched this fi...| negative|I watched this fi...|I watched this fi...|
|I bought this fil...| negative|I bought this fil...|I bought this fil...|
|Of all the films ...| negative|Of all the films ...|Of all the films ...|
|As a disclaimer, ...| po

In [8]:
from pyspark.ml.feature import Tokenizer, CountVectorizer,IDF
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.ml.clustering import LDA

# Create LDA model wiht two topics

In [9]:
# Text preprocessin pipeline
tokenizer = Tokenizer(inputCol="text_c", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
#countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features", vocabSize=500)
#countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features", vocabSize=500,minDF=10, maxDF=1000)
countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features_c", vocabSize=1000)

idf = IDF(inputCol=countVectorizer.getOutputCol(), outputCol="features")
pipeline = Pipeline(stages=[tokenizer,remover, countVectorizer,idf])
data_model = pipeline.fit(df)

In [11]:
vocabulary = data_model.stages[2].vocabulary
print(vocabulary[:100])

['movie', 'film', 'one', 'like', 'good', 'even', 'time', 'really', 'see', 'story', 'much', 'well', 'get', 'bad', 'great', 'also', 'people', 'first', 'dont', 'movies', 'made', 'films', 'make', 'way', 'characters', 'think', 'seen', 'watch', 'character', 'never', 'two', 'many', 'love', 'show', 'plot', 'little', 'best', 'acting', 'know', 'ever', 'life', 'better', 'scene', 'man', 'say', 'scenes', 'still', 'end', 'something', 'go', 'real', 'back', 'actors', 'im', 'watching', 'thing', 'didnt', 'doesnt', 'though', 'years', 'actually', 'makes', 'funny', 'nothing', 'look', 'another', 'find', 'lot', 'going', 'work', 'old', 'every', 'cant', 'new', 'part', 'us', 'director', 'pretty', 'quite', 'thats', 'want', 'around', 'take', 'cast', 'seems', 'fact', 'got', 'things', 'big', 'young', 'thought', 'give', 'enough', 'isnt', 'world', 'may', 'music', 'horror', 'long', 'interesting']


In [12]:
dataset = data_model.transform(df)
dataset.show(10)

+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|sentiment|              text_h|              text_c|               words|            filtered|          features_c|            features|
+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Encouraged by the...| negative|Encouraged by the...|Encouraged by the...|[encouraged, by, ...|[encouraged, posi...|(1000,[1,2,8,13,2...|(1000,[1,2,8,13,2...|
|So im not a big f...| negative|So im not a big f...|So im not a big f...|[so, im, not, a, ...|[im, big, fan, bo...|(1000,[0,1,2,3,4,...|(1000,[0,1,2,3,4,...|
|The cast played S...| negative|The cast played S...|The cast played S...|[the, cast, playe...|[cast, played, sh...|(1000,[0,29,38,44...|(1000,[0,29,38,44...|
|Some films just s...| positive|Some films jus

In [13]:
# Find two topics
lda = LDA(k=2, maxIter=20)
model = lda.fit(dataset)


In [14]:
# Print the LDA transformation matrix
model.topicsMatrix()

DenseMatrix(1000, 2, [1867.8688, 2861.3647, 1780.3746, 1278.9474, 1376.8069, 1131.986, 1406.4946, 1193.581, ..., 313.8083, 382.3281, 226.4952, 24.3532, 309.7665, 240.8443, 93.8796, 590.9369], 0)

In [15]:
# Describe topics
topics = model.describeTopics(5)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------------+---------------------------------------------------------------------------------------------------------------+
|topic|termIndices      |termWeights                                                                                                    |
+-----+-----------------+---------------------------------------------------------------------------------------------------------------+
|0    |[1, 9, 14, 40, 0]|[0.0059993706591617785, 0.00421333168389642, 0.004075364779163437, 0.003961109057541186, 0.0039163260799191905]|
|1    |[0, 1, 13, 3, 7] |[0.006357085823521961, 0.005371918531223365, 0.005195876911487453, 0.004647565655955355, 0.0041393613297162225]|
+-----+-----------------+---------------------------------------------------------------------------------------------------------------+



In [16]:
# Print most important words per topic
topics = model.describeTopics(15)
for r in topics.select("termIndices").collect():
    rez = []
    for l in r:
        for i in l:
            rez.append(vocabulary[i])
    print(rez[:15])

['film', 'story', 'great', 'life', 'movie', 'also', 'one', 'love', 'best', 'two', 'man', 'show', 'characters', 'well', 'young']
['movie', 'film', 'bad', 'like', 'really', 'even', 'good', 'dont', 'get', 'one', 'movies', 'see', 'plot', 'people', 'say']


# Create LDA model wiht ten topics

In [23]:
# Text preprocessin pipeline
tokenizer = Tokenizer(inputCol="text_c", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
#countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features_c", vocabSize=500)
# Run 1: Use all the words
# countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features_c", vocabSize=1000)
# Run 2: Discard the very frequent words
countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features_c", vocabSize=1000,minDF=10, maxDF=1000)

idf = IDF(inputCol=countVectorizer.getOutputCol(), outputCol="features")
pipeline = Pipeline(stages=[tokenizer,remover, countVectorizer,idf])
data_model = pipeline.fit(df)

In [24]:
vocabulary = data_model.stages[2].vocabulary
print(vocabulary[:100])

['book', 'version', 'american', 'shot', 'john', 'audience', 'read', 'night', 'war', 'completely', 'death', 'high', 'youre', 'nice', 'fan', 'poor', 'house', 'year', 'simply', 'along', 'use', 'short', 'less', 'friends', 'kids', 'black', 'second', 'used', 'either', 'given', 'men', 'home', 'line', 'stupid', 'half', 'mind', 'dead', 'need', 'rest', 'classic', 'help', 'enjoy', 'father', 'wife', 'wrong', 'star', 'truly', 'try', 'start', 'production', 'couple', 'understand', 'sex', 'recommend', 'boring', 'terrible', 'next', 'performances', 'wonderful', 'moments', 'keep', 'women', 'remember', 'getting', 'mean', 'small', 'video', 'full', 'couldnt', 'budget', 'others', 'gives', 'tell', 'camera', 'human', 'school', 'playing', 'awful', 'let', 'often', 'came', 'guys', 'hollywood', 'name', 'definitely', 'absolutely', 'early', 'lines', 'head', 'liked', 'case', 'perfect', 'episode', 'certainly', 'dialogue', 'top', 'piece', 'perhaps', 'stars', 'sound']


In [25]:
dataset = data_model.transform(df)
dataset.show(10)

+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|sentiment|              text_h|              text_c|               words|            filtered|          features_c|            features|
+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Encouraged by the...| negative|Encouraged by the...|Encouraged by the...|[encouraged, by, ...|[encouraged, posi...|(1000,[22,46,54,7...|(1000,[22,46,54,7...|
|So im not a big f...| negative|So im not a big f...|So im not a big f...|[so, im, not, a, ...|[im, big, fan, bo...|(1000,[13,14,19,2...|(1000,[13,14,19,2...|
|The cast played S...| negative|The cast played S...|The cast played S...|[the, cast, playe...|[cast, played, sh...|(1000,[60,87,180,...|(1000,[60,87,180,...|
|Some films just s...| positive|Some films jus

In [27]:
# Find two topics
lda = LDA(k=10, maxIter=20)
model = lda.fit(dataset)

In [28]:
# Describe topics
topics = model.describeTopics(5)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-------------------------+----------------------------------------------------------------------------------------------------------------+
|topic|termIndices              |termWeights                                                                                                     |
+-----+-------------------------+----------------------------------------------------------------------------------------------------------------+
|0    |[167, 219, 867, 916, 31] |[0.008098707900609042, 0.007473186794659393, 0.0060857619621374846, 0.005692707050313683, 0.0055632589865849645]|
|1    |[360, 517, 744, 1, 796]  |[0.008742877521683198, 0.007603318446062859, 0.007182775413896774, 0.007102547163107844, 0.0065358513794967375] |
|2    |[92, 383, 344, 446, 8]   |[0.011672105610638785, 0.009681715486126403, 0.007810046644431173, 0.006987408293900571, 0.006207827393365973]  |
|3    |[614, 69, 535, 300, 473] |[0.006308397469852298, 0.0062977538

In [29]:
# Print most important words per topic
topics = model.describeTopics(10)
for r in topics.select("termIndices").collect():
    rez = []
    for l in r:
        for i in l:
            rez.append(vocabulary[i])
    print(rez[:15])

['killer', 'police', 'cat', 'slasher', 'home', 'die', 'tries', 'kids', 'murder', 'ill']
['musical', 'dance', 'alien', 'version', 'dancing', 'song', 'number', 'remake', 'mean', 'sound']
['episode', 'episodes', 'documentary', 'realistic', 'war', 'cinema', 'air', 'dull', 'youll', 'political']
['match', 'budget', 'scifi', 'power', 'season', 'science', 'disturbing', 'master', 'sex', 'low']
['book', 'joe', 'island', 'father', 'read', 'game', 'baby', 'wonderful', 'relationship', 'brother']
['jokes', 'jane', 'jack', 'band', 'audience', 'easy', 'sexual', 'sam', 'given', 'train']
['war', 'christmas', 'history', 'subject', 'message', 'understand', 'battle', 'chris', 'certain', 'nature']
['school', 'waste', 'terrible', 'awful', 'stupid', 'pathetic', 'monster', 'game', 'couldnt', 'bunch']
['de', 'art', 'house', 'la', 'told', 'tom', 'english', 'american', 'water', 'simply']
['lee', 'disney', 'creepy', 'peter', 'dr', 'computer', 'star', 'eye', 'songs', 'race']


# Topic classification

In [30]:
# Shows the result
transformed = model.transform(dataset)
transformed.select("text_c","topicDistribution").show(5)

+--------------------+--------------------+
|              text_c|   topicDistribution|
+--------------------+--------------------+
|Encouraged by the...|[0.00134266902034...|
|So im not a big f...|[6.04925096092124...|
|The cast played S...|[0.00176946977597...|
|Some films just s...|[0.00231624133659...|
|This movie made i...|[0.32533048337194...|
+--------------------+--------------------+
only showing top 5 rows



In [31]:
from pyspark.sql.functions import udf
@udf
def vect_argmax(row):
    row_arr = row.toArray()
    max_pos = np.argmax(row_arr)
    return(int(max_pos))
transformed1 = transformed.withColumn("argmax",vect_argmax(F.col('topicDistribution')))

In [32]:
transformed1.select("text_c","argmax").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------