In [1]:
# Spark session entry point.
from pyspark.sql import SparkSession
# NOTE(review): the two wildcard imports below make it hard to see where names used
# later in the notebook (e.g. regexp_replace, lower, trim, udf) come from, and they can
# shadow existing names — consider importing the required functions explicitly.
from pyspark.sql.functions import *
from pyspark.sql.types import *
import sqlite3
import pandas as pd
import numpy as np
import warnings
import re
import nltk
# Fetch the NLTK corpora: 'stopwords' is read by the stop-word removal step;
# 'wordnet' and 'omw-1.4' are presumably needed by the textblob lemmatizer — confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# textblob's Word provides the .lemmatize() call used during text cleaning.
from textblob import Word

[nltk_data] Downloading package stopwords to /home/mine44/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mine44/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mine44/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Obtain the active Spark session, or start one with default settings if none exists.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/22 18:50:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the imdb_movies and imdb_marvel tables from the SQLite database.
# Each table is read completely into a pandas DataFrame first and then converted
# to a Spark DataFrame, so the database handle is not needed once loading is done.
conn = sqlite3.connect("imdb_movies.sqlite")
try:
    imdb_movies = spark.createDataFrame(pd.read_sql_query("SELECT * from imdb_movies", conn))
    imdb_marvel = spark.createDataFrame(pd.read_sql_query("SELECT * from imdb_marvel", conn))
finally:
    # Always release the SQLite connection, even if a query or the conversion fails.
    conn.close()

In [4]:
# Preview the raw movies table (first 20 rows, long cell values truncated).
imdb_movies.show(n=20, truncate=True)

[Stage 0:>                                                          (0 + 1) / 1]

+---------+---------+--------------------+---------+--------------+--------------------+--------------------+------------+--------------------+-------------+--------+
|   tconst|titleType|        primaryTitle|startYear|runtimeMinutes|              genres|            overview|release_date|             tagline|averageRating|numVotes|
+---------+---------+--------------------+---------+--------------+--------------------+--------------------+------------+--------------------+-------------+--------+
|tt0015400|    movie| The Thief of Bagdad|     1924|           155|Adventure,Family,...|A recalcitrant th...|     3/18/24|"""Happiness Must...|          7.7|    6324|
|tt0019563|    movie|    West of Zanzibar|     1928|            65|       Drama,Mystery|Silent film (with...|    11/24/28|                null|          7.2|    1817|
|tt0022958|    movie|         Grand Hotel|     1932|           112|       Drama,Romance|Guests at a posh ...|     5/25/32|Thank The Stars F...|          7.3|   19576

                                                                                

In [5]:
# Preview the raw Marvel table (first 20 rows, long cell values truncated).
imdb_marvel.show(n=20, truncate=True)

+--------------------+--------------+--------------------+--------------------+-----------------+--------------------+-------------+--------+
|        primaryTitle|runtimeMinutes|              genres|            overview|     release_date|             tagline|averageRating|numVotes|
+--------------------+--------------+--------------------+--------------------+-----------------+--------------------+-------------+--------+
|Black Panther: Wa...|           161|Action,Adventure,...|The people of Wak...|November 11, 2022|             Forever|          7.3|   74000|
|       Black Panther|           134|Action,Adventure,...|T'Challa, heir to...|February 16, 2018| Long live the king.|          7.3|  762000|
|Thor: Love and Th...|           118|Action,Adventure,...|Thor enlists the ...|     July 8, 2022|Not every god has...|          6.3|  309000|
|She-Hulk: Attorne...|            35|Action,Adventure,...|Jennifer Walters ...|  August 18, 2022|You'll like her w...|          5.2|  159000|
|Spide

In [6]:
# ---------------------------------------------------------------------------
# Clean the free-text columns (overview, tagline) of both tables.
# The same pipeline is applied to every text column of imdb_movies and
# imdb_marvel, so it is defined once and reused instead of being copy-pasted.
# ---------------------------------------------------------------------------

# Columns that hold free text and go through the cleaning pipeline.
TEXT_COLUMNS = ['overview', 'tagline']

# (pattern, replacement) pairs applied in this order BEFORE lower-casing.
# Patterns are evaluated by Spark with Java regex syntax; raw strings keep the
# backslashes intact without relying on Python's invalid-escape fallback.
PRE_LOWERCASE_SUBSTITUTIONS = [
    (r'[\n]+', ' '),    # Remove \n character
    (r'[\r]+', ' '),    # Remove \r character
    ('&amp;', ' '),     # Remove &amp; characters
    (r'http\S+', ' '),  # Remove URL
    ('!', '.'),         # Replace ! with .
    ('[.]+', '. '),     # Replace multiple points with single point
    # Remove special characters (triple-quoted so the pattern can contain both ' and ")
    (r'''[-—–_~%&\\/;:"“”‘’•|,<>?#@àè\^\(\)\*'\[\]]+''', ' '),
]

# English stop words; the frozenset gives constant-time membership tests in the UDF.
stop_words = stopwords.words('english')
_stop_word_lookup = frozenset(stop_words)


def _remove_stop_words(text):
    """Return `text` without English stop words, tokens re-joined by single spaces.

    None (a SQL NULL) is passed through unchanged instead of raising.
    """
    if text is None:
        return None
    return " ".join(token for token in text.split() if token not in _stop_word_lookup)


def _lemmatize(text):
    """Return `text` with every whitespace-separated token lemmatized via textblob.

    None (a SQL NULL) is passed through unchanged instead of raising.
    """
    if text is None:
        return None
    return ' '.join(Word(token).lemmatize() for token in text.split())


# Spark UDF wrappers (default return type: string).
remove_stop_words_udf = udf(_remove_stop_words)
lemmatize_udf = udf(_lemmatize)


def clean_text_column(df, column):
    """Return `df` with the text in `column` normalized.

    Steps, in order: regex substitutions from PRE_LOWERCASE_SUBSTITUTIONS,
    lower-casing, punctuation removal, stop-word removal, lemmatization,
    whitespace collapsing and trimming.

    Parameters:
        df: Spark DataFrame containing `column`.
        column: name of the string column to clean.

    Returns:
        A new Spark DataFrame; only `column` is modified.
    """
    for pattern, replacement in PRE_LOWERCASE_SUBSTITUTIONS:
        df = df.withColumn(column, regexp_replace(column, pattern, replacement))
    # Converting text to lower case
    df = df.withColumn(column, lower(df[column]))
    # Removing punctuation
    # NOTE(review): Java regex treats \w as ASCII-only by default, so accented
    # letters are replaced by spaces here as well — confirm this is intended.
    df = df.withColumn(column, regexp_replace(column, r'[^\w\s]', ' '))
    # Removing stop words
    df = df.withColumn(column, remove_stop_words_udf(df[column]))
    # Lemmatizing words
    df = df.withColumn(column, lemmatize_udf(df[column]))
    # Remove redundant spaces
    df = df.withColumn(column, regexp_replace(column, r'[\s]+', ' '))
    # Trim leading/trailing spaces
    df = df.withColumn(column, trim(df[column]))
    return df


# Remove movies that have empty content (genres, overview and tagline all missing)
imdb_movies = imdb_movies.dropna(subset=['genres', 'overview', 'tagline'], how='all')
# Fill missing values of content
imdb_movies = imdb_movies.fillna(value='', subset=TEXT_COLUMNS)
# A tagline that consists solely of the placeholder text 'None' is treated as empty.
# The pattern is anchored so that the word "None" inside a real tagline is kept.
imdb_marvel = imdb_marvel.withColumn('tagline', regexp_replace('tagline', r'^\s*None\s*$', ''))
# Fill genuine NULLs too, mirroring the handling of imdb_movies.
imdb_marvel = imdb_marvel.fillna(value='', subset=TEXT_COLUMNS)

# Run the shared cleaning pipeline over every text column of both tables.
for text_column in TEXT_COLUMNS:
    imdb_movies = clean_text_column(imdb_movies, text_column)
    imdb_marvel = clean_text_column(imdb_marvel, text_column)

In [9]:
# Export both cleaned tables as pipe-delimited CSV files without the pandas index.
for cleaned_df, csv_path in (
    (imdb_marvel, 'imdb_marvel_cleaned.csv'),
    (imdb_movies, 'imdb_movies_cleaned.csv'),
):
    cleaned_df.toPandas().to_csv(csv_path, sep='|', index=False)

                                                                                