In [49]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

from shared.constants import DatasetPath

In [51]:
DATASET = DatasetPath('imdb-5000-movie-dataset')

In [52]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

In [53]:
df = (
    spark.read.csv(DATASET.raw_str('movie_metadata.csv'), header=True, inferSchema=True)
        .withColumn('id', F.regexp_extract('movie_imdb_link', 'http:\/\/www.imdb.com\/title\/([A-z0-9]+)\/(.*)', 1))
        .withColumn('plot_keywords', F.split('plot_keywords', '\|'))
        .withColumn('genres', F.split('genres', '\|'))
)
df.head(5)

[Row(color='Color', director_name='James Cameron', num_critic_for_reviews=723, duration=178, director_facebook_likes=0, actor_3_facebook_likes=855, actor_2_name='Joel David Moore', actor_1_facebook_likes=1000, gross=760505847, genres=['Action', 'Adventure', 'Fantasy', 'Sci-Fi'], actor_1_name='CCH Pounder', movie_title='Avatar\xa0', num_voted_users=886204, cast_total_facebook_likes=4834, actor_3_name='Wes Studi', facenumber_in_poster=0, plot_keywords=['avatar', 'future', 'marine', 'native', 'paraplegic'], movie_imdb_link='http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1', num_user_for_reviews=3054, language='English', country='USA', content_rating='PG-13', budget=237000000, title_year=2009, actor_2_facebook_likes=936, imdb_score=7.9, aspect_ratio=1.78, movie_facebook_likes=33000, id='tt0499549'),
 Row(color='Color', director_name='Gore Verbinski', num_critic_for_reviews=302, duration=169, director_facebook_likes=563, actor_3_facebook_likes=1000, actor_2_name='Orlando Bloom', actor_1

In [54]:
df_nodes_persons = (
    df.select(F.col('actor_1_name').alias('name'), F.col('actor_1_facebook_likes').alias('facebook_likes'))
        .union(df.select(F.col('actor_2_name').alias('name'), F.col('actor_2_facebook_likes').alias('facebook_likes')))
        .union(df.select(F.col('actor_3_name').alias('name'), F.col('actor_3_facebook_likes').alias('facebook_likes')))
        .union(df.select(F.col('director_name').alias('name'), F.col('director_facebook_likes').alias('facebook_likes')))
        .dropDuplicates(['name'])
        .filter(F.col('name').isNotNull())
        .withColumn('id', F.col('name'))
)
print(df_nodes_persons.count())
df_nodes_persons.head(5)

8491


[Row(name='Doug Walker', facebook_likes=131, id='Doug Walker'),
 Row(name='Oliver Platt', facebook_likes=1000, id='Oliver Platt'),
 Row(name='Snoop Dogg', facebook_likes=881, id='Snoop Dogg'),
 Row(name='Stephen Root', facebook_likes=939, id='Stephen Root'),
 Row(name='Laurence Olivier', facebook_likes=1000, id='Laurence Olivier')]

In [55]:
df_nodes_genres = (
    df.select(F.explode('genres').alias('name'))
        .dropDuplicates(['name'])
        .withColumn('id', F.col('name'))
        .filter(F.col('id').isNotNull())
)
print(df_nodes_genres.count())
df_nodes_genres.head(5)

26


[Row(name='Crime', id='Crime'),
 Row(name='Romance', id='Romance'),
 Row(name='Thriller', id='Thriller'),
 Row(name='Adventure', id='Adventure'),
 Row(name='Drama', id='Drama')]

In [56]:
df_nodes_keywords = (
    df.select(F.explode('plot_keywords').alias('name'))
        .groupby('name')
        .count().filter('count > 1')
        .withColumn('id', F.col('name'))
        .filter(F.col('id').isNotNull())
)
print(df_nodes_keywords.count())
df_nodes_keywords.head(5)

3178


[Row(name='ingratitude', count=2, id='ingratitude'),
 Row(name='title appears in writing', count=3, id='title appears in writing'),
 Row(name='space colony', count=2, id='space colony'),
 Row(name='travel', count=17, id='travel'),
 Row(name='ransom', count=9, id='ransom')]

In [57]:
df_nodes_movies = (
    df
        .withColumn('name', F.col('movie_title'))
        .drop(
        'genres', 'movie_title',
        'actor_1_name', 'actor_1_facebook_likes',
        'actor_2_name', 'actor_2_facebook_likes',
        'actor_3_name', 'actor_3_facebook_likes',
        'director_name', 'director_facebook_likes',
        'movie_imdb_link'
    ).withColumn('timestamp', F.to_timestamp(F.col('title_year').cast('string'), 'yyyy'))
    .filter(F.col('id').isNotNull())
    .dropDuplicates(['id'])
)
print(df_nodes_movies.count())
df_nodes_movies.head(5)

4919


[Row(color=' Black and White', num_critic_for_reviews=69, duration=123, gross=None, num_voted_users=10718, cast_total_facebook_likes=481, facenumber_in_poster=1, plot_keywords=['huguenot', 'intolerance', 'medicis', 'protestant', 'wedding'], num_user_for_reviews=88, language=None, country='USA', content_rating='Not Rated', budget=385907, title_year=1916, imdb_score=8.0, aspect_ratio=1.33, movie_facebook_likes=691, id='tt0006864', name="Intolerance: Love's Struggle Throughout the Ages\xa0", timestamp=datetime.datetime(1916, 1, 1, 0, 0)),
 Row(color=' Black and White', num_critic_for_reviews=1, duration=110, gross=3000000, num_voted_users=5, cast_total_facebook_likes=4, facenumber_in_poster=1, plot_keywords=['family relationships', 'gang', 'idler', 'poorhouse', 'thief'], num_user_for_reviews=1, language=None, country='USA', content_rating=None, budget=100000, title_year=1920, imdb_score=4.8, aspect_ratio=1.33, movie_facebook_likes=0, id='tt0011549', name='Over the Hill to the Poorhouse\xa

In [58]:
df_edges_acted_in = (
    df.select(F.col('id').alias('movie_id'), F.col('actor_1_name').alias('actor_name'))
        .union(df.select(F.col('id').alias('movie_id'), F.col('actor_2_name').alias('actor_name')))
        .union(df.select(F.col('id').alias('movie_id'), F.col('actor_3_name').alias('actor_name')))
        .dropDuplicates(['movie_id', 'actor_name'])
        .select(F.col('actor_name').alias('src'), F.col('movie_id').alias('dst'))
        .filter(F.col('src').isNotNull())
        .filter(F.col('dst').isNotNull())
)
df_edges_acted_in.head(5)

[Row(src='Nicolas Cage', dst='tt0187078'),
 Row(src='Channing Tatum', dst='tt1578275'),
 Row(src='Jada Pinkett Smith', dst='tt0117218'),
 Row(src='Matt Damon', dst='tt1385826'),
 Row(src='Kevin Spacey', dst='tt0120623')]

In [59]:
df_edges_directed = (
    df.select(F.col('id').alias('movie_id'), F.col('director_name').alias('director_name'))
        .dropDuplicates(['movie_id', 'director_name'])
        .select(F.col('director_name').alias('src'), F.col('movie_id').alias('dst'))
        .filter(F.col('src').isNotNull())
        .filter(F.col('dst').isNotNull())
)
df_edges_directed.head(5)

[Row(src='Judd Apatow', dst='tt1201167'),
 Row(src='Tony Bill', dst='tt0454824'),
 Row(src='Rod Lurie', dst='tt0272020'),
 Row(src='Tim Miller', dst='tt1431045'),
 Row(src='Anthony Hemingway', dst='tt0485985')]

In [60]:
df_edges_has_genre = (
    df.select(F.col('id').alias('movie_id'), F.explode('genres').alias('genre'))
        .dropDuplicates(['movie_id', 'genre'])
        .select(F.col('movie_id').alias('src'), F.col('genre').alias('dst'))
        .filter(F.col('src').isNotNull())
        .filter(F.col('dst').isNotNull())
)
df_edges_has_genre.head(5)

[Row(src='tt1375666', dst='Adventure'),
 Row(src='tt0413267', dst='Adventure'),
 Row(src='tt0436339', dst='Animation'),
 Row(src='tt1680310', dst='Action'),
 Row(src='tt1320261', dst='Family')]

In [61]:
df_edges_has_keyword = (
    df.select(F.col('id').alias('movie_id'), F.explode('plot_keywords').alias('plot_keywords'))
        .dropDuplicates(['movie_id', 'plot_keywords'])
        .select(F.col('movie_id').alias('src'), F.col('plot_keywords').alias('dst'))
        .filter(F.col('src').isNotNull())
        .filter(F.col('dst').isNotNull())
        .join(df_nodes_keywords.select('id'), F.col('id') ==  F.col('dst'), 'inner')
        .drop('id')
)
df_edges_has_keyword.head(5)

[Row(src='tt0371746', dst='billionaire'),
 Row(src='tt0938283', dst='kingdom'),
 Row(src='tt1229238', dst='race against time'),
 Row(src='tt0122151', dst='lapd'),
 Row(src='tt0177971', dst='storm')]

In [62]:
df_nodes_persons.write.parquet(DATASET.processed_str('nodes_Persons'), mode='overwrite')
df_nodes_genres.write.parquet(DATASET.processed_str('nodes_Genres'), mode='overwrite')
df_nodes_keywords.write.parquet(DATASET.processed_str('nodes_Keywords'), mode='overwrite')
df_nodes_movies.write.parquet(DATASET.processed_str('nodes_Movies'), mode='overwrite')

df_edges_acted_in.write.parquet(DATASET.processed_str('edges_ACTED_IN'), mode='overwrite')
df_edges_directed.write.parquet(DATASET.processed_str('edges_DIRECTED'), mode='overwrite')
df_edges_has_genre.write.parquet(DATASET.processed_str('edges_HAS_GENRE'), mode='overwrite')
df_edges_has_keyword.write.parquet(DATASET.processed_str('edges_HAS_KEYWORD'), mode='overwrite')

In [63]:
from datasets.build_schema import build_schema

build_schema(
    spark,
    name=str(DATASET),
    nodes=[
        ('Person', DATASET.processed_str('nodes_Persons')),
        ('Genre', DATASET.processed_str('nodes_Genres')),
        ('Keyword', DATASET.processed_str('nodes_Keywords')),
        ('Movie', DATASET.processed_str('nodes_Movies')),
    ],
    edges=[
        ('ActedIn', 'Person', 'Movie', DATASET.processed_str('edges_ACTED_IN')),
        ('Directed', 'Person', 'Movie', DATASET.processed_str('edges_DIRECTED')),
        ('HasGenre', 'Movie', 'Genre', DATASET.processed_str('edges_HAS_GENRE')),
        ('HasKeyword', 'Movie', 'Keyword', DATASET.processed_str('edges_HAS_KEYWORD')),
    ]
)

[2022-01-19 16:48:13,709][/dd_volume/Development/Python/Thesis/code/datasets/datasets/build_schema.py][DEBUG] Merging old schema for imdb-5000-movie-dataset


DatasetSchema(name='imdb-5000-movie-dataset', prefix='Imdb_5000MovieDataset', database='imdb-5000-movie-dataset', description='None', nodes=[NodeSchema(path='data/processed/imdb-5000-movie-dataset/nodes_Persons', properties=[Property(name='name', type='string', ignore=False, label=True, timestamp=False), Property(name='facebook_likes', type='int', ignore=False, label=False, timestamp=False), Property(name='id', type='string', ignore=False, label=False, timestamp=False)], label='Person'), NodeSchema(path='data/processed/imdb-5000-movie-dataset/nodes_Genres', properties=[Property(name='name', type='string', ignore=False, label=True, timestamp=False), Property(name='id', type='string', ignore=False, label=False, timestamp=False)], label='Genre'), NodeSchema(path='data/processed/imdb-5000-movie-dataset/nodes_Keywords', properties=[Property(name='name', type='string', ignore=False, label=True, timestamp=False), Property(name='count', type='long', ignore=True, label=False, timestamp=False), 