In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [7]:
from shared.schema import DatasetSchema

DATASET = DatasetSchema.load_schema('imdb-5000-movie-dataset')
DATASET.save_schema()

In [8]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/12 11:44:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
df = (
    spark.read.csv(DATASET.raw_str('movie_metadata.csv'), header=True, inferSchema=True)
        .withColumn('imdb_id',
                    F.regexp_extract('movie_imdb_link', 'http:\/\/www.imdb.com\/title\/([A-z0-9]+)\/(.*)', 1))
        .withColumn('plot_keywords', F.split('plot_keywords', '\|'))
        .withColumn('genres', F.split('genres', '\|'))
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
df.head(5)

22/03/12 11:44:50 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Row(color='Color', director_name='James Cameron', num_critic_for_reviews=723, duration=178, director_facebook_likes=0, actor_3_facebook_likes=855, actor_2_name='Joel David Moore', actor_1_facebook_likes=1000, gross=760505847, genres=['Action', 'Adventure', 'Fantasy', 'Sci-Fi'], actor_1_name='CCH Pounder', movie_title='Avatar\xa0', num_voted_users=886204, cast_total_facebook_likes=4834, actor_3_name='Wes Studi', facenumber_in_poster=0, plot_keywords=['avatar', 'future', 'marine', 'native', 'paraplegic'], movie_imdb_link='http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1', num_user_for_reviews=3054, language='English', country='USA', content_rating='PG-13', budget=237000000, title_year=2009, actor_2_facebook_likes=936, imdb_score=7.9, aspect_ratio=1.78, movie_facebook_likes=33000, imdb_id='tt0499549', id=0),
 Row(color='Color', director_name='Gore Verbinski', num_critic_for_reviews=302, duration=169, director_facebook_likes=563, actor_3_facebook_likes=1000, actor_2_name='Orlando Bloo

In [10]:
df_nodes_persons = (
    df.select(F.col('actor_1_name').alias('name'), F.col('actor_1_facebook_likes').alias('facebook_likes'))
        .union(df.select(F.col('actor_2_name').alias('name'), F.col('actor_2_facebook_likes').alias('facebook_likes')))
        .union(df.select(F.col('actor_3_name').alias('name'), F.col('actor_3_facebook_likes').alias('facebook_likes')))
        .union(
        df.select(F.col('director_name').alias('name'), F.col('director_facebook_likes').alias('facebook_likes')))
        .dropDuplicates(['name'])
        .filter(F.col('name').isNotNull())
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_nodes_persons.count())
df_nodes_persons.head(5)

                                                                                

8491


[Row(name='Doug Walker', facebook_likes=131, id=0),
 Row(name='Oliver Platt', facebook_likes=1000, id=1),
 Row(name='Snoop Dogg', facebook_likes=881, id=2),
 Row(name='Stephen Root', facebook_likes=939, id=3),
 Row(name='Laurence Olivier', facebook_likes=1000, id=4)]

In [11]:
df_nodes_genres = (
    df.select(F.explode('genres').alias('name'))
        .dropDuplicates(['name'])
        .filter(F.col('name').isNotNull())
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_nodes_genres.count())
df_nodes_genres.head(5)

26


[Row(name='Action', id=0),
 Row(name='Adventure', id=1),
 Row(name='Fantasy', id=2),
 Row(name='Sci-Fi', id=3),
 Row(name='Thriller', id=4)]

In [12]:
df_nodes_movies = (
    df.withColumn('name', F.col('movie_title'))
        .drop(
        'genres', 'movie_title',
        'actor_1_name', 'actor_1_facebook_likes',
        'actor_2_name', 'actor_2_facebook_likes',
        'actor_3_name', 'actor_3_facebook_likes',
        'director_name', 'director_facebook_likes',
        'movie_imdb_link'
    ).withColumn('timestamp', F.col('title_year'))
        .filter(F.col('imdb_id').isNotNull())
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
)
print(df_nodes_movies.count())
df_nodes_movies.head(5)

5043


[Row(color='Color', num_critic_for_reviews=723, duration=178, gross=760505847, num_voted_users=886204, cast_total_facebook_likes=4834, facenumber_in_poster=0, plot_keywords=['avatar', 'future', 'marine', 'native', 'paraplegic'], num_user_for_reviews=3054, language='English', country='USA', content_rating='PG-13', budget=237000000, title_year=2009, imdb_score=7.9, aspect_ratio=1.78, movie_facebook_likes=33000, imdb_id='tt0499549', id=0, name='Avatar\xa0', timestamp=2009),
 Row(color='Color', num_critic_for_reviews=302, duration=169, gross=309404152, num_voted_users=471220, cast_total_facebook_likes=48350, facenumber_in_poster=0, plot_keywords=['goddess', 'marriage ceremony', 'marriage proposal', 'pirate', 'singapore'], num_user_for_reviews=1238, language='English', country='USA', content_rating='PG-13', budget=300000000, title_year=2007, imdb_score=7.1, aspect_ratio=2.35, movie_facebook_likes=0, imdb_id='tt0449088', id=1, name="Pirates of the Caribbean: At World's End\xa0", timestamp=20

In [13]:
df_edges_acted_in = (
    df
        .select(
        F.col('id').alias('movie_id'),
        F.col('actor_1_name').alias('actor_name'),
        F.col('title_year').alias('timestamp')
    )
        .union(df.select(
        F.col('id').alias('movie_id'),
        F.col('actor_2_name').alias('actor_name'),
        F.col('title_year').alias('timestamp')
    ))
        .union(df.select(
        F.col('id').alias('movie_id'),
        F.col('actor_3_name').alias('actor_name'),
        F.col('title_year').alias('timestamp')
    ))
        .dropDuplicates(['movie_id', 'actor_name', 'timestamp'])
        .join(
        df_nodes_persons.select([F.col('id').alias('actor_id'), 'name']),
        F.col('actor_name') == F.col('name'),
        'inner'
    )
        .select(
        F.col('actor_id').alias('src'),
        F.col('movie_id').alias('dst'),
        'timestamp'
    )
        .filter(F.col('src').isNotNull())
        .filter(F.col('dst').isNotNull())
)
df_edges_acted_in.head(5)

[Row(src=1716, dst=80, timestamp=2010),
 Row(src=717, dst=244, timestamp=2016),
 Row(src=1421, dst=447, timestamp=2005),
 Row(src=1672, dst=700, timestamp=2015),
 Row(src=731, dst=708, timestamp=2011)]

In [14]:
df_edges_directed = (
    df.select(
        F.col('id').alias('movie_id'),
        F.col('director_name').alias('director_name'),
        F.col('title_year').alias('timestamp')
    )
        .dropDuplicates(['movie_id', 'director_name'])
        .join(
        df_nodes_persons.select([F.col('id').alias('person_id'), 'name']),
        F.col('director_name') == F.col('name'),
        'inner'
    )
        .select(
        F.col('person_id').alias('src'),
        F.col('movie_id').alias('dst'),
        'timestamp'
    )
        .filter(F.col('src').isNotNull())
        .filter(F.col('dst').isNotNull())
)
df_edges_directed.head(5)

[Row(src=0, dst=4, timestamp=None),
 Row(src=49, dst=3416, timestamp=2005),
 Row(src=59, dst=2896, timestamp=2002),
 Row(src=59, dst=2652, timestamp=2007),
 Row(src=80, dst=2603, timestamp=1982)]

In [15]:
df_edges_has_genre = (
    df.select(
        F.col('id').alias('movie_id'),
        F.explode('genres').alias('genre'),
        F.col('title_year').alias('timestamp')
    )
        .dropDuplicates(['movie_id', 'genre'])
        .join(
        df_nodes_genres.select([F.col('id').alias('genre_id'), 'name']),
        F.col('genre') == F.col('name'),
        'inner'
    )
        .select(
        F.col('movie_id').alias('src'),
        F.col('genre_id').alias('dst'),
        'timestamp'
    )
        .filter(F.col('src').isNotNull())
        .filter(F.col('dst').isNotNull())
)
df_edges_has_genre.head(5)

[Row(src=0, dst=0, timestamp=2009),
 Row(src=0, dst=1, timestamp=2009),
 Row(src=0, dst=2, timestamp=2009),
 Row(src=0, dst=3, timestamp=2009),
 Row(src=1, dst=0, timestamp=2007)]

## Feature Engineering

In [16]:
df_kw_doc = (
    df.select(
        F.col('id').alias('movie_id'),
        F.explode('plot_keywords').alias('keyword')
    )
        .withColumn('keyword', F.lower(F.col('keyword')))
)

In [17]:
df_kw = (
    df_kw_doc
        .groupby('keyword')
        .count()
        .sort(F.col('count').desc())
)
print(df_kw.count())
df_kw.head(5)

8086


[Row(keyword='love', count=198),
 Row(keyword='friend', count=166),
 Row(keyword='murder', count=161),
 Row(keyword='death', count=132),
 Row(keyword='police', count=126)]

In [18]:
k = 80
top_keywords = [x.keyword for x in df_kw.limit(k).select('keyword').collect()]
print(top_keywords)

['love', 'friend', 'murder', 'death', 'police', 'new york city', 'high school', 'alien', 'school', 'boy', 'fbi', 'revenge', 'friendship', 'drugs', 'prison', 'money', 'marriage', 'female protagonist', 'island', 'dog', 'party', 'escape', 'wedding', 'sex', 'serial killer', 'detective', 'box office flop', 'rescue', 'teenager', 'female nudity', 'battle', 'lawyer', 'vomiting', 'hospital', 'secret', 'christmas', 'best friend', 'scientist', 'cia', 'singer', 'train', 'college', 'hotel', 'fight', 'future', 'vampire', 'small town', 'writer', 'king', 'texas', 'terrorist', 'teacher', 'male nudity', 'girl', 'cult film', 'student', 'violence', 'spy', 'president', 'assassin', 'sequel', 'doctor', 'blood', 'actor', 'ghost', 'sheriff', 'new york', 'baby', 'desert', 'breasts', 'gangster', 'bar', 'critically bashed', 'monster', 'soldier', 'magic', 'neighbor', 'family relationships', 'one word title', 'coach']


In [19]:
df_nodes_person_kw = (
    df_edges_directed.union(df_edges_acted_in).select(
        F.col('src').alias('person_id'),
        F.col('dst').alias('dst_movie_id'),
    ).join(
        df_kw_doc.filter(df_kw_doc.keyword.isin(top_keywords)),
        F.col('dst_movie_id') == F.col('movie_id'),
        'left'
    ).groupby('person_id').agg(F.collect_set('keyword').alias('keywords'))
)
df_nodes_person_kw.head(5)

[Row(person_id=0, keywords=[]),
 Row(person_id=1, keywords=['friendship', 'vomiting', 'terrorist', 'doctor', 'dog', 'student', 'murder', 'cia', 'new york', 'sex', 'sheriff', 'new york city', 'boy', 'death', 'love', 'president', 'writer', 'christmas', 'neighbor']),
 Row(person_id=2, keywords=['family relationships', 'police', 'sex']),
 Row(person_id=3, keywords=['lawyer', 'desert', 'ghost', 'detective', 'sheriff', 'death', 'money', 'box office flop', 'baby', 'family relationships', 'texas', 'hotel', 'neighbor']),
 Row(person_id=4, keywords=['death', 'love', 'scientist'])]

In [20]:
df_nodes_person_feats = (
    df_nodes_person_kw.select(['person_id'] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k).replace(' ', '_'))
        for k in top_keywords
    ])
)
df_nodes_person_feats.head(5)

[Row(person_id=0, feat_love=False, feat_friend=False, feat_murder=False, feat_death=False, feat_police=False, feat_new_york_city=False, feat_high_school=False, feat_alien=False, feat_school=False, feat_boy=False, feat_fbi=False, feat_revenge=False, feat_friendship=False, feat_drugs=False, feat_prison=False, feat_money=False, feat_marriage=False, feat_female_protagonist=False, feat_island=False, feat_dog=False, feat_party=False, feat_escape=False, feat_wedding=False, feat_sex=False, feat_serial_killer=False, feat_detective=False, feat_box_office_flop=False, feat_rescue=False, feat_teenager=False, feat_female_nudity=False, feat_battle=False, feat_lawyer=False, feat_vomiting=False, feat_hospital=False, feat_secret=False, feat_christmas=False, feat_best_friend=False, feat_scientist=False, feat_cia=False, feat_singer=False, feat_train=False, feat_college=False, feat_hotel=False, feat_fight=False, feat_future=False, feat_vampire=False, feat_small_town=False, feat_writer=False, feat_king=Fals

In [21]:
df_nodes_persons_new = df_nodes_persons.join(
    df_nodes_person_feats,
    F.col('id') == F.col('person_id'),
    'left'
).drop('person_id').sort('id')

df_nodes_persons_new.head(5)

[Row(name='Doug Walker', facebook_likes=131, id=0, feat_love=False, feat_friend=False, feat_murder=False, feat_death=False, feat_police=False, feat_new_york_city=False, feat_high_school=False, feat_alien=False, feat_school=False, feat_boy=False, feat_fbi=False, feat_revenge=False, feat_friendship=False, feat_drugs=False, feat_prison=False, feat_money=False, feat_marriage=False, feat_female_protagonist=False, feat_island=False, feat_dog=False, feat_party=False, feat_escape=False, feat_wedding=False, feat_sex=False, feat_serial_killer=False, feat_detective=False, feat_box_office_flop=False, feat_rescue=False, feat_teenager=False, feat_female_nudity=False, feat_battle=False, feat_lawyer=False, feat_vomiting=False, feat_hospital=False, feat_secret=False, feat_christmas=False, feat_best_friend=False, feat_scientist=False, feat_cia=False, feat_singer=False, feat_train=False, feat_college=False, feat_hotel=False, feat_fight=False, feat_future=False, feat_vampire=False, feat_small_town=False, 

In [22]:
df_nodes_movies_kw = (
    df_kw_doc.groupby('movie_id').agg(F.collect_set('keyword').alias('keywords'))
)
df_nodes_movies_kw.head(5)

[Row(movie_id=0, keywords=['future', 'marine', 'native', 'paraplegic', 'avatar']),
 Row(movie_id=1, keywords=['marriage ceremony', 'marriage proposal', 'goddess', 'pirate', 'singapore']),
 Row(movie_id=2, keywords=['terrorist', 'espionage', 'spy', 'sequel', 'bomb']),
 Row(movie_id=3, keywords=['imprisonment', 'terrorist plot', 'deception', 'lawlessness', 'police officer']),
 Row(movie_id=5, keywords=['american civil war', 'mars', 'princess', 'alien', 'male nipple'])]

In [23]:
df_nodes_movies_feats = (
    df_nodes_movies_kw.select(['movie_id'] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k).replace(' ', '_'))
        for k in top_keywords
    ])
)
df_nodes_movies_feats.head(5)

[Row(movie_id=0, feat_love=False, feat_friend=False, feat_murder=False, feat_death=False, feat_police=False, feat_new_york_city=False, feat_high_school=False, feat_alien=False, feat_school=False, feat_boy=False, feat_fbi=False, feat_revenge=False, feat_friendship=False, feat_drugs=False, feat_prison=False, feat_money=False, feat_marriage=False, feat_female_protagonist=False, feat_island=False, feat_dog=False, feat_party=False, feat_escape=False, feat_wedding=False, feat_sex=False, feat_serial_killer=False, feat_detective=False, feat_box_office_flop=False, feat_rescue=False, feat_teenager=False, feat_female_nudity=False, feat_battle=False, feat_lawyer=False, feat_vomiting=False, feat_hospital=False, feat_secret=False, feat_christmas=False, feat_best_friend=False, feat_scientist=False, feat_cia=False, feat_singer=False, feat_train=False, feat_college=False, feat_hotel=False, feat_fight=False, feat_future=True, feat_vampire=False, feat_small_town=False, feat_writer=False, feat_king=False,

In [24]:
df_nodes_movies_new = df_nodes_movies.join(
    df_nodes_movies_feats,
    F.col('id') == F.col('movie_id'),
    'left'
).drop('movie_id').sort('id')

df_nodes_movies_new.head(5)

[Row(color='Color', num_critic_for_reviews=723, duration=178, gross=760505847, num_voted_users=886204, cast_total_facebook_likes=4834, facenumber_in_poster=0, plot_keywords=['avatar', 'future', 'marine', 'native', 'paraplegic'], num_user_for_reviews=3054, language='English', country='USA', content_rating='PG-13', budget=237000000, title_year=2009, imdb_score=7.9, aspect_ratio=1.78, movie_facebook_likes=33000, imdb_id='tt0499549', id=0, name='Avatar\xa0', timestamp=2009, feat_love=False, feat_friend=False, feat_murder=False, feat_death=False, feat_police=False, feat_new_york_city=False, feat_high_school=False, feat_alien=False, feat_school=False, feat_boy=False, feat_fbi=False, feat_revenge=False, feat_friendship=False, feat_drugs=False, feat_prison=False, feat_money=False, feat_marriage=False, feat_female_protagonist=False, feat_island=False, feat_dog=False, feat_party=False, feat_escape=False, feat_wedding=False, feat_sex=False, feat_serial_killer=False, feat_detective=False, feat_box

In [25]:
df_nodes_genres_kw = (
    df_edges_has_genre.select(
        F.col('src').alias('src_movie_id'),
        F.col('dst').alias('genre_id'),
    ).join(
        df_kw_doc.filter(df_kw_doc.keyword.isin(top_keywords)),
        F.col('src_movie_id') == F.col('movie_id'),
        'left'
    ).groupby('genre_id').agg(F.collect_set('keyword').alias('keywords'))
)
df_nodes_genres_kw.head(5)

[Row(genre_id=0, keywords=['friendship', 'doctor', 'girl', 'friend', 'female protagonist', 'fight', 'violence', 'female nudity', 'cia', 'critically bashed', 'revenge', 'breasts', 'sex', 'train', 'prison', 'box office flop', 'escape', 'college', 'writer', 'neighbor', 'desert', 'magic', 'male nudity', 'king', 'student', 'fbi', 'small town', 'soldier', 'murder', 'family relationships', 'teacher', 'future', 'secret', 'detective', 'new york city', 'death', 'spy', 'island', 'bar', 'battle', 'school', 'alien', 'christmas', 'hotel', 'vomiting', 'party', 'terrorist', 'ghost', 'blood', 'scientist', 'gangster', 'drugs', 'vampire', 'hospital', 'monster', 'police', 'wedding', 'singer', 'sheriff', 'cult film', 'rescue', 'president', 'serial killer', 'lawyer', 'dog', 'best friend', 'sequel', 'actor', 'teenager', 'new york', 'texas', 'assassin', 'boy', 'money', 'marriage', 'love', 'baby', 'high school', 'one word title']),
 Row(genre_id=1, keywords=['friendship', 'girl', 'doctor', 'coach', 'friend', '

In [26]:
df_nodes_genres_feats = (
    df_nodes_genres_kw.select(['genre_id'] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k).replace(' ', '_'))
        for k in top_keywords
    ])
)
df_nodes_genres_feats.head(5)

[Row(genre_id=0, feat_love=True, feat_friend=True, feat_murder=True, feat_death=True, feat_police=True, feat_new_york_city=True, feat_high_school=True, feat_alien=True, feat_school=True, feat_boy=True, feat_fbi=True, feat_revenge=True, feat_friendship=True, feat_drugs=True, feat_prison=True, feat_money=True, feat_marriage=True, feat_female_protagonist=True, feat_island=True, feat_dog=True, feat_party=True, feat_escape=True, feat_wedding=True, feat_sex=True, feat_serial_killer=True, feat_detective=True, feat_box_office_flop=True, feat_rescue=True, feat_teenager=True, feat_female_nudity=True, feat_battle=True, feat_lawyer=True, feat_vomiting=True, feat_hospital=True, feat_secret=True, feat_christmas=True, feat_best_friend=True, feat_scientist=True, feat_cia=True, feat_singer=True, feat_train=True, feat_college=True, feat_hotel=True, feat_fight=True, feat_future=True, feat_vampire=True, feat_small_town=True, feat_writer=True, feat_king=True, feat_texas=True, feat_terrorist=True, feat_teac

In [27]:
df_nodes_genres_new = df_nodes_genres.join(
    df_nodes_genres_feats,
    F.col('id') == F.col('genre_id'),
    'left'
).drop('genre_id').sort('id')

df_nodes_genres_new.head(5)

[Row(name='Action', id=0, feat_love=True, feat_friend=True, feat_murder=True, feat_death=True, feat_police=True, feat_new_york_city=True, feat_high_school=True, feat_alien=True, feat_school=True, feat_boy=True, feat_fbi=True, feat_revenge=True, feat_friendship=True, feat_drugs=True, feat_prison=True, feat_money=True, feat_marriage=True, feat_female_protagonist=True, feat_island=True, feat_dog=True, feat_party=True, feat_escape=True, feat_wedding=True, feat_sex=True, feat_serial_killer=True, feat_detective=True, feat_box_office_flop=True, feat_rescue=True, feat_teenager=True, feat_female_nudity=True, feat_battle=True, feat_lawyer=True, feat_vomiting=True, feat_hospital=True, feat_secret=True, feat_christmas=True, feat_best_friend=True, feat_scientist=True, feat_cia=True, feat_singer=True, feat_train=True, feat_college=True, feat_hotel=True, feat_fight=True, feat_future=True, feat_vampire=True, feat_small_town=True, feat_writer=True, feat_king=True, feat_texas=True, feat_terrorist=True, 

## Saving the Data

In [28]:
df_nodes_persons_new.write.parquet(DATASET.processed_str('nodes_Person'), mode='overwrite')
df_nodes_genres_new.write.parquet(DATASET.processed_str('nodes_Genre'), mode='overwrite')
df_nodes_movies_new.write.parquet(DATASET.processed_str('nodes_Movie'), mode='overwrite')

df_edges_acted_in.write.parquet(DATASET.processed_str('edges_ACTED_IN'), mode='overwrite')
df_edges_directed.write.parquet(DATASET.processed_str('edges_DIRECTED'), mode='overwrite')
df_edges_has_genre.write.parquet(DATASET.processed_str('edges_HAS_GENRE'), mode='overwrite')

                                                                                

In [29]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

(
    GraphSchema()
        .add_node_schema('Person', NodeSchema.from_spark(df_nodes_persons_new.schema, label='name'))
        .add_node_schema('Genre', NodeSchema.from_spark(df_nodes_genres_new.schema, label='name'))
        .add_node_schema('Movie', NodeSchema.from_spark(df_nodes_movies_new.schema, label='name', timestamp='timestamp', interaction=False))
        .add_edge_schema('ACTED_IN',
                         EdgeSchema.from_spark(df_edges_acted_in.schema, source_type='Person', target_type='Movie',
                                               directed=True, timestamp='timestamp', interaction=False))
        .add_edge_schema('DIRECTED',
                         EdgeSchema.from_spark(df_edges_directed.schema, source_type='Person', target_type='Movie',
                                               directed=True, timestamp='timestamp', interaction=False))
        .add_edge_schema('HAS_GENRE',
                         EdgeSchema.from_spark(df_edges_has_genre.schema, source_type='Movie', target_type='Genre',
                                               directed=True, timestamp='timestamp', interaction=False))
        .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/data/pella/projects/University/Thesis/Thesis/code/storage/datasets/processed/imdb-5000-movie-dataset'), nodes={'Person': NodeSchema(_type='Person', _schema=..., label='name', properties={'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'facebook_likes': GraphProperty(_name='facebook_likes', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False)), 'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.INT: 'int'>, array=False)), 'feat_love': GraphProperty(_name='feat_love', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_friend': GraphProperty(_name='feat_friend', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_murder': GraphProperty(_name='feat_murder', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_death': GraphProperty(_name='feat_death', dtype=DType(atomic=<DTypeAtomic.BOOL: 'boolean'>, array=False)), 'feat_police': Grap