# Part 00: Notebook overview ..

In [None]:

#  This Notebook is part of a set that demonstrates GNNs using a movie dataset.
#  About this Notebook,
#
#  .  There was a Kaggle GNN challenge circa 2019 detailed here,
#        https://www.kaggle.com/c/movie-genre-classification/data
#
#     That data is locked down, but a similarly themed dataset also on 
#     Kaggle is here,
#        https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset
#
#     The above is the data set in use here.
#
#     In this first Notebook, we munge the input data and create
#     graphs.
#
#
#  .  The full data set is 1 GB, plus or minus. Most of that volume comes
#     from reviews. The movies run 30-40 MB, and the cast and crew about 190 MB.
#
#     The data is CSV, with embedded arrays of JSON.
#     To remove dependencies on GS/S3 hosted data, this program expects this
#     data to be local to the container hosting this Jupyter Notebook.
#
#     Since we host all of our assets on GitHub, and GitHub has a 25 MB file
#     size limit, the total data set is now split across multiple files.
#
#  .  The existing schema for just Movies is listed here,
#  
#        10_movies_metadata.csv
#        -----------------------------------------
#           adult                      ..   False
#           belongs_to_collection      ..
#           budget                     ..   2700000
#           genres                     ..   "[{'id': 35, 'name': 'Comedy'}]"
#           homepage                   ..   http://www.animalhouse.com/
#           id                         ..   8469
#           imdb_id                    ..   tt0077975
#           original_language          ..   en
#           original_title             ..   Animal House
#           overview                   ..   "At a 1962 College, Dean Vernon Wormer is determined to expel
#                                            the entire Delta Tau Chi Fraternity, but those troublemakers
#                                            have other plans for him."
#           popularity                 ..   7.525382
#           poster_path                ..   /AuJkgAh7zAGsm7Oo3CGyDtYvzg0.jpg
#           production_companies       ..   "[{'name': 'Universal Pictures', 'id': 33}, {'name': 'Oregon Film Factory',
#                                               'id': 13298}, {'name': 'Stage III Productions', 'id': 13300}]"
#           production_countries       ..   "[{'iso_3166_1': 'US', 'name': 'United States of America'}]"
#           release_date               ..   1978-07-27
#           revenue                    ..   141000000
#           runtime                    ..   109.0
#           spoken_languages           ..   "[{'iso_639_1': 'en', 'name': 'English'}]"
#           status                     ..   Released
#           tagline                    ..   It was the Deltas against the rules... the rules lost!
#           title                      ..   Animal House
#           video                      ..   False
#           vote_average               ..   7.0
#           vote_count                 ..   420
#
#     From the above, we load the following into a DataFrame of Movies nodes,
#
#           id                         ..   8469
#           title                      ..   Animal House
#           genres                     ..   "[{'id': 35, 'name': 'Comedy'}]"
#           overview                   ..   "At a 1962 College, Dean Vernon Wormer is determined to expel
#                                            the entire Delta Tau Chi Fraternity, but those troublemakers
#                                            have other plans for him."
#           tagline                    ..   It was the Deltas against the rules... the rules lost!
#
#           popularity                 ..   7.525382
#           production_companies       ..   "[{'name': 'Universal Pictures', 'id': 33}, {'name': 'Oregon Film Factory',
#                                               'id': 13298}, {'name': 'Stage III Productions', 'id': 13300}]"
#           release_date               ..   1978-07-27
#           revenue                    ..   141000000
#           runtime                    ..   109.0
#           spoken_languages           ..   "[{'iso_639_1': 'en', 'name': 'English'}]"
#           vote_average               ..   7.0
#           vote_count                 ..   420
#
#      Notice the following from above,
#
#         ..  genres is an array of JSON, with each genre being uniquely identified via a numeric id.
#             We will take the first genre and put it into a property on each node titled, genres_primary.
#         ..  We will leave all remaining JSON untouched, stored as strings.


#  .  The existing schema for Keywords is listed here,
#
#        11_keywords.csv
#        -----------------------------------------
#           id                         ..   8469
#           keywords                   ..   "[{'id': 572, 'name': 'sex'}, {'id': 2483, 'name': 'nudity'},
#                                             {'id': 3616, 'name': 'college'}, {'id': 157632, 'name': 'fraternity'},
#                                             {'id': 158507, 'name': 'gross out comedy'}, {'id': 160450, 'name': 'dean'},
#                                             {'id': 171400, 'name': 'fraternity house'}, {'id': 208983, 'name': 'probation'},
#                                             {'id': 208992, 'name': '1960s'}, {'id': 209506, 'name': 'college freshman'},
#                                             {'id': 236316, 'name': 'anarchic comedy'}]"
#
#      From the above, the following is offered,
#   
#         ..  id  joins with  movie.id
#         ..  keywords.id  already enumerates keywords associated with the movies for us.
#             Super handy.


#  .  We also have data for,
# 
#        ..  12_Credits  (split into; Cast, Crew)
#        ..  14|15_Ratings
#        ..  16|17_(External) Links
#
#     And will likely add these at a later date.


#  Below we continue by loading the raw data, and performing some validations on
#  statements made, assumptions, and similar.



#  Part 01: Load just Movies into a DataFrame, perform basic/sanity-check analysis 

Enter:  

   - (Nothing)
   - (Data files local on disk)
   
Exit:

   - Boolean :: MY_DEBUG
   - DF :: df_movies
   

In [4]:

#  Notebook-wide configuration: display options, plus a flag that turns on
#  the extra (diagnostic) output printed by later cells.
#

import pandas as pd
   #
#  Widen the console display so that wide frames wrap less when printed.
pd.set_option("display.width", 480)

#  tabulate renders the DataFrame previews printed by later cells.
from tabulate import tabulate


#  MY_DEBUG gates the diagnostic prints and loops in the cells below; swap
#  the two lines to silence them.
MY_DEBUG = True
# MY_DEBUG = False
   #
print("--")


--


In [2]:

import ast
import json
   #
import dask.dataframe as dd
import numpy as np
#  import pandas as pd                         #  Already above


print("--")


--


In [3]:

#  Load DataFrame with raw input data associated with Movies
#

#  Column names of the Movies files, in file order. Declared once so that the
#  'names' and 'dtype' arguments below cannot drift apart.
#
l_MovieColumns = [
   "adult", "belongs_to_collection", "budget", "genres", "homepage", "id", "imdb_id",
   "original_language", "original_title", "overview", "popularity", "poster_path",
   "production_companies", "production_countries", "release_date", "revenue", "runtime",
   "spoken_languages", "status", "tagline", "title", "video", "vote_average", "vote_count",
   ]

#  The Movies data is split across these local files.
#
l_InputFiles  = [
   "./02_Files/40_Movies_01.txt",
   "./02_Files/41_Movies_02.txt",
]

#  Every column is imported as type string; errors are handled later.
#
df_data = dd.read_csv(
   l_InputFiles,
   delimiter  = ",",
   skiprows   = 1,                                            #  Skip the first line of each file, since it's the column headers
   names      = l_MovieColumns,
   dtype      = {l_col: np.dtype(str) for l_col in l_MovieColumns},
   )

#  The value returned by compute() is not kept, so df_data is still the frame
#  built by dd.read_csv() above.
#  NOTE(review): presumably left in to force a full read here - confirm.
#
df_data.compute()

print("--")


--


In [5]:

#  Initial look at the data, sanity check-
#

if MY_DEBUG:
   #  Row count first, then the first two rows rendered as a psql-style table.
   l_movie_count = len(df_data.index)
   print("Count of Movies: %d" % (l_movie_count))
      #
   l_preview = df_data.head(2)
   print(tabulate(l_preview, headers='keys', tablefmt='psql'))

print("--")

#  Sample output,
#
#  Count of Movies: 45466
#
#  +----+---------+----------------------------------------------------------------------------------------------
#  |    | adult   | belongs_to_collection                                                                       
#  |----+---------+--------------------------------------------------------------------------------------------
#  |  0 | False   | {'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
#     'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'} | 30000000 | [{'id': 16, 'name': 'Animation'},
#     {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]  | http://toystory.disney.com/toy-story |  862 |
#     tt0114709 | en                  | Toy Story        | Led by Woody, Andy's toys live happily in his room until
#        Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots
#        against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put
#        aside their differences.                                                                                             
#     |      21.9469 | /rhIRbceoE9lR4veEXuwCC2wARtG.jpg | [{'name': 'Pixar Animation Studios', 'id': 3}]                                                                                      | [{'iso_3166_1': 'US', 'name': 'United States of America'}] | 1995-10-30     | 373554033 |        81 | [{'iso_639_1': 'en', 'name': 'English'}]                                          | Released | nan                                       | Toy Story | False   |            7.7 |         5415 |


Count of Movies: 45466
+----+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------+----------+---------------------------------------------------------------------------------------------------+--------------------------------------+------+-----------+---------------------+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+----------------------------------+-------------------------------------------------------------------------------------------------------------------------------------+-----------------

In [6]:

#  Drop unwanted columns (keep wanted columns)
#

#  The columns carried forward onto the Movies nodes; all others are dropped.
#
l_KeepColumns = [
   "id", "title", "overview", "tagline", "budget", "genres",
   "popularity", "production_companies", "release_date", "revenue", "runtime",
   "vote_average", "vote_count",
   ]
df_movies = df_data[l_KeepColumns]

#  As with the load cell, the value returned by compute() is not kept.
#
df_movies.compute()
   #
del df_data


if MY_DEBUG:
   l_preview = df_movies.head(2)
   print(tabulate(l_preview, headers='keys', tablefmt='psql'))

print("--")


+----+------+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------+----------+---------------------------------------------------------------------------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------+----------------+-----------+-----------+----------------+--------------+
|    |   id | title     | overview                                                                                                                                                                     

#  Part 02: Check just Genres, a column in Movies

Enter:  

   - DF :: df_movies
   - Boolean :: MY_DEBUG
   
Exit:

   - DF :: df_movies
   - Boolean :: MY_DEBUG


In [7]:

#  Check column type of genres; Eg., how should we process this data
#

if (MY_DEBUG):
   #  Only the first two rows are shown, so stop as soon as they have been
   #  printed instead of walking every remaining row of the frame.
   #
   l_cntr = 0
      #
   for l_each in df_movies.itertuples():
      l_cntr += 1
      print(type(l_each.genres))
      print(     l_each.genres )
         #
      if (l_cntr >= 2):
         break

print("--")

#  Sample output,
#
#     <class 'str'>
#     [{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
#     <class 'str'>
#     [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]


<class 'str'>
[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
<class 'str'>
[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]
--


In [8]:

#  . Save the original 'genres' string as JSON, 'genres_json'.
#  . Save the first 'genres.name' as 'genres_primary'.
#  . Save the first 'genres.id'   as 'genres_primary_id'.
#
#  .  Why 'primary' ?
#        Many movies are listed as having many genres. Since we act
#        to demonstrate node prediction, we want fewer/easier node
#        (types).
#
#        Given more data, it might be better to just combine these
#        multiple genres listing into one (hybrid) genre per movie.


#  Effectively, here, we are checking if the string is valid JSON.
#  The proper quotes get munged each time we save, and you will see
#  we effectively run this munge/cast each time when decoding this
#  string.
#
def f_genres_json(i_arg1):
   """
   Parse one raw 'genres' cell into a Python object (normally a list of
   {'id': .., 'name': ..} dictionaries).

   The cell text is written with Python-style quoting, so it is first read
   with ast.literal_eval(); that keeps values which contain an apostrophe
   and are therefore wrapped in double quotes. A blind swap of ' for "
   turns such a cell into invalid JSON and loses the whole row.

   When literal_eval() fails, or yields something other than a list, the
   original approach is used: swap the quotes and json.loads() the result.

   i_arg1  : the raw cell; anything str() accepts (text, NaN, ..).
   Returns : the parsed value, or [{"id": 99999999, "name": "Unknown"}]
             when neither approach can parse the cell. Never raises for
             ordinary bad data.
   """
   try:
      l_str1 = str(i_arg1)                                                  #  Needed this, was getting odd  json.loads()  errors otherwise
      try:
         l_parsed = ast.literal_eval(l_str1)
      except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
         l_parsed = None
      if not isinstance(l_parsed, list):
         #  Legacy path; also covers text that is JSON but not a Python literal
         l_parsed = json.loads(l_str1.replace("'", "\""))
      l_return = l_parsed
   except Exception:
      #  Best effort by design: an unparseable cell becomes the "Unknown" genre
      l_return = json.loads('[{"id": 99999999, "name": "Unknown"}]')
   return l_return
      #
#  f_genres_json takes exactly one argument, so it is handed to map() as is.
df_movies["genres_json"] = df_movies.genres.map(f_genres_json)


#  This block can get deleted-
#     Effectively this block is replaced by a better block below.
#     Really just keeping this block for teaching.
#
#  def f_genres_primary(i_arg1):
#     try:
#        l_return = i_arg1[0]["name"]
#     except:
#        l_return = "Unknown"
#     return l_return
#        #
#  def f_genres_primary_id(i_arg1):
#     try:
#        l_return = i_arg1[0]["id"]
#     except:
#        l_return = -1
#     return l_return
#     #
#  df_movies["genres_primary"   ] = df_movies.genres_json.map(lambda x: f_genres_primary   (x) )
#  df_movies["genres_primary_id"] = df_movies.genres_json.map(lambda x: f_genres_primary_id(x) )


def f_primary(i_arg1, i_col):
   """
   Return field i_col of the first entry of i_arg1.

   i_arg1  : a parsed genres value, expected to be a list of dictionaries.
   i_col   : the dictionary key to read, e.g. "id" or "name".
   Returns : i_arg1[0][i_col]; when that lookup is not possible (empty list,
             missing key, or a value that cannot be indexed such as NaN or
             None) the fallback is -1 for "id" and "Unknown" for any other
             key.
   """
   try:
      return i_arg1[0][i_col]
   except (IndexError, KeyError, TypeError):
      #  Only lookup failures are treated as "no primary genre"
      return -1 if (i_col == "id") else "Unknown"
      #
#  Derive both 'primary' columns from the first entry of genres_json. The key
#  is written as a literal inside each lambda, so neither map depends on a
#  variable that could change before the map is evaluated.
df_movies["genres_primary"   ] = df_movies.genres_json.map(lambda x: f_primary(x, "name") )
df_movies["genres_primary_id"] = df_movies.genres_json.map(lambda x: f_primary(x, "id"  ) )


if (MY_DEBUG):

   #  Show the first two rows only, and stop iterating once they have been
   #  printed rather than walking every remaining row of the frame.
   #
   l_cntr = 0
      #
   for l_each in df_movies.itertuples():
      l_cntr += 1
         #
      print(type(l_each))
      print(     l_each )
      print("")
         #
      if (l_cntr >= 2):
         break
        
#  Sample output,
#
#     <class 'pandas.core.frame.Pandas'>
#     Pandas(Index=0, id='862', title='Toy Story', overview="Led by Woody, Andy's toys live happily in his room until Andy's birthday
#        brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when
#        circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
#     tagline=nan, budget='30000000', genres="[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751,
#        'name': 'Family'}]", popularity='21.946943', production_companies="[{'name': 'Pixar Animation Studios', 'id': 3}]",
#     release_date='1995-10-30', revenue='373554033', runtime='81.0', vote_average='7.7', vote_count='5415', 
#     genres_json=[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}],
#     genres_primary='Animation', genres_primary_id=16)

print("--")


<class 'pandas.core.frame.Pandas'>
Pandas(Index=0, id='862', title='Toy Story', overview="Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.", tagline=nan, budget='30000000', genres="[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]", popularity='21.946943', production_companies="[{'name': 'Pixar Animation Studios', 'id': 3}]", release_date='1995-10-30', revenue='373554033', runtime='81.0', vote_average='7.7', vote_count='5415', genres_json=[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}], genres_primary='Animation', genres_primary_id=16)

<class 'pandas.core.frame.Pandas'>
Pandas(Index=1, id='8844', title='Jumanji', overview="When siblings Jud

In [9]:

#  Analysis on just genres- How many unique values do we have ?
#
#     .  Copy just  Movies.genres_json  into a new DataFrame. 
#     .  Extract all  'Movies.genres_json.name'  from the JSON string into list.
#     .  Pivot this list of genres names into separate rows.
#     .  Count the unique genres names.


#  Single-column frame holding just the parsed genres; it is deleted again at
#  the end of this cell.
df_genres = df_movies[["genres_json"]]


#  Convert the genres_json array of dictionaries into an array of just genres.names
#
def f_genres_arr(i_arg1):
   """
   Reduce a parsed genres value to the list of its genre names.

   i_arg1  : expected to be a list of dictionaries that each carry a "name".
   Returns : the "name" of every entry, in order ([] for an empty list).
             When i_arg1 cannot be iterated, or any entry cannot be read by
             key or lacks "name", the result is [ "Unknown" ] for the whole
             value.
   """
   try:
      return [l_each["name"] for l_each in i_arg1]
   except (TypeError, KeyError):
      #  TypeError: not iterable / entry is not a mapping.  KeyError: no "name".
      return [ "Unknown" ]
      #
df_genres["genres_names"] = df_genres.genres_json.map(f_genres_arr, meta=("genres_json", "object"))


#  One row per (movie, genre name); then count the rows per genre name and
#  sort the counts, largest first.
#
df_genres_rows = df_genres.explode("genres_names")
   #
l_all_counts  = df_genres_rows.groupby("genres_names")["genres_names"].count().compute()
df_genres_all = l_all_counts.reset_index(name="count")
df_genres_all = df_genres_all.sort_values(by="count", ascending=False)


      ########################################


#  Output for review; the four largest entries, then the number of distinct names
#
if MY_DEBUG:
   print("All   Genre entries for all movies ..")
   print("-------------------------------")
      #
   for l_each in df_genres_all.head(4).itertuples():
      print("Genre name: %-48s  %d" % (l_each.genres_names, l_each.count))

   print("Total: %d" % (len(df_genres_all.index)))
   print("")


      ########################################


#  See how the above differs from df_movies.genres_primary
#
l_first_counts  = df_movies.groupby("genres_primary")["genres_primary"].count().compute()
df_genres_first = l_first_counts.reset_index(name="count")
df_genres_first = df_genres_first.sort_values(by="count", ascending=False)


#  Output for review
#
if MY_DEBUG:
   print("First Genre entry   for all movies ..")
   print("-------------------------------")
      #
   for l_each in df_genres_first.head(4).itertuples():
      print("Genre name: %-48s  %d" % (l_each.genres_primary, l_each.count))

   print("Total: %d" % (len(df_genres_first.index)))
   print("")


      ########################################


#  Everything built for this analysis is scratch; release it.
#
del df_genres
del df_genres_rows
del df_genres_all
del df_genres_first
del l_all_counts
del l_first_counts
   #
print("--")


#  Sample output,
#
#     Genre name: Drama                                             20265
#     Genre name: Comedy                                            13182
#     Genre name: Thriller                                          7624
#     Genre name: Romance                                           6735
#     Genre name: Action                                            6596
#     Genre name: Horror                                            4673
#     Genre name: Crime                                             4307
#     Genre name: Documentary                                       3932
#     Genre name: Adventure                                         3496
#     Genre name: Science Fiction                                   3049
#     Genre name: Family                                            2770
#     Genre name: Mystery                                           2467
#     Genre name: Fantasy                                           2313
#     Genre name: Animation                                         1935
#     Genre name: Foreign                                           1622
#     Genre name: Music                                             1598
#     Genre name: History                                           1398
#     Genre name: War                                               1323
#     Genre name: Western                                           1042
#     Genre name: TV Movie                                          767
#     Genre name: Odyssey Media                                     1
#     Genre name: Pulser Productions                                1
#     Genre name: Rogue State                                       1
#     Genre name: Vision View Entertainment                         1
#     Genre name: Mardock Scramble Production Committee             1
#     Genre name: Telescene Film Group Productions                  1
#     Genre name: Sentai Filmworks                                  1
#     Genre name: GoHands                                           1
#     Genre name: Carousel Productions                              1
#     Genre name: BROSTA TV                                         1
#     Genre name: Aniplex                                           1
#     Genre name: The Cartel                                        1
#     Total: 32
#     
#     Genre name: Drama                                             11966
#     Genre name: Comedy                                            8820
#     Genre name: Action                                            4489
#     Genre name: Documentary                                       3415
#     Genre name: Horror                                            2619
#     Genre name: Unknown                                           2442
#     Genre name: Crime                                             1685
#     Genre name: Thriller                                          1665
#     Genre name: Adventure                                         1514
#     Genre name: Romance                                           1191
#     Genre name: Animation                                         1124
#     Genre name: Fantasy                                           704
#     Genre name: Science Fiction                                   647
#     Genre name: Mystery                                           554
#     Genre name: Family                                            524
#     Genre name: Music                                             487
#     Genre name: Western                                           451
#     Genre name: TV Movie                                          390
#     Genre name: War                                               379
#     Genre name: History                                           279
#     Genre name: Foreign                                           118
#     Genre name: Carousel Productions                              1
#     Genre name: Aniplex                                           1
#     Genre name: Odyssey Media                                     1
#     Total: 24


All   Genre entries for all movies ..
-------------------------------
Genre name: Drama                                             20265
Genre name: Comedy                                            13182
Genre name: Thriller                                          7624
Genre name: Romance                                           6735
Total: 32

First Genre entry   for all movies ..
-------------------------------
Genre name: Drama                                             11966
Genre name: Comedy                                            8820
Genre name: Action                                            4489
Genre name: Documentary                                       3415
Total: 24

--


# Part 03: Further checks, corrections

Enter:  

   - DF :: df_movies
   - Boolean :: MY_DEBUG
   
Exit:

   - DF :: df_movies
   - Boolean :: MY_DEBUG
   

In [10]:

#  Is movies.id  present, numeric ?
#
#     We are going to use this as our key, and should check it first.

def f_count_bad_ids(i_df):
   """
   Count the rows of i_df, and how many of them carry an id that int()
   cannot parse. One line is printed for every rejected id.

   i_df    : a frame offering itertuples(), with a column named 'id'.
   Returns : the tuple (total rows, rows with a rejected id).
   """
   l_total = 0
   l_bad   = 0
      #
   for l_row in i_df.itertuples():
      l_total += 1
      try:
         int(l_row.id)
      except (TypeError, ValueError, OverflowError):
         #  Only conversion failures count as a bad id
         print("Movie with bad id value: %s" % (l_row.id) )
         l_bad += 1
   return l_total, l_bad


#  Report before filtering. l_cntr and l_fail stay notebook-level names, as
#  they were when this check was written out inline.
#
if (MY_DEBUG):
   l_cntr, l_fail = f_count_bad_ids(df_movies)

   print("")
   print("Number of total Movies: %d  Number with numeric id: %d   Number with a non-numeric id: %d" % (l_cntr, (l_cntr - l_fail), l_fail ))
   print("")


#  Filter out those 'bad' movie id values
#
df_movies = df_movies[df_movies.id.str.isnumeric()]

#  Add a 'LABEL' column, used by our graph
#
df_movies = df_movies.assign(LABEL=lambda x: "Movie")


#  Report again, after filtering
#
if (MY_DEBUG):
   l_cntr, l_fail = f_count_bad_ids(df_movies)

   print("Number of total Movies: %d  Number with numeric id: %d   Number with a non-numeric id: %d" % (l_cntr, (l_cntr - l_fail), l_fail ))


print("--")
    
    
#  Sample output,
#
#     Movie with bad id value: 1997-08-20
#     Movie with bad id value: 2012-09-29
#     Movie with bad id value: 2014-01-01
#
#     Number of total Movies: 45466  Number with numeric id: 45463   Number with a non-numeric id: 3
#     
#     Number of total Movies: 45463  Number with numeric id: 45463   Number with a non-numeric id: 0


Movie with bad id value: 1997-08-20
Movie with bad id value: 2012-09-29
Movie with bad id value: 2014-01-01

Number of total Movies: 45466  Number with numeric id: 45463   Number with a non-numeric id: 3

Number of total Movies: 45463  Number with numeric id: 45463   Number with a non-numeric id: 0
--


In [11]:

#  Later we will (classify, node label predict) on; title, tagline, overview
#  As such, we want to be certain these are present (effectively, not null)
#

def f_checker1(i_df=None):
   """
   Print how many rows hold a non-string title, tagline, or overview.

   A value counts as 'bad' when it is not an instance of str (for example
   the float NaN that stands in for a missing value).

   i_df    : the frame to check; it must offer itertuples() and the columns
             title, tagline and overview. When omitted, the notebook-level
             df_movies is used, looked up at call time.
   Returns : None; the three counts are printed on a single line.
   """
   if i_df is None:
      i_df = df_movies
      #
   l_columns = ("title", "tagline", "overview")
   l_bad     = dict.fromkeys(l_columns, 0)
      #
   for l_each in i_df.itertuples():
      for l_col in l_columns:
         if not (isinstance(getattr(l_each, l_col), str)):
            l_bad[l_col] += 1
      #
   print("Num of bad titles: %-6d   Bad taglines: %-6d   Bad overviews: %-6d" % (l_bad["title"], l_bad["tagline"], l_bad["overview"]))


#  Counts before the clean-up
f_checker1()

#  Replace missing text with the literal "Unknown", and make sure each of the
#  three columns is of type string.
#
l_text_columns = ("title", "tagline", "overview")
   #
df_movies = df_movies.assign(**{
   l_col: df_movies[l_col].fillna("Unknown").astype(str)
   for l_col in l_text_columns
   })

#  Counts after the clean-up
f_checker1()

print("--")

#  Sample output,
#
#     Num of bad titles: 3        Bad taglines: 25051    Bad overviews: 954   
#     Num of bad titles: 0        Bad taglines: 0        Bad overviews: 0 


Num of bad titles: 3        Bad taglines: 25051    Bad overviews: 954   
Num of bad titles: 0        Bad taglines: 0        Bad overviews: 0     
--


#  Part 00: Checkpoint our current state

Enter:  

   - DF :: df_movies
   - Boolean :: MY_DEBUG
   
Exit:

   - DF :: df_movies
   - Boolean :: MY_DEBUG
   

In [None]:

#  We have a DataFrame titled,  df_movies  with the following features,
#
#     id                         ..   8469
#     title                      ..   Animal House
#     overview                   ..   "At a 1962 College, Dean Vernon Wormer is determined to expel
#                                      the entire Delta Tau Chi Fraternity, but those troublemakers
#                                      have other plans for him."
#     tagline                    ..   It was the Deltas against the rules... the rules lost!
#     budget                     ..   2700000
#     genres                     ..   "[{'id': 35, 'name': 'Comedy'}]"
#     popularity                 ..   7.525382
#     production_companies       ..   "[{'name': 'Universal Pictures', 'id': 33}, {'name': 'Oregon Film Factory',
#                                         'id': 13298}, {'name': 'Stage III Productions', 'id': 13300}]"
#     release_date               ..   1978-07-27
#     revenue                    ..   141000000
#     runtime                    ..   109.0
#     vote_average               ..   7.0
#     vote_count                 ..   420
#
#     genres_json                ..   (same as above, cast as JSON/dictionary)
#     genres_primary             ..   Just the first genres.name, a string
#     genres_primary_id          ..   Just the first genres.id, an integer
    
    
#  Currently, one use case for GNN requires a bipartite graph. We have additional data
#  sets for,
#
#        11_keywords.csv
#        -----------------------------------------
#           id                         ..   8469
#           keywords                   ..   "[{'id': 572, 'name': 'sex'}, {'id': 2483, 'name': 'nudity'},
#                                             {'id': 3616, 'name': 'college'}, {'id': 157632, 'name': 'fraternity'},
#                                             {'id': 158507, 'name': 'gross out comedy'}, {'id': 160450, 'name': 'dean'},
#                                             {'id': 171400, 'name': 'fraternity house'}, {'id': 208983, 'name': 'probation'},
#                                             {'id': 208992, 'name': '1960s'}, {'id': 209506, 'name': 'college freshman'},
#                                             {'id': 236316, 'name': 'anarchic comedy'}]"
#
#      From the above, the following is offered,
#   
#         ..  id  joins with movie.id
#         ..  keywords.id  already enumerates keywords associated with the movies for us.
#             Super handy.
#
#  .  We also have data for,
# 
#        ..  12_Credits  (split into; Cast, Crew)
#        ..  14|15_Ratings
#        ..  16|17_(External) Links


#  From here, we proceed with just  keywords
#


# Part 04: Work on Keywords, which also gives us the Edges

Enter:  

   - DF :: df_movies
   - Boolean :: MY_DEBUG
   
Exit:

   - DF :: df_movies
   - DF :: df_keywords
   - DF :: df_edges
   - Boolean :: MY_DEBUG
   

In [12]:

#  Load DataFrame with raw input data. This time we are looking at keywords.
#

#  Column names for the Keywords file, in file order.
#  In the source CSV, the second column was titled 'values', a bad idea
#
l_KeywordColumns = ["id", "keywords"]

l_InputFiles  = [
   "./02_Files/50_Keywords_00.txt",
]

#  As with Movies, every column is read as type string.
#
df_data = dd.read_csv(
   l_InputFiles,
   delimiter  = ",",
   skiprows   = 1,                                            #  Skip the first line of each file, since it's the column headers
   names      = l_KeywordColumns,
   dtype      = {l_col: np.dtype(str) for l_col in l_KeywordColumns},
   )

#  The value returned by compute() is not kept; df_data is unchanged by it.
#
df_data.compute()

print("--")


--


In [13]:

#  Initial look at the data, sanity check-
#

MY_DEBUG = True


if(MY_DEBUG):
   print("Number of keywords: %d" % len(df_data.index))
      #
   print(tabulate(df_data.head(2), headers='keys', tablefmt='psql'))


   #  Count every row, and separately those whose id int() cannot parse.
   #
   #  The ids are loaded as strings, so testing them with isinstance(.., int)
   #  would flag every single row; parsing is the meaningful check. The row
   #  counter is also initialized and advanced here, so the totals printed
   #  below describe this frame and not a value left behind by another cell.
   #
   l_cntr = 0
   l_fail = 0
      #
   for l_each in df_data.itertuples():
      l_cntr += 1
      try:
         int(l_each.id)
      except (TypeError, ValueError, OverflowError):
         print("Keyword with bad id value: %s" % (l_each.id) )
         l_fail+= 1

   print("")
      #
   print("Number of total Keywords: %d  Number with numeric id: %d   Number with a non-numeric id: %d" % (l_cntr, (l_cntr - l_fail), l_fail ))


print("--")


#  Sample output,
#
#     46419
#     +----+------+------------------------------------------------------------------------------------------------------------------------------------
#     |    |   id | keywords                                                                                                                                                                                                                                                                                                                   |
#     |----+------+--------------------------------------------------------------------------------------------------------------------------------
#     |  0 |  862 | [{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'},  ...
#     |  1 | 8844 | [{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'name': "based on children's book"},  ...
#     +----+------+------------------------------------------------------------------------------------------------------------------------------------
#
#     Number of total Keywords: 46419  Number with numeric id: 46419   Number with a non-numeric id: 0


Number of keywords: 46419
+----+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    |   id | keywords                                                                                                                                                                                                                                                                                                                   |
|----+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:

#  In  df_data  (our Keywords), is a column titled, 'keywords'.
#  This is an array of dictionaries that we will use for many purposes.
#
#  As such, here, cast and copy it out to a new column titled,
#  'keywords_json'.
#

def f_keywords_json(i_arg1):
   """Parse one raw 'keywords' cell into a list of keyword dictionaries.

   The cell is expected to look like "[{'id': 931, 'name': 'jealousy'}]".
   It is cast to a string, single quotes are swapped for double quotes,
   and the result is parsed as JSON.

   Known limitation: a cell with an apostrophe inside a value (for example
   "based on children's book") is no longer valid JSON after the quote
   swap, so the whole cell falls back to the placeholder described below.

   Args:
      i_arg1: Raw cell value; anything str() accepts.

   Returns:
      The parsed JSON value, or [{"id": 99999999, "name": "Unknown"}]
      (a fresh list on every call) when the cell cannot be parsed.
   """
   try:
      #  The str() cast is needed; json.loads() raised odd errors without it
      l_return = json.loads(str(i_arg1).replace("'", "\""))
   except Exception:
      #  Best effort: any ordinary failure maps to the placeholder keyword.
      #  Catching Exception (not a bare except) lets KeyboardInterrupt and
      #  SystemExit propagate, so a long-running map can still be stopped.
      l_return = [{"id": 99999999, "name": "Unknown"}]
   return l_return

#  Parse every raw 'keywords' cell and store the result next to it
df_data["keywords_json"] = df_data["keywords"].map(f_keywords_json)

print("--")


In [None]:

#  Build a new DataFrame with the unique Keywords. 
#
#  We'll use this to count, but also this can be our list of nodes for the graph
#  with Label, Keywords.


#  Here is our pivot, and drop duplicates
#
#  Keep only the parsed keyword lists, then pivot so that every keyword
#  dictionary sits on its own row
df_data2 = df_data[["keywords_json"]]
df_data3 = df_data2.explode("keywords_json")

#  De-duplicate on the string form of each keyword dictionary
df_data3["keywords_str"] = df_data3["keywords_json"].map(str)
df_data4 = df_data3[["keywords_str"]].drop_duplicates()


print("Number of input records: %-8d   When pivoted out: %-8d   When de-duplicated: %-8d" % ( len(df_data2.index), len(df_data3.index), len(df_data4.index) ))


if (MY_DEBUG):
   print("")
   #  Preview the first two rows only, and stop iterating as soon as they
   #  are printed rather than walking the entire DataFrame
   for l_cntr, l_each in enumerate(df_data4.itertuples(), start=1):
      if (l_cntr >= 3):
         break
      print(l_each)
            
    
def f_get(i_arg1, i_col):
   """Pull one field out of a stringified keyword dictionary.

   The input is expected to look like "{'id': 931, 'name': 'jealousy'}".
   Single quotes are swapped for double quotes and the result is parsed
   as JSON, so a value containing an apostrophe will not parse and yields
   the fallback.

   Args:
      i_arg1: String form of one keyword dictionary.
      i_col:  Key to read from the parsed dictionary, e.g. "id" or "name".

   Returns:
      The value stored under i_col. When the input cannot be parsed or
      the key is missing: -1 if i_col is "id", otherwise "Unknown".
   """
   try:
      l_return = json.loads(i_arg1.replace("'", "\""))[i_col]
   except Exception:
      #  Best effort for bad rows. Catching Exception (not a bare except)
      #  lets KeyboardInterrupt and SystemExit propagate.
      l_return = -1 if (i_col == "id") else "Unknown"
   return l_return


#  These columns are currently embedded in a dictionary. Pull them up a
#  level; ease of use later
#
df_data4["id"  ] = df_data4["keywords_str"].map(lambda x: f_get(x, "id"  ))
df_data4["name"] = df_data4["keywords_str"].map(lambda x: f_get(x, "name"))

#  Nodes for the graph: one row per unique keyword, with a constant LABEL
df_keywords = df_data4[["id", "name"]].assign(LABEL=lambda x: "Keyword")


if (MY_DEBUG):
   print("")
   #  Preview the first two rows only, and stop iterating as soon as they
   #  are printed rather than walking the entire DataFrame
   for l_cntr, l_each in enumerate(df_keywords.itertuples(), start=1):
      if (l_cntr >= 3):
         break
      print(l_each)


#  The intermediate frames are no longer needed
del df_data2, df_data3, df_data4

print("--")

#  Sample output,
#
#     Number of input records: 46419      When pivoted out: 168018     When de-duplicated: 19424   
#     
#     Pandas(Index=0, keywords_str="{'id': 931, 'name': 'jealousy'}")
#     Pandas(Index=0, keywords_str="{'id': 4290, 'name': 'toy'}")
#     
#     Pandas(Index=0, id=931, name='jealousy')
#     Pandas(Index=0, id=4290, name='toy')


In [None]:

#  We need a different pivot to build the edge between Movies and Keywords.
#
#     (Basically, we use the outer key/id, which points to movies. The id
#      we were processing above was the key/id for keyword.)
#

#  Keep the movie id next to each parsed keyword list, then pivot so that
#  every (movie, keyword dictionary) pair sits on its own row
df_data2 = df_data[["id", "keywords_json"]]
df_data3 = df_data2.explode("keywords_json")


if (MY_DEBUG):
   #  Preview the first two rows only, and stop iterating as soon as they
   #  are printed rather than walking the entire DataFrame
   for l_cntr, l_each in enumerate(df_data3.itertuples(), start=1):
      if (l_cntr >= 3):
         break
      print(l_each)
    

def f_keyword_id(i_arg1):
   """Return the "id" of one keyword dictionary, or -1 when unavailable.

   The value is cast to a string, single quotes are swapped for double
   quotes, and the result is parsed as JSON. A value containing an
   apostrophe therefore will not parse and yields -1.

   Args:
      i_arg1: One keyword entry; anything str() accepts.

   Returns:
      The parsed entry's "id" value, or -1 when the entry cannot be parsed
      or has no "id" key.
   """
   try:
      l_return = json.loads(str(i_arg1).replace("'", "\""))["id"]
   except Exception:
      #  Best effort for bad rows. Catching Exception (not a bare except)
      #  lets KeyboardInterrupt and SystemExit propagate.
      l_return = -1

   return l_return


#  The outer 'id' column is the movie key; copy it directly instead of
#  pushing every row through an identity function
df_data3["movie_id"  ] = df_data3["id"]
df_data3["keyword_id"] = df_data3["keywords_json"].map(f_keyword_id)

df_data4 = df_data3[["movie_id", "keyword_id"]]

#  One edge per distinct (movie, keyword) pair, with a constant TYPE
df_edges = df_data4.drop_duplicates().assign(TYPE=lambda x: "RELATES_TO")


print("")
print("Number of input records: %-8d   When pivoted out: %-8d   When de-duplicated: %-8d" % ( len(df_data2.index), len(df_data3.index), len(df_edges.index) ))
print("")


if (MY_DEBUG):
   #  Preview the first two rows only, and stop iterating as soon as they
   #  are printed rather than walking the entire DataFrame
   for l_cntr, l_each in enumerate(df_edges.itertuples(), start=1):
      if (l_cntr >= 3):
         break
      print(l_each)

#  The intermediate frames are no longer needed
del df_data2, df_data3, df_data4

print("--")


# Sample output,
#
#     Pandas(Index=0, id='862', keywords_json={'id': 931, 'name': 'jealousy'})
#     Pandas(Index=0, id='862', keywords_json={'id': 4290, 'name': 'toy'})
#     
#     Number of input records: 46419      When pivoted out: 168018     When de-duplicated: 165494  
#     
#     Pandas(Index=0, movie_id='862', keyword_id=931, TYPE='RELATES_TO')
#     Pandas(Index=0, movie_id='862', keyword_id=4290, TYPE='RELATES_TO')


# Part 00: Checkpoint our current state

Enter:  

   - DF :: df_movies
   - Boolean :: MY_DEBUG
   
Exit:

   - DF :: df_movies
   - DF :: df_keywords
   - DF :: df_edges
   - Boolean :: MY_DEBUG
   

In [None]:

#  df_movies  is our Movie DataFrame
#
#  (Re)stamp the constant edge TYPE before reporting
df_edges = df_edges.assign(TYPE=lambda x: "RELATES_TO")

#  Report the three DataFrames in order: Movies, Keywords, Edges. Each one
#  gets a row count and, when debugging, a two-row preview table.
for l_fmt, l_frame in (
   ("Number of Movies: %d",   df_movies  ),
   ("Number of Keywords: %d", df_keywords),
   ("Number of Edges: %d",    df_edges   ),
):
   print(l_fmt % (len(l_frame.index) ))
   if (MY_DEBUG):
      print(tabulate(l_frame.head(2), headers='keys', tablefmt='psql'))
      print("")

print("")
print("--")

#  Sample output
#
#     Number of Movies: 45463
#     +----+------+-----------+-------------------------    
#     |    |   id | title     | overview                       ...                                           | genres_primary   |   genres_primary_id | LABEL   |
#     |----+------+-----------+--------------------            ...  
#     |  0 |  862 | Toy Story | Led by Woody, Andy's toys      ...            d': 10751, 'name': 'Family'}]  | Animation        |                  16 | Movie   |
#     |  1 | 8844 | Jumanji   | When siblings Judy and Pe      ...    sy'}, {'id': 10751, 'name': 'Family'}] | Adventure        |                  12 | Movie   |
#     +----+------+-----------+------------------
#     
#     Number of Keywords: 19424
#     +----+---------------------------------+---------+
#     |    | keywords_str                    | LABEL   |
#     |----+---------------------------------+---------|
#     |  0 | {'id': 931, 'name': 'jealousy'} | Keyword |
#     |  0 | {'id': 4290, 'name': 'toy'}     | Keyword |
#     +----+---------------------------------+---------+
#
#     Number of Edges: 165494
#     +----+------------+--------------+------------+
#     |    |   movie_id |   keyword_id | TYPE       |
#     |----+------------+--------------+------------|
#     |  0 |        862 |          931 | RELATES_TO |
#     |  0 |        862 |         4290 | RELATES_TO |
#     +----+------------+--------------+------------+


#  Part 05: Setting up (n) graphs, plus some useful checking

Enter:  

   - DF :: df_movies
   - DF :: df_keywords
   - DF :: df_edges
   - Boolean :: MY_DEBUG
   
Exit:

   - **my_graph1**            A similarity graph, used for our first GNN routine
   - **my_graph2**            A homogeneous, bi-partite graph, used for our second GNN routine
   -    ...
   - DF :: df_movies
   - DF :: df_keywords
   - DF :: df_edges
   - Boolean :: MY_DEBUG


In [None]:

#  The KatanaGraph remote API is expected to run from a node external to
#  the Katana Graph cluster itself.
#
#  This differs from the distributed API, which is meant to run primitives
#  on the Katana Graph worker nodes.
#

from katana import remote
from katana.remote import import_data

#  Create the remote client with its default arguments; every database
#  and graph call below goes through this handle. Printing it shows the
#  client's text representation.
my_client = remote.Client()

print(my_client)


In [None]:

#  Passed as num_partitions= when each graph is created
NUM_PARTITIONS  = 3
   #
#  Name of the database that is created to hold the graphs
DB_NAME         = "my_db"
#  Names of the three graphs created inside DB_NAME
GRAPH_NAME1     = "my_graph1"                         #  Much later we may use 3 graphs. We could
GRAPH_NAME2     = "my_graph2"                         #  also just use 'projections'.
GRAPH_NAME3     = "my_graph3"

print("--")


In [None]:
#  This section; basic graph and database setup, reset for test

In [None]:
#  DELETE ALL GRAPHS

#  Walk every database the client reports, and every graph inside it
for l_database in my_client.databases():
   for l_graph in my_client.get_database(name=l_database.name).graphs_in_database():
      #  Look the graph up again by id to get a handle, then delete it
      l_handle=my_client.get_database(name=l_database.name).get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

#  List whatever graphs the client still reports; presumably nothing is
#  printed after a clean delete (TODO confirm)
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")

In [None]:
#  DELETE ALL DATABASES

#  Delete every database except the one named "default"; a "--" line is
#  printed once per database deleted
for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name=l_database.name).delete_database()
      print("--")

#  List the databases that remain
for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:
#  CREATE DATABASE

#  Create the working database named DB_NAME and print its id
my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [None]:
#  CREATE GRAPHS

#  Create three graphs inside DB_NAME, each with NUM_PARTITIONS partitions
my_graph1=my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME1, num_partitions=NUM_PARTITIONS)
my_graph2=my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME2, num_partitions=NUM_PARTITIONS)
my_graph3=my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME3, num_partitions=NUM_PARTITIONS)

print(my_graph1)
print(my_graph2)
print(my_graph3)

#  my_graph starts out pointing at the first graph
my_graph = my_graph1


#  Part 06: Actually import into the graphs

Enter:  
    
   - **my_graph1**            A similarity graph, used for our first GNN routine
   - **my_graph2**            A homogeneous, bi-partite graph, used for our second GNN routine
   -    ...
   - DF :: df_movies
   - DF :: df_keywords
   - DF :: df_edges
   - Boolean :: MY_DEBUG
   
Exit:

   - **my_graph1**            A similarity graph, used for our first GNN routine
   - **my_graph2**            A homogeneous, bi-partite graph, used for our second GNN routine
   -    ...
   - DF :: df_movies
   - DF :: df_keywords
   - DF :: df_edges
   - Boolean :: MY_DEBUG


In [None]:

#  Need to drop movies.genres_json because of this issue,
#
#     ValueError: Failed to convert partition to expected pyarrow schema:
#         `ArrowTypeError("Expected bytes, got a 'list' object", 'Conversion failed for column genres_json with type object')`
#     
#     Expected partition schema:
#         id: string
#         title: string
#              ...
#         vote_count: string
#         genres_json: string                                              <-------
#         genres_primary: string
#         genres_primary_id: string
#         LABEL: string
#     
#     Received partition schema:
#         id: string
#         title: string
#              ...
#         vote_count: string
#         genres_json: list<item: struct<id: int64, name: string>>         <-------
#           child 0, item: struct<id: int64, name: string>
#               child 0, id: int64
#               child 1, name: string
#         genres_primary: string
#         genres_primary_id: int64
#         LABEL: string


#  drop() returns a new DataFrame rather than modifying df_movies in
#  place, so the result has to be assigned back for the column to go away.
#  errors="ignore" keeps this cell re-runnable once the column is gone.
df_movies = df_movies.drop(columns=["genres_json"], errors="ignore")

print("--")
    

In [None]:
# ##################################################################
#
#  Finally now, load the vertices/nodes into a graph
#
#  Some hinkiness we need to work around ..
#
#     .  The Dask DataFrames here were loaded from CSV, and those CSV
#        files were found, in scope.
#        The KG DataFrame importer will reference that same file
#        pathname, and the file will not be in scope. Basically,
#        it was expected that these files be on S3/GS all along.
#        I hate to have that dependency because, just one more thing
#        to have to manage.

#  Materialize each DataFrame through .compute(), in the order Movies,
#  Keywords, Edges; the importer below reads these pd_* results
pd_movies, pd_keywords, pd_edges = [
   l_frame.compute() for l_frame in (df_movies, df_keywords, df_edges)
]

print("--")


In [None]:

#  Import into graph1, our full graph
#

#  Everything is registered on one importer bound to my_graph1, and
#  execute() is called once after all registrations
with import_data.DataFrameImporter(my_graph1) as df_importer:
    
   #  Movies
   #
   #  Only the listed columns are sent; "id" is the node key within the
   #  "Movies" id space
   df_importer.nodes_dataframe(
      pd_movies[[
         "id", "title", "overview", "tagline", "budget", "genres", "popularity", "production_companies",
         "release_date", "revenue", "runtime", "vote_average", "vote_count", "genres_primary",
         "genres_primary_id", "LABEL"
      ]],
      id_column  = "id",
      id_space   = "Movies"
      )

   #  Keywords
   #
   #  "id" is the node key within the "Keywords" id space
   df_importer.nodes_dataframe(
      pd_keywords[["id", "name", "LABEL"]],
      id_column  = "id",
      id_space   = "Keywords"
      )  
    
   #  Edge, RELATES_TO
   #
   #  The same edge rows are registered twice: first Movies -> Keywords,
   #  then Keywords -> Movies with the source and destination swapped
   df_importer.edges_dataframe(
      pd_edges[["movie_id", "keyword_id", "TYPE"]],
      source_id_space      = "Movies",
      destination_id_space = "Keywords",
      source_column        = "movie_id",
      destination_column   = "keyword_id",
      type                 = "RELATES_TO"
      )
   df_importer.edges_dataframe(
      pd_edges[["movie_id", "keyword_id", "TYPE"]],
      source_id_space      = "Keywords",
      destination_id_space = "Movies",
      source_column        = "keyword_id",
      destination_column   = "movie_id",
      type                 = "RELATES_TO"
      )

   df_importer.execute()


print("--")
    

In [None]:

#  UI for choosing which of 3 graphs to point to
#
#     (Much like HTML, this visual control has an ID. So, you can only have
#      one of these. IF you need two, you can, but you have more variable
#      work to do.)

import ipywidgets as widgets

#  Support function for a radio button UI used below
#

#  Radio button captions, in display order. f_set_graph() compares the
#  selected caption against l_arr[1] and l_arr[2], so the order here
#  decides which graph name each caption maps to.
l_arr = [
   "Graph 01 - Full graph",
   "Graph 02 - Similarity graph, just nodes of one type",
   "Graph 03 - Bi-paritite graph",
]


def f_set_graph():
   """Return the graph handle matching the current radio button selection.

   Reads l_whichgraph.value and maps it to a graph name: l_arr[1] selects
   GRAPH_NAME2, l_arr[2] selects GRAPH_NAME3, and anything else selects
   GRAPH_NAME1. The graph is then looked up by name inside DB_NAME.

   Returns:
      The first graph returned by the name lookup, or None when the
      lookup fails or returns no graphs.
   """
   if   (l_whichgraph.value == l_arr[1]):
      l_str = GRAPH_NAME2
   elif (l_whichgraph.value == l_arr[2]):
      l_str = GRAPH_NAME3
   else:
      l_str = GRAPH_NAME1

   try:
      #  Keep the first match only; an empty result fails the unpacking
      #  and is reported as None
      l_my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(l_str)
      return l_my_graph
   except Exception:
      #  Best effort. Catching Exception (not a bare except) lets
      #  KeyboardInterrupt and SystemExit propagate.
      return None


#  Radio button control used by f_set_graph() to pick a graph. The initial
#  selection is set through 'value'; RadioButtons has no 'default' trait.
l_whichgraph = widgets.RadioButtons(
   options = [
      l_arr[0],
      l_arr[1],
      l_arr[2],
   ],
   value       = l_arr[0],
   description = "Use this: ",
   disabled    = False,
   layout      = {'width': 'max-content'}
)

l_whichgraph


In [None]:

#  Between the DataFrame and the graph we lose 30 nodes.
#     (We should investigate that at some point. It's a little
#      crazy, because the edges are okay.)

#  Expected counts from the DataFrames. Edges are doubled because every
#  edge row is imported in both directions.
l_df_nodes = len(df_movies.index) + len(df_keywords.index)
display("Number of DataFrame Nodes: %d" % l_df_nodes)
l_df_edges = len(df_edges.index ) * 2
display("Number of DataFrame Edges: %d" % l_df_edges)

print("")

#  Actual counts from whichever graph the radio button currently selects
my_graph = f_set_graph()
print(my_graph)

display("Number of Graph Nodes: %d" % my_graph.num_nodes())
display("Number of Graph Edges: %s" % my_graph.num_edges())

print("--")

#  Sample output,
#
#     'Number of DataFrame Nodes: 64887'
#     'Number of DataFrame Edges: 330988'
#     'Number of Graph Nodes: 64857'
#     'Number of Graph Edges: 330988'


In [None]:

#  Point my_graph at whichever graph the radio button currently selects
my_graph = f_set_graph()

#  Fetch up to 1000 (node)-[edge]->(node) triples of any kind
l_result = my_graph.query("""

   MATCH (n) -[r]-> (m)
   RETURN n, r, m
   LIMIT 1000                        //  Limit is 25,000 for visualization, smaller is better
   
   """,
   contextualize=True)

#  Render the query result
l_result.view()



<div> 
<img src="./01_Images/10-Movie-Query-1.png" alt="Drawing" style="width: 1600px;"/>
</div>


In [None]:

#  Uses the my_graph handle left by an earlier cell. Fetches the movie
#  with id "8469" together with every outgoing edge and its target node.
l_result = my_graph.query("""

   MATCH (n: Movies) -[r]-> (m)
   WHERE n.id = "8469"                    //  Animal House
   RETURN n, r, m
   
   """,
   contextualize=True)

#  Render the query result
l_result.view()



<div> 
<img src="./01_Images/11-Movie-Query-2.png" alt="Drawing" style="width: 1600px;"/>
</div>
