#  Notebook overview ..

In [None]:

#  This Notebook is part of a set that demonstrates GNNs using a movie dataset.
#  About this Notebook,
#
#  .  There was a Kaggle GNN challenge circa 2019 detailed here,
#        https://www.kaggle.com/c/movie-genre-classification/data
#
#     That data is locked down, but a similarly themed dataset also on 
#     Kaggle is here,
#        https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset
#
#     The above is the data set in use here.
#
#  .  1 GB, plus or minus. Most of that volume comes from reviews. The
#     movies run 30-40 MB, and the cast and crew about 190 MB.
#
#     The data is CSV, with embedded arrays of JSON.
#     To remove dependencies on GS/S3 hosted data, this program expects this
#     data to be local to the container hosting this Jupyter Notebook.
#
#     Since we host on GitHub, and GitHub has a 25 MB file size limit,
#     the total data set is now split across multiple files.
#
#  .  The existing schema for just Movies is listed here,
#  
#        10_movies_metadata.csv
#        -----------------------------------------
#           adult                      ..   False
#           belongs_to_collection      ..
#           budget                     ..   2700000
#           genres                     ..   "[{'id': 35, 'name': 'Comedy'}]"
#           homepage                   ..   http://www.animalhouse.com/
#           id                         ..   8469
#           imdb_id                    ..   tt0077975
#           original_language          ..   en
#           original_title             ..   Animal House
#           overview                   ..   "At a 1962 College, Dean Vernon Wormer is determined to expel
#                                            the entire Delta Tau Chi Fraternity, but those troublemakers
#                                            have other plans for him."
#           popularity                 ..   7.525382
#           poster_path                ..   /AuJkgAh7zAGsm7Oo3CGyDtYvzg0.jpg
#           production_companies       ..   "[{'name': 'Universal Pictures', 'id': 33}, {'name': 'Oregon Film Factory',
#                                               'id': 13298}, {'name': 'Stage III Productions', 'id': 13300}]"
#           production_countries       ..   "[{'iso_3166_1': 'US', 'name': 'United States of America'}]"
#           release_date               ..   1978-07-27
#           revenue                    ..   141000000
#           runtime                    ..   109.0
#           spoken_languages           ..   "[{'iso_639_1': 'en', 'name': 'English'}]"
#           status                     ..   Released
#           tagline                    ..   It was the Deltas against the rules... the rules lost!
#           title                      ..   Animal House
#           video                      ..   False
#           vote_average               ..   7.0
#           vote_count                 ..   420
#
#     From the above, we load the following into a DataFrame of Movies nodes,
#
#           id                         ..   8469
#           title                      ..   Animal House
#           genres                     ..   "[{'id': 35, 'name': 'Comedy'}]"
#           overview                   ..   "At a 1962 College, Dean Vernon Wormer is determined to expel
#                                            the entire Delta Tau Chi Fraternity, but those troublemakers
#                                            have other plans for him."
#           tagline                    ..   It was the Deltas against the rules... the rules lost!
#
#           popularity                 ..   7.525382
#           production_companies       ..   "[{'name': 'Universal Pictures', 'id': 33}, {'name': 'Oregon Film Factory',
#                                               'id': 13298}, {'name': 'Stage III Productions', 'id': 13300}]"
#           release_date               ..   1978-07-27
#           revenue                    ..   141000000
#           runtime                    ..   109.0
#           spoken_languages           ..   "[{'iso_639_1': 'en', 'name': 'English'}]"
#           vote_average               ..   7.0
#           vote_count                 ..   420
#
#      Notice the following from above,
#
#         ..  genres is an array of JSON, with each genre uniquely identified by a numeric id.
#             We will take the first genre and put it into a property on each node titled, genres_primary.
#         ..  We will leave all remaining JSON untouched, stored as strings.


#  .  The existing schema for Keywords is listed here,
#
#        11_keywords.csv
#        -----------------------------------------
#           id                         ..   8469
#           keywords                   ..   "[{'id': 572, 'name': 'sex'}, {'id': 2483, 'name': 'nudity'},
#                                             {'id': 3616, 'name': 'college'}, {'id': 157632, 'name': 'fraternity'},
#                                             {'id': 158507, 'name': 'gross out comedy'}, {'id': 160450, 'name': 'dean'},
#                                             {'id': 171400, 'name': 'fraternity house'}, {'id': 208983, 'name': 'probation'},
#                                             {'id': 208992, 'name': '1960s'}, {'id': 209506, 'name': 'college freshman'},
#                                             {'id': 236316, 'name': 'anarchic comedy'}]"
#
#      From the above, the following is offered,
#   
#         ..  id  joins with movie.id
#         ..  keywords.id  already enumerates keywords associated with the movies for us.
#             Super handy.


#  .  We also have data for,
# 
#        ..  12_Credits  (split into; Cast, Crew)
#        ..  14|15_Ratings
#        ..  16|17_(External) Links
#
#     And will likely add these at a later date.


#  Below we continue by loading the raw data, and performing some validations on
#  statements made, assumptions, and similar.



#  Load just Movies into a DataFrame, perform basic, sanity check analysis ..

In [None]:

import ast
import json
   #
import dask.dataframe as dd
import numpy as np

from tabulate import tabulate

print("--")


In [None]:

#  Load DataFrame with raw input data

#  The Movies data is split across two files
#
l_InputFiles  = [
   "./02_Files/40_Movies_01.txt",
   "./02_Files/41_Movies_02.txt",
]

#  Column names, in the order they appear in the files. Declared once, and
#  used both to name the columns and to build the dtype mapping below.
#
l_columns = [
   "adult", "belongs_to_collection", "budget", "genres", "homepage", "id", "imdb_id",
   "original_language", "original_title", "overview", "popularity", "poster_path",
   "production_companies", "production_countries", "release_date", "revenue", "runtime",
   "spoken_languages", "status", "tagline", "title", "video", "vote_average", "vote_count",
   ]

df_data = dd.read_csv(
   l_InputFiles,
   delimiter  = ",",
   skiprows   = 1,                                            #  Skip the first line of each file, since it's the column headers
   dtype      = {l_name: np.dtype(str) for l_name in l_columns},   #  Every column is loaded as a string
   names      = l_columns,
   )

#  The result of this compute() is not kept; df_data stays a Dask DataFrame
#
df_data.compute()

print("--")


In [None]:

#  Initial look at the data, sanity check-
#

#  print(len(df_data.index))
#     #
#  print(tabulate(df_data.head(2), headers='keys', tablefmt='psql'))
#  
#  print("--")

#  Sample output,
#
#  45466
#
#  +----+---------+----------------------------------------------------------------------------------------------
#  |    | adult   | belongs_to_collection                                                                       
#  |----+---------+--------------------------------------------------------------------------------------------
#  |  0 | False   | {'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
#     'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'} | 30000000 | [{'id': 16, 'name': 'Animation'},
#     {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]  | http://toystory.disney.com/toy-story |  862 |
#     tt0114709 | en                  | Toy Story        | Led by Woody, Andy's toys live happily in his room until
#        Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots
#        against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put
#        aside their differences.                                                                                             
#     |      21.9469 | /rhIRbceoE9lR4veEXuwCC2wARtG.jpg | [{'name': 'Pixar Animation Studios', 'id': 3}]                                                                                      | [{'iso_3166_1': 'US', 'name': 'United States of America'}] | 1995-10-30     | 373554033 |        81 | [{'iso_639_1': 'en', 'name': 'English'}]                                          | Released | nan                                       | Toy Story | False   |            7.7 |         5415 |


In [None]:

#  Drop unwanted columns
#
#  The columns we keep; everything else from the raw load is dropped
#
l_wanted = [
   "id", "title", "overview", "tagline", "budget", "genres",
   "popularity", "production_companies", "release_date", "revenue", "runtime",
   "vote_average", "vote_count",
   ]

df_data2 = df_data[l_wanted]

#  The result of this compute() is not kept; df_data2 stays a Dask DataFrame
#
df_data2.compute()

#  print(tabulate(df_data2.head(2), headers='keys', tablefmt='psql'))

print("--")


#  Check just genres

In [None]:

#  Check column type of genres
#

#  l_cntr = 0
#     #
#  for l_each in df_data2.itertuples():
#     l_cntr += 1
#     if (l_cntr < 3):
#        print(type(l_each.genres))
#        print(     l_each.genres )

#  Sample output,
#
#     <class 'str'>
#     [{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
#     <class 'str'>
#     [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]


In [None]:

#  . Keep the original 'genres' string as is, in 'genres'.
#  . Save the parsed 'genres' string as a list of dictionaries, 'genres_json'.
#  . Save the first 'genres.name' as 'genres_primary'.
#  . Save the first 'genres.id' as 'genres_primary_id'.

def f_genres_json(i_arg1):
   """
   Parse a raw 'genres' cell into a list of {'id': .., 'name': ..} dictionaries.

   The cell holds the text of a Python literal (single quoted keys and values),
   not strict JSON, so it is parsed with ast.literal_eval(). That also copes
   with values that themselves contain an apostrophe or a double quote, which
   the older approach (swap every ' for " and call json.loads()) turned into
   invalid JSON. The older approach is still tried second, so that input which
   really is JSON (null / true / false) keeps loading.

   Args:
      i_arg1: The raw cell value. It is passed through str() first, so a
         non-string (a NaN float, for example) is accepted.

   Returns:
      The parsed list. When the text cannot be parsed, or parses to something
      other than a list, the placeholder  [{"id": 99999999, "name": "Unknown"}]
      is returned instead.
   """
   l_text = str(i_arg1)
      #
   try:
      l_parsed = ast.literal_eval(l_text)
   except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
      try:
         l_parsed = json.loads(l_text.replace("'", "\""))
      except ValueError:                                                    #  json.JSONDecodeError is a ValueError
         l_parsed = None
      #
   if not isinstance(l_parsed, list):
      return [{"id": 99999999, "name": "Unknown"}]                          #  A new list on every call
   return l_parsed

def f_genres_primary(i_arg1):
   """
   Return the 'name' of the first entry of a parsed genres list.

   Args:
      i_arg1: A list of dictionaries, as produced by parsing the genres column.

   Returns:
      The first entry's 'name', or "Unknown" when the list is empty, the entry
      has no 'name' key, or the argument cannot be indexed this way at all.
   """
   try:
      return i_arg1[0]["name"]
   except (IndexError, KeyError, TypeError):                                #  Only the lookup failures; other exceptions propagate
      return "Unknown"

def f_genres_primary_id(i_arg1):
   """
   Return the 'id' of the first entry of a parsed genres list.

   Args:
      i_arg1: A list of dictionaries, as produced by parsing the genres column.

   Returns:
      The first entry's 'id', or -1 when the list is empty, the entry has no
      'id' key, or the argument cannot be indexed this way at all.
   """
   try:
      return i_arg1[0]["id"]
   except (IndexError, KeyError, TypeError):                                #  Only the lookup failures; other exceptions propagate
      return -1

#    The reason for the if is to prevent error upon multiple
#    executions of this code.
#
if ("genres_json" not in df_data2):
   #  meta= states the name and dtype of each result up front, the same way
   #  the genres_names column is built further down, so Dask does not have to
   #  infer them by calling the function on placeholder values.
   #
   df_data2["genres_json"      ] = df_data2.genres.map     (f_genres_json,       meta=("genres_json",       "object"))
      #
   df_data2["genres_primary"   ] = df_data2.genres_json.map(f_genres_primary,    meta=("genres_primary",    "object"))
   df_data2["genres_primary_id"] = df_data2.genres_json.map(f_genres_primary_id, meta=("genres_primary_id", "int64" ))
   print("--")
    
print("--")


#  l_cntr = 0
#     #
#  for l_each in df_data2.itertuples():
#     l_cntr += 1
#        #
#     if (l_cntr < 3):
#        print(type(l_each))
#        print(     l_each )
#        print("")
        
#  Sample output,
#
#     <class 'pandas.core.frame.Pandas'>
#     Pandas(Index=0, id='862', title='Toy Story', overview="Led by Woody, Andy's toys live happily in his room until Andy's birthday
#        brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when
#        circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
#     tagline=nan, budget='30000000', genres="[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751,
#        'name': 'Family'}]", popularity='21.946943', production_companies="[{'name': 'Pixar Animation Studios', 'id': 3}]",
#     release_date='1995-10-30', revenue='373554033', runtime='81.0', vote_average='7.7', vote_count='5415', 
#     genres_json=[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}],
#     genres_primary='Animation', genres_primary_id=16)


In [None]:

#  Analysis on just genres- How many unique values do we have ?
#
#     .  Copy just genres_json into a new DataFrame. 
#     .  Extract all 'genres_json.name' from the JSON string into list.
#     .  Pivot this list of genres names into separate rows.
#     .  Count the unique genres names.


#  The double brackets select a one-column DataFrame (not a Series), so that
#  further columns can be added to it below.
df_genres = df_data2[["genres_json"]]


#  Convert the genres_json array of dictionaries into an array of just genres.names
#
def f_genres_arr(i_arg1):
   """
   Reduce a parsed genres list to the list of its 'name' values.

   Args:
      i_arg1: A list of dictionaries, each expected to carry a 'name' key.

   Returns:
      The names, in the order given. If the argument cannot be iterated, or
      any entry has no 'name', the whole result is  ["Unknown"]  (names
      collected before the failing entry are not kept).
   """
   try:
      return [l_each["name"] for l_each in i_arg1]
   except (TypeError, KeyError):                                            #  Only the lookup failures; other exceptions propagate
      return [ "Unknown" ]

#  One list of genre names per movie
#
df_genres["genres_names"] = df_genres.genres_json.map(
   lambda x: f_genres_arr(x),
   meta = ("genres_json", "object"),
   )


#  One row per (movie, genre name), then count the rows per name and list
#  the most frequent name first. Everything after compute() is plain Pandas.
#
df_genres2 = df_genres.explode("genres_names")
   #
df_genres3 = (
   df_genres2
   .groupby("genres_names")["genres_names"]
   .count()
   .compute()
   .reset_index(name="count")
   .sort_values(by="count", ascending=False)
   )
    

#  Output for review
#
#  Print at most the first 49 rows, then the number of distinct names
#
l_cntr = 0
   #
for l_cntr, l_each in enumerate(df_genres3.itertuples(), start=1):
   if (l_cntr >= 50):
      continue
      #
   print("Genre name: %-48s  %d" % (l_each.genres_names, l_each.count))
        
print("Total: %d" % (len(df_genres3.index)))
print("")


#  See how the above differs from df_data2.genres_primary
#
#  Same count as above, but on the single primary genre of each movie
#
df_genres4 = (
   df_data2
   .groupby("genres_primary")["genres_primary"]
   .count()
   .compute()
   .reset_index(name="count")
   .sort_values(by="count", ascending=False)
   )
    

#  Output for review; at most the first 49 rows, then the number of distinct names
#
l_cntr = 0
   #
for l_cntr, l_each in enumerate(df_genres4.itertuples(), start=1):
   if (l_cntr >= 50):
      continue
      #
   print("Genre name: %-48s  %d" % (l_each.genres_primary, l_each.count))
 
print("Total: %d" % (len(df_genres4.index)))
print("")


#  Sample output,
#
#     Genre name: Drama                                             20265
#     Genre name: Comedy                                            13182
#     Genre name: Thriller                                          7624
#     Genre name: Romance                                           6735
#     Genre name: Action                                            6596
#     Genre name: Horror                                            4673
#     Genre name: Crime                                             4307
#     Genre name: Documentary                                       3932
#     Genre name: Adventure                                         3496
#     Genre name: Science Fiction                                   3049
#     Genre name: Family                                            2770
#     Genre name: Mystery                                           2467
#     Genre name: Fantasy                                           2313
#     Genre name: Animation                                         1935
#     Genre name: Foreign                                           1622
#     Genre name: Music                                             1598
#     Genre name: History                                           1398
#     Genre name: War                                               1323
#     Genre name: Western                                           1042
#     Genre name: TV Movie                                          767
#     Genre name: Odyssey Media                                     1
#     Genre name: Pulser Productions                                1
#     Genre name: Rogue State                                       1
#     Genre name: Vision View Entertainment                         1
#     Genre name: Mardock Scramble Production Committee             1
#     Genre name: Telescene Film Group Productions                  1
#     Genre name: Sentai Filmworks                                  1
#     Genre name: GoHands                                           1
#     Genre name: Carousel Productions                              1
#     Genre name: BROSTA TV                                         1
#     Genre name: Aniplex                                           1
#     Genre name: The Cartel                                        1
#     Total: 32
#     
#     Genre name: Drama                                             11966
#     Genre name: Comedy                                            8820
#     Genre name: Action                                            4489
#     Genre name: Documentary                                       3415
#     Genre name: Horror                                            2619
#     Genre name: Unknown                                           2442
#     Genre name: Crime                                             1685
#     Genre name: Thriller                                          1665
#     Genre name: Adventure                                         1514
#     Genre name: Romance                                           1191
#     Genre name: Animation                                         1124
#     Genre name: Fantasy                                           704
#     Genre name: Science Fiction                                   647
#     Genre name: Mystery                                           554
#     Genre name: Family                                            524
#     Genre name: Music                                             487
#     Genre name: Western                                           451
#     Genre name: TV Movie                                          390
#     Genre name: War                                               379
#     Genre name: History                                           279
#     Genre name: Foreign                                           118
#     Genre name: Carousel Productions                              1
#     Genre name: Aniplex                                           1
#     Genre name: Odyssey Media                                     1
#     Total: 24


# Further checks, corrections

In [None]:

#  Is movies.id  present, numeric ?

#  l_cntr = 0
#  l_fail = 0
#     #
#  l_set  = set()
#     #
#  for l_each in df_data2.itertuples():
#     l_cntr += 1
#     try:
#        l_xxx   = int(l_each.id)
#     except:
#        print("Movie with bad id value: %s" % (l_each.id) )
#        l_set.add(l_each.id)
#        l_fail+= 1

#  print("Number of total Movies: %d  Number with numeric id: %d   Number with a non-numeric id: %d" % (l_cntr, (l_cntr - l_fail), l_fail ))
#  print("")


#  Filter out those 'bad' movie id values
#
#  Keep only the rows whose id string passes str.isnumeric(). The ids were
#  loaded as strings, so this is a test on the text, not a conversion.
#  NOTE(review): presumably a missing id would put a missing value into this
#  mask rather than False; confirm that cannot happen with this data.
df_movies = df_data2[df_data2.id.str.isnumeric()]


#  l_cntr = 0
#  l_fail = 0
#     #
#  for l_each in df_movies.itertuples():
#     l_cntr += 1
#     try:
#        l_xxx   = int(l_each.id)
#     except:
#        print("Movie with bad id value: %s" % (l_each.id) )
#        l_fail+= 1

#  print("Number of total Movies: %d  Number with numeric id: %d   Number with a non-numeric id: %d" % (l_cntr, (l_cntr - l_fail), l_fail ))
    
    
#  Sample output,
#
#     Movie with bad id value: 1997-08-20
#     Movie with bad id value: 2012-09-29
#     Movie with bad id value: 2014-01-01
#     Number of total Movies: 45466  Number with numeric id: 45463   Number with a non-numeric id: 3
#     
#     Number of total Movies: 45463  Number with numeric id: 45463   Number with a non-numeric id: 0


In [None]:

#  Delete old/past DataFrames, release memory

#  The raw load and its column subset
#
del df_data, df_data2

#  The frames used only for the genres analysis
#
del df_genres, df_genres2, df_genres3, df_genres4

print("--")


#  Checkpoint: our current state

In [None]:

#  We have a DataFrame titled,  df_movies  with the following features,
#
#     id                         ..   8469
#     title                      ..   Animal House
#     overview                   ..   "At a 1962 College, Dean Vernon Wormer is determined to expel
#                                      the entire Delta Tau Chi Fraternity, but those troublemakers
#                                      have other plans for him."
#     tagline                    ..   It was the Deltas against the rules... the rules lost!
#     budget                     ..   2700000
#     genres                     ..   "[{'id': 35, 'name': 'Comedy'}]"
#     popularity                 ..   7.525382
#     production_companies       ..   "[{'name': 'Universal Pictures', 'id': 33}, {'name': 'Oregon Film Factory',
#                                         'id': 13298}, {'name': 'Stage III Productions', 'id': 13300}]"
#     release_date               ..   1978-07-27
#     revenue                    ..   141000000
#     runtime                    ..   109.0
#     vote_average               ..   7.0
#     vote_count                 ..   420
#
#     genres_json                ..   (same as above, cast as JSON/dictionary)
#     genres_primary             ..   Just the first genres.name, a string
#     genres_primary_id          ..   Just the first genres.id, an integer
    
    
#  Currently, our GNN requires a bipartite graph. We have additional data
#  sets for,
#
#        11_keywords.csv
#        -----------------------------------------
#           id                         ..   8469
#           keywords                   ..   "[{'id': 572, 'name': 'sex'}, {'id': 2483, 'name': 'nudity'},
#                                             {'id': 3616, 'name': 'college'}, {'id': 157632, 'name': 'fraternity'},
#                                             {'id': 158507, 'name': 'gross out comedy'}, {'id': 160450, 'name': 'dean'},
#                                             {'id': 171400, 'name': 'fraternity house'}, {'id': 208983, 'name': 'probation'},
#                                             {'id': 208992, 'name': '1960s'}, {'id': 209506, 'name': 'college freshman'},
#                                             {'id': 236316, 'name': 'anarchic comedy'}]"
#
#      From the above, the following is offered,
#   
#         ..  id  joins with movie.id
#         ..  keywords.id  already enumerates keywords associated with the movies for us.
#             Super handy.
#
#  .  We also have data for,
# 
#        ..  12_Credits  (split into; Cast, Crew)
#        ..  14|15_Ratings
#        ..  16|17_(External) Links


#  From here, we proceed with just  keywords
#


# Work on Keywords

In [None]:

#  Load DataFrame with raw input data

l_InputFiles  = [
   "./02_Files/50_Keywords_00.txt",
]

#  Column names, in file order. In the source CSV the second column was
#  titled 'values', a bad idea; it is loaded as 'keywords' instead.
#
l_columns = ["id", "keywords"]

df_data = dd.read_csv(
   l_InputFiles,
   delimiter  = ",",
   skiprows   = 1,                                            #  Skip the first line of each file, since it's the column headers
   dtype      = {l_name: np.dtype(str) for l_name in l_columns},   #  Both columns are loaded as strings
   names      = l_columns,
   )

#  The result of this compute() is not kept; df_data stays a Dask DataFrame
#
df_data.compute()

print("--")


In [None]:

#  Initial look at the data, sanity check-
#

#  print(len(df_data.index))
#     #
#  print(tabulate(df_data.head(2), headers='keys', tablefmt='psql'))


#  l_cntr = 0
#  l_fail = 0
#     #
#  for l_each in df_data.itertuples():
#     l_cntr += 1
#     try:
#        l_xxx   = int(l_each.id)
#     except:
#        print("Keyword with bad id value: %s" % (l_each.id) )
#        l_fail+= 1

#  print("")
#     #
#  print("Number of total Keywords: %d  Number with numeric id: %d   Number with a non-numeric id: %d" % (l_cntr, (l_cntr - l_fail), l_fail ))

#  print("--")


#  Sample output,
#
#     46419
#     +----+------+------------------------------------------------------------------------------------------------------------------------------------
#     |    |   id | keywords                                                                                                                                                                                                                                                                                                                   |
#     |----+------+--------------------------------------------------------------------------------------------------------------------------------
#     |  0 |  862 | [{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'},  ...
#     |  1 | 8844 | [{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'name': "based on children's book"},  ...
#     +----+------+------------------------------------------------------------------------------------------------------------------------------------
#
#     Number of total Keywords: 46419  Number with numeric id: 46419   Number with a non-numeric id: 0


In [None]:

#  In  df_data  (our Keywords), is a column titled, 'keywords' (titled 'values'
#  in the source CSV).
#  This is an array of dictionaries that we will use for many purposes.
#  As such, here, cast and copy it out to a new column titled,
#  'keywords_json'.
#

def f_keywords_json(i_arg1):
   """
   Parse a raw 'keywords' cell into a list of {'id': .., 'name': ..} dictionaries.

   The cell holds the text of a Python literal, not strict JSON, and a name
   containing an apostrophe is written with double quotes, for example
   {'id': 15101, 'name': "based on children's book"}. Swapping every ' for "
   turns such a cell into invalid JSON and loses all of its keywords, so the
   text is parsed with ast.literal_eval() instead. The older swap-and-
   json.loads() approach is still tried second, so that input which really is
   JSON (null / true / false) keeps loading.

   Args:
      i_arg1: The raw cell value. It is passed through str() first, so a
         non-string (a NaN float, for example) is accepted.

   Returns:
      The parsed list. When the text cannot be parsed, or parses to something
      other than a list, the placeholder  [{"id": 99999999, "name": "Unknown"}]
      is returned instead.
   """
   l_text = str(i_arg1)
      #
   try:
      l_parsed = ast.literal_eval(l_text)
   except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
      try:
         l_parsed = json.loads(l_text.replace("'", "\""))
      except ValueError:                                                    #  json.JSONDecodeError is a ValueError
         l_parsed = None
      #
   if not isinstance(l_parsed, list):
      return [{"id": 99999999, "name": "Unknown"}]                          #  A new list on every call
   return l_parsed

#    The reason for the if is to prevent error upon multiple
#    executions of this code.
#
if ("keywords_json" not in df_data):
   #  meta= states the name and dtype of the result up front, so Dask does
   #  not have to infer them by calling the function on placeholder values.
   #
   df_data["keywords_json"] = df_data.keywords.map(f_keywords_json, meta=("keywords_json", "object"))
   print("--")
    
print("--")


In [None]:

#  Build a new DataFrame with the unique Keywords. 
#
#  We'll use this to count, but this can also be our list of nodes for the graph
#  of Label, Keywords.

#  Pivot the keywords_json lists into one row per (movie, keyword). The column
#  name is passed to explode() as a string.
#
df_keywords  = df_data.explode("keywords_json")


#  After the pivot each cell is expected to be a single {'id': .., 'name': ..}
#  dictionary. Anything else (for example the missing value an empty list
#  explodes to) gets the same "Unknown" placeholder that is used when a cell
#  cannot be parsed.
#
def f_keyword_field(i_arg1, i_field, i_default):
   if isinstance(i_arg1, dict):
      return i_arg1.get(i_field, i_default)
   return i_default

df_keywords["keyword_id"  ] = df_keywords.keywords_json.map(lambda x: f_keyword_field(x, "id",   99999999 ), meta=("keyword_id",   "int64" ))
df_keywords["keyword_name"] = df_keywords.keywords_json.map(lambda x: f_keyword_field(x, "name", "Unknown"), meta=("keyword_name", "object"))


#  Dictionaries are not hashable, so the duplicates are dropped on the two
#  scalar columns instead of on keywords_json itself.
#
df_keywords2 = df_keywords[["keyword_id", "keyword_name"]].drop_duplicates().compute()


print("Number of input rows: %d" % (len(df_data.index)))
print("Number of unique keywords: %d" % (len(df_keywords2.index)))




#  def f_genres_arr(i_arg1):
#     l_arr  = []
#        #
#     try:
#        for l_each in i_arg1:
#           l_id   = l_each["name"]
#           l_name = l_each["name"]
#           l_arr += [l_name]
#        l_return = l_arr
#     except:
#        l_return = [ "Unknown" ]
#     return l_return
#  
#  df_genres["genres_names"] = df_genres.genres_json.map(lambda x: f_genres_arr(x), meta=("genres_json", "object"))
#  
#  
#  #  Count the above with a group by, and sort
#  #
#  df_genres2 = df_genres.explode("genres_names")
#     #
#  df_genres3 = df_genres2.groupby("genres_names")["genres_names"].count().compute().reset_index(name="count").sort_values(by="count", ascending=False)
#      
#  
#  #  Output for review
#  #
#  l_cntr = 0
#     #
#  for l_each in df_genres3.itertuples():
#     l_cntr += 1
#        #
#     if (l_cntr < 50):
#        print("Genre name: %-48s  %d" % (l_each.genres_names, l_each.count))
#          
#  print("Total: %d" % (len(df_genres3.index)))
#  print("")





#  Building the DataFrames that will go into the graph ..

#  Our standard graph, Nodes/Edges ..

In [None]:

#  Our standard graph, Nodes

#  Build one Pandas DataFrame per node type, printing the row count of each.
#  Genres, Countries and Persons are de-duplicated; Movies is not.
#
#  NOTE(review): none of the columns selected here ("release_year", "wiki_url",
#  "plot", "genre", "country", "director") are among the names given to either
#  read_csv() call that creates df_data in this file, so this cell looks like
#  it was written against a different data set. It also rebinds df_movies,
#  which was built above by the numeric id filter. Confirm before running.
df_movies    = df_data[["release_year", "title", "wiki_url", "plot"]].compute()
print("Movies...... " + str(len(df_movies.index)))
   #
df_genres    = df_data[["genre"  ]].drop_duplicates().compute()
print("Genres...... " + str(len(df_genres.index)))
df_countries = df_data[["country"]].drop_duplicates().compute()
print("Countries... " + str(len(df_countries.index)))
   #
df_persons   = df_data[["director"]].drop_duplicates().compute()
print("Persons..... " + str(len(df_persons.index)))




#  Our standard graph, Edges


print("--")

#  Sample output
#


In [None]:

#  Genres looks odd at 2000+ unique entries. Look deeper at that-
#
#  We'll write df_genres to a file local to the Jupyter Docker container, so we can view it in an editor

#  Write df_genres out with "|" as the field separator, for viewing in an editor
df_genres.to_csv("02_Files/40_genres.txt", index=None, sep="|")

print("--")

#  Sample output
#
#  genre
#  unknown
#  western
#  comedy
#  short
#  short action/crime western
#  short film
#  biographical
#  drama
#  adventure


In [None]:

from katana import remote
from katana.remote import import_data

#  The client object; every database and graph call below goes through it
my_client = remote.Client()

print(my_client)


In [None]:

NUM_PARTITIONS  = 3                                           #  Passed as num_partitions when the graph is created
   #
DB_NAME         = "my_db"                                     #  Name of the database that is created, and later looked up
GRAPH_NAME      = "my_graph"                                  #  Name of the graph that is created, and later looked up

print("--")

In [None]:
# ##################################################################
#
#  This section; basic graph and database setup, reset for test


In [None]:
#  DELETE ALL GRAPHS

#  Walk every database, and delete every graph found in it
#
for l_database in my_client.databases():
   l_graphs = my_client.get_database(name=l_database.name).graphs_in_database()
      #
   for l_graph in l_graphs:
      l_database_handle = my_client.get_database(name=l_database.name)
      l_handle = l_database_handle.get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

#  List whatever graphs are left
#
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")

In [None]:
#  DELETE ALL DATABASES

#  Delete every database except the one named "default"
#
for l_database in my_client.databases():
   if (l_database.name == "default"):
      continue
      #
   my_client.get_database(name=l_database.name).delete_database()
   print("--")

#  List whatever databases are left
#
for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)

In [None]:
#  CREATE DATABASE

#  Create the database named by DB_NAME, keeping what create_database() returns
my_database = my_client.create_database(name=DB_NAME)

#  Show the id of the new database
print(my_database.database_id)

In [None]:
#  CREATE A GRAPH

#  Look the database up by name, then create the graph inside it
#
l_database_handle = my_client.get_database(name=DB_NAME)
my_graph = l_database_handle.create_graph(
   name           = GRAPH_NAME,
   num_partitions = NUM_PARTITIONS,
   )

print(my_graph)


In [None]:
#  CONNECT TO GRAPH

#  Scan the graphs of our database and take the first one whose name matches.
#  When none matches, my_graph is left as it was.
#
for l_graph in my_client.get_database(name=DB_NAME).graphs_in_database():
   if (l_graph.name != GRAPH_NAME):
      continue
      #
   my_graph = my_client.get_database(name=DB_NAME).get_graph_by_id(id=l_graph.graph_id)
   break

# my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)

In [None]:
#  Show the node count, then the edge count, of the graph we are connected to
display(my_graph.num_nodes())
display(my_graph.num_edges())

In [None]:
# ##################################################################
#
#  Load from source CSV, in this case we are using the Neo4J Movie graph


In [None]:
import dask.dataframe as dd
import numpy as np

print("--")


In [None]:
#  Load a DataFrame from CSV, Nodes/Vertices
#
#  Each listed column is read with the generic object dtype ('O'), so no
#  numeric parsing is attempted at load time.

l_InputFile  = "./10_NMovieDB/24_nodes.txt"

l_NodeColumns = ("id", "_labels", "born", "name", "released", "tagline", "title")

df_all_nodes1 = dd.read_csv(
   l_InputFile,
   delimiter = ",",
   dtype = {l_column: np.dtype('O') for l_column in l_NodeColumns},
   )

print("--")


In [None]:
#  Number of rows, two different ways

display(len(df_all_nodes1))
#  print() returns None, so wrapping it in display() also rendered a stray
#  "None" under the row count; print on its own is enough.
print("{} Rows".format(df_all_nodes1.shape[0].compute()))

#  Other output

display(df_all_nodes1.head(10))
display(df_all_nodes1[["born", "name"]].head(10))

In [None]:
#  Print all rows, subsetted columns
#
#  iterrows() hands back (index, row) pairs; unpack them instead of
#  indexing into the pair.

for l_index, l_row in df_all_nodes1.iterrows():
   print(l_index, "   ", l_row["_labels"], "   ", l_row["name"])


In [None]:
# ##################################################################
#
#   https://www.askpython.com/python/examples/subset-a-dataframe
#   https://www.codegrepper.com/code-examples/python/convert+float+to+int+python+pandas
#   https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.assign.html


In [None]:
#  We have some issues we want to change with our data
#
#  .  Some of the property names have a leading underscore. Change those.
#  .  Some values which should be integer, are float.
#  .  The label values are currently ";Person" and ";Movie". Let's remove those semicolons.


In [None]:
#  Column renames
#
#  Old name -> new name; this drops the leading underscores.

l_NodeRenames = {"_id": "id", "_labels": "label"}

df_all_nodes2 = df_all_nodes1.rename(columns=l_NodeRenames)

print("--")

In [None]:
#  Normalize the node columns
#
#  .  born, id, released: missing values become 0.0, then the column is
#     cast to str (an earlier variant of this cell cast to int instead).
#  .  label: cast to str and drop the first character.

def l_node_text(l_column):
   """Return an assign() callable: fill gaps in `l_column` with 0.0, cast to str."""
   return lambda x: x[l_column].fillna(0.0).astype(str)

df_all_nodes3 = df_all_nodes2.assign(
   born     = l_node_text("born"),
   id       = l_node_text("id"),
   released = l_node_text("released"),
   label    = lambda x: x.label.astype(str).str[1:],
   )

print("--")

In [None]:
#  Row count and first ten rows of the cleaned node DataFrame
display(len(df_all_nodes3))
display(df_all_nodes3.head(10))

In [None]:
# ##################################################################

In [None]:
#  Now, populate Persons and Movies
#
#  Rows are selected by their label, then narrowed to the columns that
#  apply to that kind of node.

l_is_person = df_all_nodes3["label"] == "Person"
l_is_movie  = df_all_nodes3["label"] == "Movie"

df_persons = df_all_nodes3[l_is_person][["id", "label", "born", "name"]]

df_movies  = df_all_nodes3[l_is_movie][["id", "label", "released", "tagline", "title"]]

print("--")

In [None]:

#  Row count and first ten rows; Persons first, then Movies
for l_frame in (df_persons, df_movies):
   display(len(l_frame))
   display(l_frame.head(10))


In [None]:
# ##################################################################
#
#  Repeat the style of work from above, but now for edges
#
#  .  Some of the property names have a leading underscore. Change those.
#  .  Some values which should be integer, are float.


In [None]:
#  Load a DataFrame from CSV, Edges
#
#  Each listed column is read with the generic object dtype ('O'), so no
#  numeric parsing is attempted at load time.

l_InputFile  = "./10_NMovieDB/25_edges.txt"

l_EdgeColumns = ("_start", "_end", "_type", "rating", "roles", "summary")

df_all_edges1 = dd.read_csv(
   l_InputFile,
   delimiter = ",",
   dtype = {l_column: np.dtype('O') for l_column in l_EdgeColumns},
   )

print("--")

In [None]:
#  Row count and first ten rows of the raw edge DataFrame
display(len(df_all_edges1))
display(df_all_edges1.head(10))


In [None]:
#  Rename the edge columns to upper-case names, then normalize
#  START_ID, END_ID and RATING: missing values become 0.0 and the column
#  is cast to str.

l_EdgeRenames = {
   "_start"  : "START_ID",
   "_end"    : "END_ID",
   "_type"   : "TYPE",
   "rating"  : "RATING",
   "roles"   : "ROLES",
   "summary" : "SUMMARY",
   }

df_all_edges2 = df_all_edges1.rename(columns=l_EdgeRenames)

def l_edge_text(l_column):
   """Return an assign() callable: fill gaps in `l_column` with 0.0, cast to str."""
   return lambda x: x[l_column].fillna(0.0).astype(str)

df_all_edges3 = df_all_edges2.assign(
   START_ID = l_edge_text("START_ID"),
   END_ID   = l_edge_text("END_ID"),
   RATING   = l_edge_text("RATING"),
   )

print("--")

In [None]:
display(len(df_all_edges3))
display(df_all_edges3.head(10))

In [None]:
#  Split out the various edges
#
#  Every edge kind keeps START_ID / END_ID / TYPE; REVIEWED and ACTED_IN
#  carry extra property columns on top of those.

l_EdgeKeys = ["START_ID", "END_ID", "TYPE"]

def l_edges_of(l_type, l_extra=()):
   """Rows of df_all_edges3 whose TYPE equals `l_type`, key columns plus `l_extra`."""
   return df_all_edges3[df_all_edges3["TYPE"] == l_type][l_EdgeKeys + list(l_extra)]

df_reviewed = l_edges_of("REVIEWED", ("RATING", "SUMMARY"))

df_wrote    = l_edges_of("WROTE")

df_produced = l_edges_of("PRODUCED")

df_directed = l_edges_of("DIRECTED")

df_follows  = l_edges_of("FOLLOWS")

#  Author's note: "roles" is a string similar to  '[ "a", "b", "c" ]',
#  and it was observed to come in as a list automatically.

df_actedin  = l_edges_of("ACTED_IN", ("ROLES",))

print("--")

In [None]:

#  Row count and first two rows for each edge kind, in the same order
#  as before
l_EdgeFrames = (
   df_reviewed,
   df_wrote,
   df_produced,
   df_directed,
   df_follows,
   df_actedin,
   )

for l_frame in l_EdgeFrames:
   display(len(l_frame))
   display(l_frame.head(2))


In [None]:
# ##################################################################

In [None]:
#  Didn't need this; also don't know if it had any effect

# from dask.distributed import Client
# client = Client(n_workers=4, threads_per_worker=2)

# print("--")

In [None]:
# ##################################################################
#
#  Finally now, load the vertices/nodes into a graph
#
#  Some hinkiness we need to work around ..
#
#     .  The Dask DataFrames here were loaded from CSV, and those CSV
#        files were found, in scope.
#        The KG DataFrame importer will reference that same file
#        pathname, and the file will not be in scope. Basically,
#        it was expected that these files be on S3/GS all along.
#        I hate to have that dependency because, just one more thing
#        to have to manage.
#
#     .  So, we'll copy the DataFrames to Dask arrays, then back into
#        a Dask DataFrame.
#        Why not just copy the Dask DataFrame?  Currently there are only
#        shallow copies of DataFrames.
#
#  See,
#     https://stackoverflow.com/questions/52119342/how-do-i-convert-a-dask-dataframe-into-a-dask-array
#     https://docs.dask.org/en/latest/generated/dask.dataframe.from_dask_array.html


#  Step 1: every node and edge DataFrame becomes a Dask array.

da_persons, da_movies = (
   l_frame.to_dask_array()
   for l_frame in (df_persons, df_movies)
   )

(da_directed, da_reviewed, da_wrote,
 da_produced, da_follows, da_actedin) = (
   l_frame.to_dask_array()
   for l_frame in (df_directed, df_reviewed, df_wrote,
                   df_produced, df_follows, df_actedin)
   )


#  Step 2: rebuild a DataFrame from each array, naming the columns again,
#  and compute() it.

def l_rebuild(l_array, l_columns):
   """Wrap `l_array` in a Dask DataFrame with `l_columns`, then compute it."""
   return dd.io.from_dask_array(l_array, columns=l_columns).compute()

l_EdgeBase = ["START_ID", "END_ID", "TYPE"]

df_persons2   = l_rebuild(da_persons,  ["id", "label", "born", "name"])
df_movies2    = l_rebuild(da_movies,   ["id", "label", "released", "tagline", "title"])
   #
df_directed2  = l_rebuild(da_directed, list(l_EdgeBase))
df_reviewed2  = l_rebuild(da_reviewed, l_EdgeBase + ["RATING", "SUMMARY"])
df_wrote2     = l_rebuild(da_wrote,    list(l_EdgeBase))
df_produced2  = l_rebuild(da_produced, list(l_EdgeBase))
df_follows2   = l_rebuild(da_follows,  list(l_EdgeBase))
df_actedin2   = l_rebuild(da_actedin,  l_EdgeBase + ["ROLES"])

print("--")



In [None]:
from katana_enterprise.remote import import_data

print("--")


In [None]:

#  Import every node and edge DataFrame into my_graph in one importer
#  session.
#
#  The edge imports differ only in their frame, destination id space and
#  edge type, so they are driven from one table rather than six copied
#  calls.
#
#  FOLLOWS connects a Person to another Person in the Neo4j Movies graph,
#  so its END_ID values must be resolved in the "Person" id space. It used
#  to be registered against "Movie".
#
#  NOTE(review): Movie nodes are imported without their "released" column,
#  and ACTED_IN rows are stored under the edge type "ACTEDIN". Both are
#  kept exactly as they were -- confirm that they are intentional.

l_EdgeKeys = ["START_ID", "END_ID", "TYPE"]

l_EdgeImports = [
   #  (frame, destination id space, edge type)
   (df_directed2[l_EdgeKeys],                          "Movie",  "DIRECTED"),
   (df_reviewed2[l_EdgeKeys + ["RATING", "SUMMARY"]],  "Movie",  "REVIEWED"),
   (df_wrote2[l_EdgeKeys],                             "Movie",  "WROTE"),
   (df_produced2[l_EdgeKeys],                          "Movie",  "PRODUCED"),
   (df_follows2[l_EdgeKeys],                           "Person", "FOLLOWS"),
   (df_actedin2[l_EdgeKeys + ["ROLES"]],               "Movie",  "ACTEDIN"),
   ]

with import_data.DataFrameImporter(my_graph) as df_importer:

   #  Person
   #
   df_importer.nodes_dataframe(
      df_persons2[["id", "label", "born", "name"]],
      id_column  = "id",
      id_space   = "Person"
      )
   #  Movie
   #
   df_importer.nodes_dataframe(
      df_movies2[["id", "label", "title", "tagline"]],
      id_column  = "id",
      id_space   = "Movie"
      )

   #  Edges; every edge kind starts at a Person
   #
   for l_frame, l_destination, l_type in l_EdgeImports:
      df_importer.edges_dataframe(
         l_frame,
         source_id_space      = "Person",
         destination_id_space = l_destination,
         source_column        = "START_ID",
         destination_column   = "END_ID",
         type                 = l_type
         )

   df_importer.execute()
    
    

In [None]:

#  Count the nodes per label

l_result1 = my_graph.query("""

   MATCH (a) 
   WITH DISTINCT LABELS(a) AS temp, COUNT(a) AS tempCnt
   UNWIND temp AS label
   RETURN label, SUM(tempCnt) AS cnt
   ORDER BY label
   
   """)

#  print() returns None; the display() wrapper around it only added a
#  stray "None" to the cell output.
print(l_result1)


In [None]:

#  Count the edges per relationship type

l_result1 = my_graph.query("""

   MATCH (m)-[r]->(n) 
   WITH DISTINCT TYPE(r) AS temp, COUNT(r) AS tempCnt
   RETURN temp, tempCnt
   ORDER BY temp

   """)

#  print() returns None; the display() wrapper around it only added a
#  stray "None" to the cell output.
print(l_result1)


In [None]:

#  Fetch every (source)-[edge]->(target) triple. The query is run with
#  contextualize=True and view() is then called on the result.
#  NOTE(review): presumably this renders an interactive graph view -- confirm.
result = my_graph.query("""

   MATCH (x) -[r]-> (a)
   RETURN x, r AS rel, a
   
   """,
   contextualize=True)

result.view()


# Output a graph as a local file

In [None]:

#  Formatting could use a little work, but the concept is here ..

#  l_result1: every node carrying the Person label
l_result1 = my_graph.query("""
   MATCH (n: Person) 
   RETURN n
   """)
      #
# display(print(l_result1))

#  l_result2: every WROTE relationship
l_result2 = my_graph.query("""
   MATCH (n) - [r: WROTE] -> (m)
   RETURN r
   """)
      #
# display(print(l_result2))


In [None]:

#  Dump both query results to local text files. Each file receives the
#  str() of the list of rows produced by iterrows().
#
#  The files are opened in `with` blocks so they are closed even when
#  write() raises; previously an error would leave the handle open.

l_nodes = list(l_result1.iterrows())

with open("nodes.txt", "w") as l_file:
   l_file.write(str(l_nodes))


l_edges = list(l_result2.iterrows())

with open("edges.txt", "w") as l_file:
   l_file.write(str(l_edges))


print("--")
