# Part 00: Notebook overview ..

In [1]:

#  This Notebook is part of a set that demonstrates GNN using a movie dataset.
#  About this Notebook,
#
#  .  In NoteBook 20*, we built a Movie graph; movies, keywords
#     In NoteBook 30* we delivered a primer on CountVectorizers, and Classifiers using non-graph.
#
#  .  Here we'll apply the non-graph Classifier to our Movie nodes from our graph.
#
#     We'll apply it to just Movie.title, then Movie.tagline, then Movie.overview.
#     And then we'll apply it to all 3 at one time.
#
#     And we'll compare the results to known Movie.genres_primary.
#
#        Recall Movies actually had an array of genres, and we derived genres_primary
#        as the genre in the first position inside the array.
#
#  .  Our graph, our source of data, was prepared/loaded in NoteBook 20*.



#  Part 01: Graph setup, and initial read

In [2]:

#  Notebook-wide display settings, plus the debug switch that later cells test
#

import pandas as pd
from IPython.display import display, HTML
from tabulate import tabulate


#  Display width, in characters, that pandas may use when rendering text output
#
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs; <pre> blocks are no longer wrapped
#
display(HTML("<style>pre { white-space: pre !important; }</style>"))


#  True  -->  later cells print their extra diagnostic tables
#  False -->  later cells stay quiet
#
MY_DEBUG = True
   #
print("--")


--


In [3]:

#  The KatanaGraph remote API is expected to run from a node external to
#  the Katana Graph cluster itself.
#
#  This differs from the distributed API, which is meant to run primitives
#  on the Katana Graph worker nodes.
#

from katana import remote
from katana.remote import import_data          #  NOTE(review): not referenced by any cell visible in this Notebook

#  Client() is called without arguments.
#  NOTE(review): presumably the connection details come from the environment -- confirm.
#
my_client = remote.Client()

#  Printing the client only shows its repr; a quick check that the object was created
#
print(my_client)


<katana_enterprise.remote.sync_wrappers.Client object at 0x7f97e025e220>


In [4]:

#  Names used by the next cell to locate the graph that was prepared/loaded in NoteBook 20*
#
NUM_PARTITIONS  = 3                              #  NOTE(review): not referenced by any cell visible in this Notebook
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph1" 

print("--")


--


In [5]:

#  Look up our graph by name, and keep the first match.
#
#  If nothing matches, fail right here with a readable message. The earlier
#  star-unpacking form failed with "not enough values to unpack", which says
#  nothing about the actual cause.
#
l_graphs = list(my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME))

if (not l_graphs):
   raise ValueError("No graph named '%s' was found in database '%s'. Was NoteBook 20* run ?" % (GRAPH_NAME, DB_NAME))

my_graph = l_graphs[0]

print(my_graph)


<_Graph my_graph1, D2mcem268HLBLEFiDGxRSgCH3Yq5jgjyhFCLppVGKPRC, 2>


In [6]:

#  Sanity check; how large is the graph we just opened ?
#
l_num_nodes = my_graph.num_nodes()
display("Number of Graph Nodes: %d" % l_num_nodes)
   #
l_num_edges = my_graph.num_edges()
display("Number of Graph Edges: %s" % l_num_edges)


#  Sample output,
#
#     'Number of Graph Nodes: 64857'
#     'Number of Graph Edges: 330988'


          0/? [?op/s]

'Number of Graph Nodes: 64856'

          0/? [?op/s]

'Number of Graph Edges: 330988'

In [7]:

#  Looking at the graph
#
#  The traversal returns (node) -[edge]-> (node) triples, capped by the LIMIT
#  clause, and view() renders the result as a widget.
#
#  NOTE(review): contextualize=True presumably asks for the extra graph context
#  that view() needs to draw nodes and edges -- confirm against the Katana docs.
#
l_result = my_graph.query("""

   MATCH (n) -[r]-> (m)
   RETURN n, r, m
   LIMIT 1000                        //  Limit is 25,000 for visualization, smaller is better
   
   """,
   contextualize=True)

l_result.view()

          0/? [?op/s]

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…


<div> 
<img src="./01_Images/10-Movie-Query-1.png" alt="Drawing" style="width: 1600px;"/>
</div>


#  Part 02: Getting data for use by our non-graph classifier

In [8]:

#  We're using a non-graph set of libraries from sklearn. As such, we need to pull
#  the data out of the graph into DataFrames


In [9]:

#  We'll consider these packages to be common knowledge. Else, return to the Compulsaries
#  set of NoteBooks for sample use, introduction ..
#
import numpy as np
import pandas as pd
   #
import dask.array as da
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

   ###
    
from sklearn.feature_extraction.text import CountVectorizer            #  Build a sparse vector (matrix) of keywords
from sklearn.feature_extraction.text import TfidfTransformer           #  Tool to normalize a condition; words that occur frequently, versus giving more weight to infrequent words
   #                                                                   #        Ie., Having 6 toes is way more rare than having 5. Should I give more weight that a person is 5'10", or has 6 toes ?
from sklearn.naive_bayes import MultinomialNB                          #  A non-graph ML routine, a Classifier  (similar perhaps to node property prediction, but on non-graph)
                                                                       #     Several variants of NaiveBayes, one being 'multi-nomial'.
                                                                       #     'multi-nomial', used commonly for word count style problems
        
from sklearn.pipeline import Pipeline                                  #  Allow us to simplify execution of many sequential steps

print("--")


--





In [10]:

#  The ten movies we hold back as test data, listed by Movie.id
#     (In an earlier NoteBook we cast Movie.id to a string rather than an integer,
#      hence the quoted values here.)
#

l_movie_ids_test = [
   "8469"    ,      #  Animal House
   "11848"   ,      #  Animal Farm
   "600"     ,      #  Full Metal Jacket
   "13342"   ,      #  Fast Times at Ridgemont High
   "10373"   ,      #  Quadrophenia
   "62"      ,      #  2001: A Space Odyssey
   "14328"   ,      #  The Paper Chase                            <-- Fyi, labeled as a Comedy and a Drama, with comedy in the first position
   "11589"   ,      #  Kelly's Heroes
   "694"     ,      #  The Shining
   "424"     ,      #  Schindler's List
   ]

#  The Python list is interpolated as-is, ['8469', '11848', ..], which forms
#  the right hand side of the IN predicate.
#
l_query  = f"""
   MATCH (n: Movies) 
   WHERE n.id IN {l_movie_ids_test}
   RETURN n.id AS id, n.title AS title, n.genres_primary AS genres, n.genres_primary_id AS genres_id, n.tagline AS tagline, n.overview AS overview
   ORDER BY n.title
   """

df_movies_test = my_graph.query(l_query)


if (MY_DEBUG):
   #  The whole result as one text table ..
   #     (Alternate to this:  display(df_movies_test.table()) )
   #
   print(tabulate(df_movies_test, headers='keys', tablefmt='psql'))

   #  .. then two blank lines, and a few of the columns row by row
   #
   print("\n")
   l_row_format = "%8s   %-32s   %-64s"
      #
   for l_row in df_movies_test.itertuples():
      print(l_row_format % (l_row.id, l_row.title, l_row.tagline))


print("--")

#  Sample output,
#
#     +----+-------+------------------------------+-----------------+-------------+------------------------------------------------------------------------
#     |    |    id | title                        | genres          |   genres_id | tagline                                                       
#     |----+-------+------------------------------+-----------------+-------------+----------------------------------------------------------------
#     |  0 |    62 | 2001: A Space Odyssey        | Science Fiction |         878 | An epic drama of adventure and exploration                           
#     |  1 | 11848 | Animal Farm                  | Animation       |          16 | He's got the world in an UPROAR!                                
#     |  2 |  8469 | Animal House                 | Comedy          |          35 | It was the Deltas against the rules... the rules lost!      
#     |  3 | 13342 | Fast Times at Ridgemont High | Comedy          |          35 | Fast Cars, Fast Girls, Fast Carrots...Fast Carrots?        
#     |  4 |   600 | Full Metal Jacket            | Drama           |          18 | Vietnam can kill me, but it can’t make me care.           
#     |  5 | 11589 | Kelly's Heroes               | Adventure       |          12 | They set out to rob a bank... and damn near won a war ins
#     |  6 | 10373 | Quadrophenia                 | Drama           |          18 | A Way of Life                                               
#     |  7 |   424 | Schindler's List             | Drama           |          18 | Whoever saves one life, saves the world entire.              
#     |  8 | 14328 | The Paper Chase              | Comedy          |          35 | You have to choose between the girl you love and the diploma you've worked for all
#     |  9 |   694 | The Shining                  | Horror          |          27 | A masterpiece of modern horror.                                          
#     +----+-------+------------------------------+-----------------+-------------+--------------------------------------------------------------------------
#     
#     
#           62   2001: A Space Odyssey              An epic drama of adventure and exploration                      
#        11848   Animal Farm                        He's got the world in an UPROAR!                                
#         8469   Animal House                       It was the Deltas against the rules... the rules lost!          
#        13342   Fast Times at Ridgemont High       Fast Cars, Fast Girls, Fast Carrots...Fast Carrots?             
#          600   Full Metal Jacket                  Vietnam can kill me, but it can’t make me care.                 
#        11589   Kelly's Heroes                     They set out to rob a bank... and damn near won a war instead!  
#        10373   Quadrophenia                       A Way of Life                                                   
#          424   Schindler's List                   Whoever saves one life, saves the world entire.                 
#        14328   The Paper Chase                    You have to choose between the girl you love and the diploma you've worked for all your life. You have 30 seconds.
#          694   The Shining                        A masterpiece of modern horror.
    


          0/? [?op/s]

+----+-------+------------------------------+-----------------+-------------+--------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    |    id | title                        | genres          |   genres_id | tagline                                                                                                            | overview                                                                                                                                           

In [11]:

#  Go get our train movies; every Movie that is not one of our test movies
#


#  Count of all Movies. This traversal aggregates to exactly one row, so we
#  can pick the single value straight out of the ResultSet.
#
l_query  = """
   MATCH (n: Movies)
   RETURN COUNT(*) AS cnt
   """
      #
l_count_result = my_graph.query(l_query)
l_count        = l_count_result["cnt"][0]


#  We'll use this DataFrame a lot moving forward.
#
l_query  = f"""
   MATCH (n: Movies) 
   WHERE NOT n.id IN {l_movie_ids_test}
   RETURN n.id AS id, n.title AS title, n.genres_primary AS genres, n.genres_primary_id AS genres_id, n.tagline AS tagline, n.overview AS overview
   """
      #
df_movies_train = my_graph.query(l_query)

if (MY_DEBUG):
   #  First five rows only, then the two counts side by side
   #
   l_preview = df_movies_train.head(5)
      #
   print(tabulate(l_preview, headers='keys', tablefmt='psql'))
   print("")
   print("Count of all Movies: %-10s   Count of Movies in the training data set: %-10d" % (l_count, len(df_movies_train)))


print("--")

#  Sample output,
#
#     +----+--------+------------------+-----------+-------------+------------------------------------------------------+----------------------
#     |    |     id | title            | genres    |   genres_id | tagline                                              | overview            
#     |----+--------+------------------+-----------+-------------+------------------------------------------------------+-----------------
#     |  0 |    862 | Toy Story        | Animation |          16 | Unknown                                              | Led by Woody, Andy's toys live happ
#     |  1 | 291854 | Meet the Mormons | Family    |       10751 | Six ordinary individuals. Six extraordinary stories. | Meet the Mormons examines the ve
#     |  2 |  15602 | Grumpier Old Men | Romance   |       10749 | Still Yelling. Still Fighting. Still Ready for Love. | A family wedding reigni
#     |  3 |  24632 | The Mad          | Comedy    |          35 | Unknown                                              | A horror-thriller in which a d
#     |  4 | 169844 | Malarek          | Action    |          28 | Unknown                                              | Malarek is a film directed by Roge
#     +----+--------+------------------+-----------+-------------+------------------------------------------------------+---------------------------
#     
#     Count of all Movies: 45433        Count of Movies in the training data set: 45423 



          0/? [?op/s]

          0/? [?op/s]

+----+--------+-------------------------------+-----------+-------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    |     id | title      

# Part 03:  Do the training

In [None]:

#  As before, build single structures with all of the data we need.
#  More,
#
#     .  Some of the values we received from the traversal are missing (None), hence,
#        the fillna() below to populate with a coded value.
#
#     .  The result of the traversals above were  <class 'katana.remote.ResultSet'>
#        This largely functions as a DataFrame, but generally the ML functions
#        want a list(). Hence, to_list()
#
#     .  We'll start with movies.title, since that should yield hideous results.
#        (Compare/contrast as we move forward.)
#


def f_build_model(i_arg1):
   """
   Build the train and test dictionaries for one Movie text property.

   i_arg1:   Name of the column to classify on; "title", "tagline" or "overview".

   Reads the module level df_movies_train and df_movies_test, and leaves both
   unmodified, so this function can be called again for another column.

   Sets the module level,
      my_train   {"data": [text ..], "class":  [genres ..]}
      my_test    {"data": [text ..], "target": [genres ..]}
   which are what f_build_and_score() and f_output_results() read.

   Returns the same two dictionaries as the tuple (my_train, my_test).
   """
   global my_train
   global my_test

   def f_as_text_list(i_frame, i_column):
      #  If you don't do the fillna(), you'll get a,
      #     -->  AttributeError: 'NoneType' object has no attribute 'lower'
      #  when you pass to the pipeline.
      #
      return i_frame[i_column].fillna("Unknown").astype(str).to_list()

   my_train = {
      "data"   : f_as_text_list(df_movies_train, i_arg1  ),
      "class"  : f_as_text_list(df_movies_train, "genres"),
      }
      ###
   my_test = {
      "data"   : f_as_text_list(df_movies_test,  i_arg1  ),
      "target" : f_as_text_list(df_movies_test,  "genres"),
      }

   return my_train, my_test


my_train, my_test = f_build_model("title")



if (MY_DEBUG):
   #  my_train is a dictionary of two parallel lists; show the first two records
   #
   for l_each in zip(my_train["data"][:2], my_train["class"][:2]):
      print(l_each)



print("--")


In [None]:

#  Reminder:  the steps below were detailed at length in NoteBook 30*
#
#  This is movies.title, so likely to yield poor results.

def f_build_and_score():
   """
   Train the text classifier on my_train, then predict a class for my_test.

   Reads the module level my_train ("data", "class") and my_test ("data").

   Sets the module level,
      my_classifier   the fitted Pipeline
      my_result       the predicted class for each entry of my_test["data"]
   which later cells read.

   Prints the wall time for fit plus predict, and returns the same two
   objects as the tuple (my_classifier, my_result).
   """
   global my_classifier
   global my_result

   import time

   #  A bare %time line inside a function times nothing, so we measure the
   #  work ourselves.
   #
   l_start = time.perf_counter()

   my_pipeline = Pipeline([
      ("cv",     CountVectorizer()  ),
      ("tfidf",  TfidfTransformer() ),
      ("clf",    MultinomialNB()    ),
      ])

   my_classifier = my_pipeline.fit(my_train["data"], my_train["class"])

   my_result = my_classifier.predict(my_test["data"])

   print("Wall time, fit and predict: %.3f seconds" % (time.perf_counter() - l_start))

   return my_classifier, my_result


f_build_and_score()

print("--")


In [None]:

#  Results
#
def f_output_results():
   """
   Display the overall accuracy, then print one line per test record.

   Reads the module level my_result and my_test ("data", "target").
   Each printed line carries the input text, the known class, and the
   predicted class.
   """
   l_accuracy = np.mean(my_result == my_test["target"])
   display("Average accuracy: %f" % (l_accuracy))
      #
   l_line_format = "Data: %-36s   Correct Class: %-18s   Predict Class: %-18s"
      #
   for l_index, (l_data, l_target) in enumerate(zip(my_test["data"], my_test["target"])):
      print(l_line_format % (l_data, l_target, my_result[l_index]))

   print("--")


f_output_results()


#  Sample output,
#
#     'Average accuracy: 0.600000'
#     
#     Data: 2001: A Space Odyssey                  Correct Class: Science Fiction      Predict Class: Drama             
#     Data: Animal Farm                            Correct Class: Animation            Predict Class: Comedy            
#     Data: Animal House                           Correct Class: Comedy               Predict Class: Comedy            
#     Data: Fast Times at Ridgemont High           Correct Class: Comedy               Predict Class: Comedy            
#     Data: Full Metal Jacket                      Correct Class: Drama                Predict Class: Drama             
#     Data: Kelly's Heroes                         Correct Class: Adventure            Predict Class: Drama             
#     Data: Quadrophenia                           Correct Class: Drama                Predict Class: Drama             
#     Data: Schindler's List                       Correct Class: Drama                Predict Class: Drama             
#     Data: The Paper Chase                        Correct Class: Comedy               Predict Class: Comedy            
#     Data: The Shining                            Correct Class: Horror               Predict Class: Comedy 


In [None]:

#  The above was on title. Repeat now for movies.tagline, and then movies.overview;
#  the same three steps each time.
#

for l_property in ("tagline", "overview"):
   f_build_model(l_property)
   f_build_and_score()
   f_output_results()




In [None]:

#  Ad hoc scoring; which class does the current model assign to free text ?
#
#  NOTE(review): the sentences below look carried over from the NoteBook 30*
#  food/sport primer. Confirm this cell is still wanted in this NoteBook.
#
my_test = {}
   #
my_test["data"] = [ 
   "I hate baseball" ,
   "I like wine"     ,
   "eggs with cheese",
   "eggs with cheese football",
   "football with eggs and cheese",
   "Wisconsin, America's Dairyland",
   ]


#  Applying/scoring the model
#
my_result = my_classifier.predict(my_test["data"])


#  Each prediction is already a class label, one of the values the model was
#  fit on, so we print it directly. Using it as an index into my_train["class"]
#  fails, because a list can not be indexed with a string.
#
#  These sentences have no known class, hence no target and no accuracy figure.
#
for l_data, l_predict in zip(my_test["data"], my_result):
   print("Data: %-36s   Predict Class: %-18s" % (l_data, l_predict))


#  Final piece, scoring (applying)

In [None]:

#  Now we are ready to score (apply) the model.
#
#  NOTE(review): the sentences below look carried over from the NoteBook 30*
#  food/sport primer. Confirm this cell is still wanted in this NoteBook.
#


#  Here is the test data
#
my_test = {}
   #
my_test["data"] = [ 
   "I hate baseball" ,
   "I like wine"     ,
   "eggs with cheese",
   "eggs with cheese football",
   "football with eggs and cheese",
   "Wisconsin, America's Dairyland",
   ]


#  Applying/scoring the model
#
my_result = my_classifier.predict(my_test["data"])


#  Each prediction is already a class label, one of the values the model was
#  fit on, so we print it directly. Using it as an index into my_train["class"]
#  fails, because a list can not be indexed with a string.
#
#  These sentences have no known class, hence no target and no accuracy figure.
#
for l_data, l_predict in zip(my_test["data"], my_result):
   print("Data: %-36s   Predict Class: %-18s" % (l_data, l_predict))


#  Sample output (format only; the class shown depends on the model trained above),
#
#     Data: I hate baseball                        Predict Class: <class label>
#     Data: I like wine                            Predict Class: <class label>
#     Data: eggs with cheese                       Predict Class: <class label>
#     Data: eggs with cheese football              Predict Class: <class label>
#     Data: football with eggs and cheese          Predict Class: <class label>
#     Data: Wisconsin, America's Dairyland         Predict Class: <class label>

