# Part 00: Notebook overview ..

In [None]:

#  This Notebook is part of a set that demonstrates GNNs using a movie dataset.
#  About this Notebook,
#
#  .  In NoteBook 20*, we built a Movie graph; movies, keywords
#     In NoteBook 30*, we delivered a primer on CountVectorizers and Classifiers using non-graph data
#
#  .  Here we'll apply the non-graph Classifier to our Movie nodes from our graph.
#
#     We'll apply it to just Movie.title, then Movie.tagline, then Movie.overview.
#     And then we'll apply it to all 3 at one time.
#
#     And we'll compare the results to known Movie.genres_primary.
#
#        Recall Movies actually had an array of genres, and we derived genres_primary
#        as the genre in the first position inside the array.
#
#  .  Our graph, our source of data, was prepared/loaded in NoteBook 20*.



#  Part 01: Graph setup, and initial read

In [None]:

#  The KatanaGraph remote API is expected to run from a node external to
#  the Katana Graph cluster itself.
#
#  This differs from the distributed API, which is meant to run primitives
#  on the Katana Graph worker nodes.
#

from katana import remote
from katana.remote import import_data

#  Create the remote API client used by every later cell, and print it so
#  we can see what we were handed back.
my_client = remote.Client()
print(my_client)


In [None]:

#  Settings for this NoteBook.
#
#     DB_NAME and GRAPH_NAME identify the database and graph we look up below.
#     NUM_PARTITIONS is kept alongside them; it is not referenced in the cells shown here.
#
NUM_PARTITIONS = 3
DB_NAME, GRAPH_NAME = "my_db", "my_graph1"

print("--")


In [None]:

#  Look up our graph by name inside our database.
#
#     find_graphs_by_name() hands back an iterable of matches; we keep the first one.
#     If nothing matches we stop with a readable message, instead of the
#     "not enough values to unpack" error that a bare unpacking would give.
#
l_graphs = list(my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME))

if not l_graphs:
   raise ValueError("No graph named '%s' was found in database '%s'" % (GRAPH_NAME, DB_NAME))

my_graph = l_graphs[0]

print(my_graph)


In [None]:

#  Sanity check: report how many nodes and edges the graph holds
#
l_num_nodes = my_graph.num_nodes()
l_num_edges = my_graph.num_edges()

display("Number of Graph Nodes: %d" % (l_num_nodes))
display("Number of Graph Edges: %s" % (l_num_edges))


#  Sample output,
#
#     'Number of Graph Nodes: 64857'
#     'Number of Graph Edges: 330988'


In [None]:

#  A first look at the graph: fetch a sample of (node)-[edge]->(node) matches
#  and hand the result to .view()
#
l_sample_query = """

   MATCH (n) -[r]-> (m)
   RETURN n, r, m
   LIMIT 1000                        //  Limit is 25,000 for visualization, smaller is better
   
   """

l_result = my_graph.query(l_sample_query, contextualize=True)
l_result.view()


<div> 
<img src="./01_Images/10-Movie-Query-1.png" alt="Drawing" style="width: 1600px;"/>
</div>


#  Part 02: Getting data for use by our non-graph classifier

In [None]:

#  We're using a non-graph set of libraries from sklearn. As such, we need to pull
#  the data out of the graph into DataFrames


In [None]:

#  We'll consider these packages to be common knowledge. Else, return to the Compulsaries
#  set of NoteBooks for sample use, introduction ..
#
import numpy as np
import pandas as pd
   #
import dask.array as da
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

from tabulate import tabulate


   ###
    
    
from sklearn.feature_extraction.text import CountVectorizer            #  Build a sparse vector (matrix) of keyword counts
from sklearn.feature_extraction.text import TfidfTransformer           #  Tool to normalize word counts; down-weights words that occur frequently, giving more weight to infrequent words
   #                                                                   #        I.e., having 6 toes is far rarer than having 5. Should I give more weight to a person being 5'10", or to having 6 toes ?
from sklearn.naive_bayes import MultinomialNB                          #  A non-graph ML routine, a Classifier  (similar perhaps to node property prediction, but on non-graph data)
                                                                       #     There are several variants of NaiveBayes, one being 'multi-nomial'.
                                                                       #     'multi-nomial' is commonly used for word-count style problems
        
from sklearn.pipeline import Pipeline                                  #  Allows us to simplify execution of many sequential steps

print("--")


In [34]:

#  These will be our test movies, held out from training
#     (Previously we cast movie.id as a string, not an integer; hence the quoted ids below.)
#

l_test_movie_titles = {
   "8469"  : "Animal House",
   "11848" : "Animal Farm",
   "600"   : "Full Metal Jacket",
   "13342" : "Fast Times at Ridgemont High",
   "10373" : "Quadrophenia",
   "62"    : "2001: A Space Odyssey",
   "14328" : "The Paper Chase",
   "11589" : "Kelly's Heroes",
   "694"   : "The Shining",
   "424"   : "Schindler's List",
   }

#  Only the ids go into the query; the titles above are for the reader
#
l_test_movie_ids = list(l_test_movie_titles)


#  The Python list is formatted straight into the IN clause of the query
#
l_query  = """
   MATCH (n: Movies) 
   WHERE n.id IN {0}
   RETURN n.id AS id, n.title AS title, n.genres_primary AS genres, n.genres_primary_id AS genres_id, n.tagline AS tagline, n.overview AS overview
   ORDER BY n.title
   """.format(l_test_movie_ids)

l_test_movies = my_graph.query(l_query)

#  Alternate, plain text output:  print(tabulate(l_test_movies, headers='keys', tablefmt='psql'))
#
display(l_test_movies.table())


          0/? [?op/s]

GridBox(children=(HBox(children=(Text(value='', placeholder='Search...'), Label(value='Count: 10 rows'), HBox(…

In [31]:

#  Go get our train movies
#
#     We run both (all movies) and "NOT () IN" movies just for a check-
#     the two counts printed below should differ by the number of test movies.
#


#  This is not one we want; just counting to be sure-
#
#     It gets its own name (l_all_movies) so that l_train_movies is never bound
#     to the unfiltered result, and the query has no placeholder so it needs no format().
#
l_query  = """
   MATCH (n: Movies) 
   RETURN n.id AS id, n.title AS title, n.genres_primary AS genres, n.genres_primary_id AS genres_id, n.tagline AS tagline, n.overview AS overview
   """
      #
l_all_movies = my_graph.query(l_query)
   #
print(len(l_all_movies))


#  This is the one we want moving forward; every movie except our test movies
#
l_query  = """
   MATCH (n: Movies) 
   WHERE NOT n.id IN {0}
   RETURN n.id AS id, n.title AS title, n.genres_primary AS genres, n.genres_primary_id AS genres_id, n.tagline AS tagline, n.overview AS overview
   """.format(l_test_movie_ids)
      #
l_train_movies = my_graph.query(l_query)
   #
print(len(l_train_movies))

print(tabulate(l_train_movies.head(5), headers='keys', tablefmt='psql'))
#  display(l_train_movies.table())


#  Sample output,
#
#     45433
#     45423


          0/? [?op/s]

45433


          0/? [?op/s]

45423
+----+----------+-------------+--------+---------------------+
|    | genres   |   genres_id |     id | title               |
|----+----------+-------------+--------+---------------------|
|  0 | Family   |       10751 | 291854 | Meet the Mormons    |
|  1 | Comedy   |          35 |  24632 | The Mad             |
|  2 | Comedy   |          35 |  11860 | Sabrina             |
|  3 | Comedy   |          35 |  40774 | Midgets Vs. Mascots |
|  4 | Action   |          28 |  45325 | Tom and Huck        |
+----+----------+-------------+--------+---------------------+


# Part 03:  Do the training

In [None]:

#  Just train on movie.title to begin with  (we expect this to score badly; a title alone says little about genre)
#
#     Reminder:  the steps below were detailed in NoteBook 30*
#

my_pipeline = Pipeline([
   ("cv",     CountVectorizer()  ),
   ("tfidf",  TfidfTransformer() ),
   ("clf",    MultinomialNB()    ),
   ])


#  Build the training set from the train movies we queried in Part 02.
#
#     "data" is the text we learn from (Movie.title), "target" is the known
#     class (Movie.genres_primary, returned by the query as "genres").
#
#     Rows with a missing title or genre are dropped before fitting.
#
#     NOTE(review): this assumes the query result supports DataFrame style column
#     access; it already supports .head() in Part 02. Please confirm.
#
my_train_movies = pd.DataFrame({
   "data"   : l_train_movies["title"],
   "target" : l_train_movies["genres"],
   }).dropna()


my_classifier = my_pipeline.fit(my_train_movies["data"], my_train_movies["target"])


print("--")


#  Final piece, scoring (applying)

In [None]:

#  Now we are ready to score (apply) the model.
#
#     We score the held-out test movies from Part 02 on the same property we
#     trained on (Movie.title), and compare against the known Movie.genres_primary.
#
#     NOTE(review): this assumes my_classifier was fit with the genre name (the
#     "genres" column) as its target. If it was fit on "genres_id" instead, use
#     that column for my_test["target"] below.
#


#  Here is the test data
#
my_test = {}
   #
my_test["data"]   = [ "" if pd.isna(l_title) else str(l_title) for l_title in l_test_movies["title"] ]   #  A missing title becomes an empty document
my_test["target"] = list(l_test_movies["genres"])                                                          #  We only need this to gauge accuracy below


#  Applying/scoring the model
#
my_result = my_classifier.predict(my_test["data"])


#  The predictions come back as class labels, so they compare directly against the target
#
display("Average accuracy: %f" % (np.mean(my_result == np.asarray(my_test["target"], dtype=object))) )
   #
for l_data, l_target, l_predict in zip(my_test["data"], my_test["target"], my_result):
   print("Data: %-36s   Correct Class: %-10s   Predict Class: %-10s" % (l_data, l_target, l_predict) )

#  Sample output,
#
#     (Not yet captured for the movie data; re-run this cell against the graph and paste the result here.
#      The figures shown here previously belonged to the food/sport example from NoteBook 30*.)

