# Part 00: Notebook overview ..

In [None]:

#  This Notebook is part of a set that demonstrate GNN using a movie dataset.
#  About this Notebook,
#
#  .  In NoteBook 20*, we built a Movie graph; movies, keywords
#     In NoteBook 30* we delivered a primer on CountVectorizers, and Classifiers using non-graph
#
#  .  Here we'll apply the non-graph Classifier to our Movie nodes from our graph.
#
#     We'll apply it to just Movie.title, then Movie.tagline, then Movie.overview.
#     And then we'll apply it to all 3 at one time.
#
#     And we'll compare the results to known Movie.genres_primary.
#
#        Recall Movies actually had an array of genres, and we derived genres_primary
#        as the genres in the first position inside the array.
#
#  .  Our graph, our source of data, was prepared/loaded in NoteBook 20*.



#  Part 01: Graph setup, and initial read

In [None]:

#  The KatanaGraph remote API is expected to run from a node external to
#  the Katana Graph cluster itself.
#
#  This differs from the distributed API, which is meant to run primitives
#  on the Katana Graph worker nodes.
#

from katana import remote
from katana.remote import import_data

my_client = remote.Client()

print(my_client)


In [None]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph1" 

print("--")


In [None]:

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


In [32]:

#  Sanity check 
#
display("Number of Graph Nodes: %d" % (my_graph.num_nodes()))
display("Number of Graph Edges: %s" % (my_graph.num_edges()))


#  Sample output,
#
#     'Number of Graph Nodes: 64857'
#     'Number of Graph Edges: 330988'


          0/? [?op/s]

'Number of Graph Nodes: 64857'

          0/? [?op/s]

'Number of Graph Edges: 330988'

In [33]:

#  Looking at the graph
#
l_result = my_graph.query("""

   MATCH (n) -[r]-> (m)
   RETURN n, r, m
   LIMIT 1000                        //  Limit is 25,000 for visualization, smaller is better
   
   """,
   contextualize=True)

l_result.view()

          0/? [?op/s]

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…


<div> 
<img src="./01_Images/10-Movie-Query-1.png" alt="Drawing" style="width: 1600px;"/>
</div>


#  Part 02: Getting data for use by our non-graph classifier

In [None]:

#  We're using a non-graph set of libraries from sklearn. As such, we need to pull
#  the data out of the graph into DataFrames


In [None]:

#  We'll consider these packages to be common knowledge. Else, return to the Compulsaries
#  set of NoteBooks for sample use, introduction ..
#
import numpy as np
import pandas as pd
   #
import dask.array as da
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

from tabulate import tabulate


   ###
    
    
from sklearn.feature_extraction.text import CountVectorizer            #  Build a sparse vector (matrix) of keywords
from sklearn.feature_extraction.text import TfidfTransformer           #  Tool to norrmalize a condition; words that occur frequently, versus giving more weight to infrequent words
   #                                                                   #        Ie., Having 6 toes is way more rare than having 5. Should I give more weight that a person is 5'10", or has 6 toes ?
from sklearn.naive_bayes import MultinomialNB                          #  A non-graph ML routine, a Classifier  (similar perhaps to node property prediction, but on non-graph)
                                                                       #     Several variants of NaiveBayes, one being 'multi-nomial'.
                                                                       #     'multi-nomial', used commonly for word count style problems
        
from sklearn.pipeline import Pipeline                                  #  Allow us to simplify exuection of many sequential steps

print("--")


In [44]:

l_movies = [ "8469", "11848" ]

l_query  = """

   MATCH (n: Movies) 
   
   WHERE n.id IN [ 0
   // WHERE n.id = "8469"
   // WHERE n.title = 'Animal House'
   
   RETURN n // .title
   LIMIT 5

   """.format(l_movies)

l_result = my_graph.query(l_query)


display(print(l_result[0:5]))
display(l_result.table())






          0/? [?op/s]

OperationError: FWDXqqTtC787CjTR2v9dhrVoqQ6ksgPzytnjiSa89sDN-2sJyFsjJJZSxyXKMq backtrace (QueryOperation.cpp:67): backtrace (Network.h:331): backtrace (QueryOperation.cpp:70): backtrace (QueryClient.cpp:624): opgraph check failed (OpGraph.cpp:458): Type Checking Pass (OpGraphErrorChecker.cpp:12): backtrace (TypeChecker.cpp:1757): backtrace (OpGraph.cpp:716): backtrace (OpGraph.cpp:716): backtrace (OpGraph.cpp:692): backtrace (TypeChecker.cpp:1216): checking func __katana_internal_in_list (TypeChecker.cpp:334): backtrace (TypeChecker.cpp:990): Expected list type for IN, got large_utf8 (ListFunc.cpp:440): TCK = SyntaxError:InvalidArgumentType
Katana = ArgumentTypeError:ExpectedListType: TCK = SyntaxError:InvalidArgumentType
Katana = ArgumentTypeError:ExpectedListType: TCK = SyntaxError:InvalidArgumentType
Katana = ArgumentTypeError:ExpectedListType

In [None]:

print(type(l_result["m.id"]))
print(l_result)


#  Introduction to CountVectorizer

In [None]:

#  Working with  sklearn.feature_extraction.text.CountVectorizer
#

#  .  The results array will be automatically sorted by input key value.
#        ..  So below, baseball will occupy the output's zero'th position,
#            where cricket will occupy the 1'st.
#  .  Words will be automatically split on whitespace, other.
#        ..  So I plan to do my own splitting beforehand, lest I not know what the output array represents.
#               O-Hare          was not split
#               football-helmet was split
#  .  Duplicates do receive multiple entries in the output.
#
#  .  2d integer64 array is output;
#        ..  1st col is a counter, index into array
#        ..  2nd col is for each word in the array, and the reference to 1st col, this word's position in the input
#

#  Just vectorizers to start-
#
my_cv = CountVectorizer()

my_data_arr = [ 
   "baseball",
   "football",
   "cricket",
   "golf",
   "racing",
   "fencing",
   "cricket",
]


my_cv_out = my_cv.fit_transform(my_data_arr)
   #
#  my_cv_out = my_cv.fit_transform(dd_airports.airport_code)
#  my_cv_out = my_cv.fit_transform(dd_airports.airport_name)


print(type(my_cv_out))                     #  <class 'scipy.sparse._csr.csr_matrix'>

                                           #  For my_data_arr above
                                           #  -----------------------------------
print(my_cv_out.shape)                     #  (7, 6)
print(my_cv_out      )                     #  (0, 0)    1
                                           #  (1, 3)    1
                                           #  (2, 1)    1
                                           #  (3, 4)    1
                                           #  (4, 5)    1
                                           #  (5, 2)    1
                    
                                           #  For dd_airports.airport_code above
                                           #  -----------------------------------
                                           #  (5, 5)
                                           #  (0, 2)    1
                                           #  (1, 3)    1
                                           #  (2, 4)    1
                                           #  (3, 1)    1
                                           #  (4, 0)    1
 
                                           #  For dd_airports.airport_name above
                                           #  -----------------------------------
                                           #  (5, 8)
                                           #  (0, 6)    1
                                           #  (1, 1)    1
                                           #  (1, 3)    1
                                           #  (2, 7)    1
                                           #  (2, 4)    1
                                           #  (3, 5)    1
                                           #  (3, 0)    1
                                           #  (4, 2)    1
        
print("")

df_words = pd.DataFrame(my_cv_out.toarray())
   #
for l_each in df_words.iterrows():
   print(l_each)                 


#  For my_data_arr above  (printed as it is output)
# 
#  (0, 0    1 1    0 2    0 3    0 4    0 5    0 Name: 0, dtype: int64)
#  (1, 0    0 1    0 2    0 3    1 4    0 5    0 Name: 1, dtype: int64)
#  (2, 0    0 1    1 2    0 3    0 4    0 5    0 Name: 2, dtype: int64)
#  (3, 0    0 1    0 2    0 3    0 4    1 5    0 Name: 3, dtype: int64)
#  (4, 0    0 1    0 2    0 3    0 4    0 5    1 Name: 4, dtype: int64)
#  (5, 0    0 1    1 2    1 3    0 4    0 5    0 Name: 5, dtype: int64)
#  (6, 0    0 1    1 2    0 3    0 4    0 5    0 Name: 6, dtype: int64)

#  Above better formatted as,
#
#  (0,    0 1    1 0    2 0    3 0   4 0   5 0   Name: 0, dtype: int64)
#  (1,    0 0    1 0    2 0    3 1   4 0   5 0   Name: 1, dtype: int64)
#  (2,    0 0    1 1    2 0    3 0   4 0   5 0   Name: 2, dtype: int64)
#  (3,    0 0    1 0    2 0    3 0   4 1   5 0   Name: 3, dtype: int64)
#  (4,    0 0    1 0    2 0    3 0   4 0   5 1   Name: 4, dtype: int64)
#  (5,    0 0    1 1    2 1    3 0   4 0   5 0   Name: 5, dtype: int64)
#  (6,    0 0    1 1    2 0    3 0   4 0   5 0   Name: 6, dtype: int64)
#
#   A     B C    B C    B C   ......
#
#  So,
#     A   == row number, offset into the array, 0-6 (7) total rows   from my_data_arr
#     B   == col number, offset inside the row, 0-5 (6) unique words from my_data_arr
#     C   ==  1|0  is this keyword  0-5 (6)  found in this row  0-6 (7)
#
#  So, if you had  1000  input records times  20  unique words, the array would be  1000x20
#
#
#  If the value of row-2 was (football, cricket, golf), its entry would appear as,
# 
#  (1,    0 0    1 1    2 0    3 1   4 1   5 0 Name: 1, dtype: int64)
#
#     Recall that the unique keywords sort as; (0)baseball (1)cricket (2)fencing (3)football (4)golf (5)racing


print("--")



#  A further example, first step, training phase

In [None]:

#  Above was just a vectorizer-
#
#  Now, multiple cells, we extend into classifiers. IE., fraud|not-fraud, or multi-value .. ..


#  More complete [ training ] data, as required ..
#
#     .  data[]       -- Our records, emails, nodes in a graph, .. in this case, a simple word
#     .  class[]      -- Our unique list of node labels, for example
#     .  target[]     -- Matches data[] above, the code of the node label.

my_train = {}

my_train["data"] = [                                        #  Just words, imagine these to be fuller records shortly.
   "baseball" ,
   "football" ,
   "cricket"  ,
   "golf"     ,
   "racing"   ,
   "fencing"  ,
      #
   "eggs"     ,
   "bread"    ,
   "cheese"   ,
   "wine"     ,
   ]
   #
my_train["class"] = [                                       #  No direct relation to the array above, yet.
   "sport",
   "food" ,
   ]


my_train["data"].sort()                                     #  We could have entered the data pre-sorted.
my_train["class"].sort()                                    #  Here, just reminding us that we should sort for ease of viewing.


my_train["target"] = [1, 0, 0, 1, 0, 1, 1, 1, 1, 0]         #  These values correspond to those in my_train["data"]; they align by offset into the array
                                                            #  Also, the value of  0,1,n  corrsepond to the position in the my_train["class"] array
    
   ###

    
for l_index, l_zip in enumerate(zip(my_train["data"], my_train["target"])):
   print("Data: %-18s   Class: %s" % (l_zip[0], my_train["class"][l_zip[1]]) )

#  Sample output,
#
#     Data: baseball             Class: sport
#     Data: bread                Class: food
#     Data: cheese               Class: food
#     Data: cricket              Class: sport
#     Data: eggs                 Class: food
#     Data: fencing              Class: sport
#     Data: football             Class: sport
#     Data: golf                 Class: sport
#     Data: racing               Class: sport
#     Data: wine                 Class: food


In [None]:

# #  So we've built the vector before-  Now we do it with new data
#
#  Here we add steps for,
#
#     .  Tf-idf      -- Weighting for words found often (less useful) versus words found rarely (more useful)
#     .  NaiveBayes  -- An ML routine similar to node proprty prediction, before there were graphs

#     **  We will replace all of this cell in the next cell, using a 'pipeline'


my_cv = CountVectorizer()

#  Step 1 of 3 ..
#
#  Old:  Same as we did before/above
#
my_cv_out = my_cv.fit_transform(my_train["data"])
#  print(my_cv_out.shape)


#  Step 2 of 3 ..
#
#  New:  Now normalize fact that longer (documents/Nodes/other) have more words and would get unfair weights
#        term frequency / inverse document frequency
#        (Tf-idf)
#
#        Effectively, reduce the weight of words that occur in more (documents/Nodes/other),
#           in favor of words that occur in fewer (documents/Nodes/other)
#
my_tfidf_train    = TfidfTransformer().fit_transform(my_cv_out)
#  print(my_tfidf_train.shape)


#  Step 3 of 3 ..
#
#  New: 'train' the classifier
#
my_classifier = MultinomialNB().fit(my_tfidf_train, my_train["target"])


print("--")


In [None]:

#  The above had 3 steps, generally always used in sequence.
#  Make this easier; use a pipeline
#

my_pipeline = Pipeline([
   ("cv",     CountVectorizer()  ),
   ("tfidf",  TfidfTransformer() ),
   ("clf",    MultinomialNB()    ),
   ])


#  Here is the pipeline replacing the cell above-
#
#     (So this work was already done, Now we're showing it done more simply.)
#
my_classifier = my_pipeline.fit(my_train["data"], my_train["target"])


print("--")


#  Final piece, scoring (applying)

In [None]:

#  Now we are ready to score (apply) the model.
#


#  Here is the test data
#
my_test = {}
   #
my_test["data"] = [ 
   "I hate baseball" ,
   "I like wine"     ,
   "eggs with cheese",
   "eggs with cheese football",
   "football with eggs and cheese",
   "Wisconsin, America's Dairyland",
   ]


#  Applying/scoring the model
#
my_result = my_classifier.predict(my_test["data"])



#  Recall that the sorted class array is,
#
#     my_train["class"] = [ "food" , "sport", ]


my_test["target"] = [1, 0, 0, 0, 0, 0]                                     #  We only need this to gauge accuracy below, and the last one is wrong



display("Average accuracy: %f" % (np.mean(my_result == my_test["target"])) )
   #
for l_index, l_zip in enumerate(zip(my_test["data"], my_test["target"])):
   print("Data: %-36s   Correct Class: %-10s   Predict Class: %-10s" % (l_zip[0], my_train["class"][l_zip[1]], my_train["class"][my_result[l_index]]) )

#  Sample output,
#
#     'Average accuracy: 0.833333'
#
#     Data: I hate baseball                        Correct Class: sport        Predict Class: sport     
#     Data: I like wine                            Correct Class: food         Predict Class: food      
#     Data: eggs with cheese                       Correct Class: food         Predict Class: food      
#     Data: eggs with cheese football              Correct Class: food         Predict Class: food      
#     Data: football with eggs and cheese          Correct Class: food         Predict Class: food      
#     Data: Wisconsin, America's Dairyland         Correct Class: food         Predict Class: sport     

