# Notebook for Testing K-NN on English Letter Dataset

In [1]:
# standard library
import os
import json

# 3rd party library
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# local classes
os.chdir('../')
from TraversalDistance.Graph import Graph
from TraversalDistance.FreeSpaceGraph import FreeSpaceGraph
from TraversalDistance.KNeighborsClassifier import KNeighborsClassifier
os.chdir('letter_data')

### Loading Files into Graph Classes

In [2]:
def json_to_graph(json_data):    
    graph = Graph()
    graph.name = json_data['gxl']['graph'][0]['$']['id'][0]
    
    # Extract X, Y coordinates from 'node' elements.
    for node_data in json_data['gxl']['graph'][0]['node']:
        node_id = int(node_data['$']['id'][1:])
        x_coord = float(node_data['attr'][0]['float'][0])
        y_coord = float(node_data['attr'][1]['float'][0])
        graph.addNode(node_id, x_coord, y_coord)

    # Extract edges from 'edge' elements.
    for i, edge_data in enumerate(json_data['gxl']['graph'][0]['edge'], 1):
        from_node = int(edge_data['$']['from'][1:])
        to_node = int(edge_data['$']['to'][1:])
        graph.connectTwoNodes(i, from_node, to_node)
        
    return graph, graph.name

# Check if input correct return True for 0 distance between two identical graphs.
def is_valid(json_graph):
    g1, n1 = json_to_graph(json_graph)
    g1.id = 0
    g2, n2 = json_to_graph(json_graph)
    g1.id = 1
    fsg = FreeSpaceGraph(g1, g2, 0.001)
    check = fsg.DFSTraversalDist()
    return check

# Generates dataset inputs and labels for machine learning. 
# Filters labels to include in dataset. 
def graph_data(letters):
    file_names = os.listdir("LOW")
    X, y = list(), list()

    for index, file_name in enumerate(file_names):
        if file_name.endswith('.json'):
            try:         
                json_graph = json.load(open(f"LOW/{file_name}"))
                
                if is_valid(json_graph=json_graph):
                    graph, name = json_to_graph(json_graph)
                    graph.id = index         
                    
                    # Sampling n number of classes. 
                    if name in letters: 
                        X.append(graph)
                        y.append(name)
                    
            except Exception as error: 
                print(f"AssertionError {error}: Fail to parse {file_name}.")
                
    return X, y

### Creating Test/Train Split Datasets

In [3]:
letters = ['N', 'L', 'M']
X, y = graph_data(letters)

AssertionError float division by zero: Fail to parse TP1_0107.json.
AssertionError float division by zero: Fail to parse AP1_0028.json.
AssertionError float division by zero: Fail to parse XP1_0130.json.
AssertionError float division by zero: Fail to parse HP1_0136.json.
AssertionError float division by zero: Fail to parse EP1_0077.json.
AssertionError float division by zero: Fail to parse EP1_0132.json.
AssertionError math domain error: Fail to parse TP1_0042.json.
AssertionError float division by zero: Fail to parse HP1_0073.json.
AssertionError float division by zero: Fail to parse XP1_0075.json.
AssertionError math domain error: Fail to parse NP1_0043.json.
AssertionError math domain error: Fail to parse EP1_0020.json.
AssertionError math domain error: Fail to parse AP1_0090.json.
AssertionError float division by zero: Fail to parse HP1_0024.json.
AssertionError float division by zero: Fail to parse TP1_0015.json.
AssertionError float division by zero: Fail to parse ZP1_0108.json.


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

print('Train count:', len(X_train))
print('Train count:', len(X_test))

Train count: 367
Train count: 41


### Initializing and Fitting Traversal Distance K-NN  

In [5]:
model = KNeighborsClassifier(n_neighbors=5, mean='arithmetic', left=0, right=5, precision=0.01)
model.fit(X_train, y_train)

### Observing Model Precision

In [6]:
y_pred = model.predict(X_test)


*** Computing Prediction ***
Nearest Classifications: ['L', 'L', 'L', 'L', 'L']
Predicted Classification: L

*** Computing Prediction ***
Nearest Classifications: ['M', 'M', 'M', 'M', 'M']
Predicted Classification: M

*** Computing Prediction ***
Nearest Classifications: ['L', 'L', 'L', 'L', 'L']
Predicted Classification: L

*** Computing Prediction ***
Nearest Classifications: ['N', 'N', 'N', 'N', 'N']
Predicted Classification: N

*** Computing Prediction ***
Nearest Classifications: ['L', 'L', 'L', 'L', 'L']
Predicted Classification: L

*** Computing Prediction ***
Nearest Classifications: ['L', 'L', 'L', 'L', 'L']
Predicted Classification: L

*** Computing Prediction ***
Nearest Classifications: ['N', 'N', 'N', 'N', 'N']
Predicted Classification: N

*** Computing Prediction ***
Nearest Classifications: ['M', 'M', 'M', 'M', 'M']
Predicted Classification: M

*** Computing Prediction ***
Nearest Classifications: ['L', 'L', 'L', 'L', 'L']
Predicted Classification: L

*** Computing Pred

In [7]:
#precision = precision_score(y_test, y_pred, pos_label=letters[0])
#recall = recall_score(y_test, y_pred, pos_label=letters[0])

precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')

print(f"Precision: {precision}")
print(f"Recall: {recall}")

Precision: 1.0
Recall: 1.0
