# Notebook for Testing K-NN on English Letter Dataset

In [1]:
# standard library
import os
import json
from collections import Counter

# 3rd party library
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score


# local classes
os.chdir('../')
from TraversalDistance.Graph import Graph
from TraversalDistance.FreeSpaceGraph import FreeSpaceGraph
from TraversalDistance.KNeighborsClassifier import KNeighborsClassifier
os.chdir('letter_data')

### Loading Files into Graph Classes

In [2]:
def json_to_graph(json_data):    
    graph = Graph()
    graph.name = json_data['gxl']['graph'][0]['$']['id'][0]
    
    # Extract X, Y coordinates from 'node' elements.
    for node_data in json_data['gxl']['graph'][0]['node']:
        node_id = int(node_data['$']['id'][1:])
        x_coord = float(node_data['attr'][0]['float'][0])
        y_coord = float(node_data['attr'][1]['float'][0])
        graph.addNode(node_id, x_coord, y_coord)

    # Extract edges from 'edge' elements.
    for i, edge_data in enumerate(json_data['gxl']['graph'][0]['edge'], 1):
        from_node = int(edge_data['$']['from'][1:])
        to_node = int(edge_data['$']['to'][1:])
        graph.connectTwoNodes(i, from_node, to_node)
        
    return graph, graph.name

# Check if input correct return True for 0 distance between two identical graphs.
def is_valid(json_graph):
    g1, n1 = json_to_graph(json_graph)
    g1.id = 0
    g2, n2 = json_to_graph(json_graph)
    g2.id = 1
    fsg = FreeSpaceGraph(g1, g2, 0.001)
    check = fsg.DFSTraversalDist()
    return check

# Generates dataset inputs and labels for machine learning. 
# Filters labels to include in dataset. 
def graph_data():
    file_names = os.listdir("LOW")
    X, y = list(), list()

    for index, file_name in enumerate(file_names):
        if file_name.endswith('.json'):
            try:         
                json_graph = json.load(open(f"LOW/{file_name}"))
                graph, name = json_to_graph(json_graph)
                
                if  is_valid(json_graph):
                    graph.id = index         
                    X.append(graph)
                    y.append(name)
                else:
                    print(f"Distance failed to compute {file_name}.")
                    
                                        
            except Exception as error: 
                print(f"AssertionError {error}: Fail to parse {file_name}.")
                
    return X, y

### Creating Test/Train Split Datasets

In [3]:
X, y = graph_data()

AssertionError 'edge': Fail to parse VP1_0125.json.
AssertionError 'edge': Fail to parse LP1_0103.json.
AssertionError 'gxl': Fail to parse test.json.
AssertionError 'edge': Fail to parse LP1_0136.json.
AssertionError 'edge': Fail to parse LP1_0086.json.
AssertionError 'edge': Fail to parse LP1_0068.json.
AssertionError 'edge': Fail to parse VP1_0086.json.
AssertionError 'gxl': Fail to parse validation.json.
AssertionError 'gxl': Fail to parse train.json.
AssertionError 'edge': Fail to parse IP1_0110.json.


In [4]:
print("Classes:")
for key , val in Counter(y).items():
    print(" ", key, ":", val)

Classes:
  T : 150
  A : 150
  X : 150
  H : 150
  Y : 150
  I : 149
  E : 150
  N : 150
  L : 146
  M : 150
  V : 148
  Z : 150
  F : 150
  W : 150
  K : 150


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print('Train count:', len(X_train))
print('Train count:', len(X_test))

Train count: 1794
Train count: 449


### Sampling Dataset (for testing)

In [6]:
n_train, n_test, sample = 50, 50, True

if sample: 
    X_train, X_test, y_train, y_test = X_train[:n_train], X_test[:n_test], y_train[:n_train], y_test[:n_test]

### Initializing and Fitting Traversal Distance K-NN  

In [7]:
model = KNeighborsClassifier(n_neighbors=30, mean='max', left=0, right=5, precision=0.01)
model.fit(X_train, y_train)

### Observing Model Precision

In [8]:
y_pred, log = model.predict(X_test)

50


AttributeError: Can't pickle local object 'KNeighborsClassifier.predict.<locals>.task'

### Saving Predictions. 

In [None]:
filename = 'knn_log.csv'

with open(filename, 'w') as f:
    for i, (y_hat, n_classifications) in enumerate(log):
        row = [y_test[i], y_hat] + n_classifications
        f.write(','.join(row) + '\n')

In [None]:
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')

print(f"Precision: {precision}")
print(f"Recall: {recall}")

Precision: 0.03111111111111111
Recall: 0.15333333333333332


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
