# Notebook for Testing K-NN on English Letter Dataset

In [1]:
# standard library
import os
import json
from collections import Counter
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

# 3rd party library
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import precision_score, recall_score


# local classes
os.chdir('../')
from TraversalDistance.Graph import Graph
from TraversalDistance.FreeSpaceGraph import FreeSpaceGraph
from TraversalDistance.KNeighborsClassifier import KNeighborsClassifier
os.chdir('letter_data')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Loading Files into Graph Classes

In [2]:
def json_to_graph(json_data):    
    graph = Graph()
    graph.name = json_data['gxl']['graph'][0]['$']['id'][0]
    
    # Extract X, Y coordinates from 'node' elements.
    for node_data in json_data['gxl']['graph'][0]['node']:
        node_id = int(node_data['$']['id'][1:])
        x_coord = float(node_data['attr'][0]['float'][0])
        y_coord = float(node_data['attr'][1]['float'][0])
        graph.addNode(node_id, x_coord, y_coord)

    # Extract edges from 'edge' elements.
    for i, edge_data in enumerate(json_data['gxl']['graph'][0]['edge'], 1):
        from_node = int(edge_data['$']['from'][1:])
        to_node = int(edge_data['$']['to'][1:])
        graph.connectTwoNodes(i, from_node, to_node)
        
    return graph, graph.name

# Check if input correct return True for 0 distance between two identical graphs.
def is_valid(json_graph):
    g1, n1 = json_to_graph(json_graph)
    g1.id = 0
    g2, n2 = json_to_graph(json_graph)
    g2.id = 1
    fsg = FreeSpaceGraph(g1, g2, 0.001)
    check = fsg.DFSTraversalDist()
    return check

# Generates dataset inputs and labels for machine learning. 
# Filters labels to include in dataset. 
def graph_data():
    file_names = os.listdir("LOW")
    X, y = list(), list()

    for index, file_name in enumerate(file_names):
        if file_name.endswith('.json'):
            try:         
                json_graph = json.load(open(f"LOW/{file_name}"))
                graph, name = json_to_graph(json_graph)
                
                if  is_valid(json_graph):
                    graph.id = index         
                    X.append(graph)
                    y.append(name)
                else:
                    print(f"Distance failed to compute {file_name}.")
                    
                                        
            except Exception as error: 
                print(f"AssertionError {error}: Fail to parse {file_name}.")
                
    return X, y

### Creating Test/Train Split Datasets

In [3]:
X, y = graph_data()

N, SAMPLE = 100, True

if SAMPLE: 
    X, y = X[:N], y[:N]

AssertionError 'edge': Fail to parse VP1_0125.json.
AssertionError 'edge': Fail to parse LP1_0103.json.
AssertionError 'gxl': Fail to parse test.json.
AssertionError 'edge': Fail to parse LP1_0136.json.
AssertionError 'edge': Fail to parse LP1_0086.json.
AssertionError 'edge': Fail to parse LP1_0068.json.
AssertionError 'edge': Fail to parse VP1_0086.json.
AssertionError 'gxl': Fail to parse validation.json.
AssertionError 'gxl': Fail to parse train.json.
AssertionError 'edge': Fail to parse IP1_0110.json.


In [4]:
print("Classes:")
for key , val in Counter(y).items():
    print(" ", key, ":", val)

Classes:
  T : 9
  A : 7
  X : 7
  H : 7
  Y : 9
  I : 8
  E : 8
  N : 7
  L : 6
  M : 5
  V : 6
  Z : 5
  F : 6
  W : 4
  K : 6


### Creating k-fold

In [5]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

### Initializing Traversal Distance K-NN  

###  Model K fold test

In [6]:
def k_fold_test(X_train, X_test, y_train, y_test, fold):
    print(" \n *** Stating Fold Test #:", fold, "Train len:", len(y_train), "Test len:", len(y_test), "***")

    model = KNeighborsClassifier(n_neighbors=7, mean='max', left=0, right=3, precision=0.001)  

    model.fit(X_train, y_train)
    y_pred, log = model.predict(X_test, fold=fold)
    
    filename = f'logs/knn_log_04_25_fold_{fold}.csv'

    with open(filename, 'w') as f:
        for i, (y_hat, n_classifications) in enumerate(log):
            row = [y_test[i], y_hat] + n_classifications
            f.write(','.join(row) + '\n')
    
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
        
    return precision, recall


### Running Test

In [7]:
scores = []

for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
        
    precision, recall = k_fold_test(X_train, X_test, y_train, y_test, fold+1)
    scores.append((precision, recall))
    
df = pd.DataFrame(scores, columns=["precision", "recall"])
df.to_csv('logs/knn_log_04_25.csv', index=True)

 
 *** Stating Fold Test #: 1 Train len: 80 Test len: 20 ***
    Fold 1 - Observation 0 of 20
    Fold 1 - Observation 1 of 20
    Fold 1 - Observation 2 of 20
    Fold 1 - Observation 3 of 20
    Fold 1 - Observation 4 of 20
    Fold 1 - Observation 5 of 20
    Fold 1 - Observation 6 of 20
    Fold 1 - Observation 7 of 20
    Fold 1 - Observation 8 of 20
    Fold 1 - Observation 9 of 20
    Fold 1 - Observation 10 of 20
    Fold 1 - Observation 11 of 20
    Fold 1 - Observation 12 of 20
    Fold 1 - Observation 13 of 20
    Fold 1 - Observation 14 of 20
    Fold 1 - Observation 15 of 20
    Fold 1 - Observation 16 of 20
    Fold 1 - Observation 17 of 20
    Fold 1 - Observation 18 of 20
    Fold 1 - Observation 19 of 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


 
 *** Stating Fold Test #: 2 Train len: 80 Test len: 20 ***
    Fold 2 - Observation 0 of 20
    Fold 2 - Observation 1 of 20
    Fold 2 - Observation 2 of 20
    Fold 2 - Observation 3 of 20
    Fold 2 - Observation 4 of 20
    Fold 2 - Observation 5 of 20
    Fold 2 - Observation 6 of 20
    Fold 2 - Observation 7 of 20
    Fold 2 - Observation 8 of 20
    Fold 2 - Observation 9 of 20
    Fold 2 - Observation 10 of 20
    Fold 2 - Observation 11 of 20
    Fold 2 - Observation 12 of 20
    Fold 2 - Observation 13 of 20
    Fold 2 - Observation 14 of 20
    Fold 2 - Observation 15 of 20
    Fold 2 - Observation 16 of 20
    Fold 2 - Observation 17 of 20
    Fold 2 - Observation 18 of 20
    Fold 2 - Observation 19 of 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


 
 *** Stating Fold Test #: 3 Train len: 80 Test len: 20 ***
    Fold 3 - Observation 0 of 20
    Fold 3 - Observation 1 of 20
    Fold 3 - Observation 2 of 20
    Fold 3 - Observation 3 of 20
    Fold 3 - Observation 4 of 20
    Fold 3 - Observation 5 of 20
    Fold 3 - Observation 6 of 20
    Fold 3 - Observation 7 of 20
    Fold 3 - Observation 8 of 20
    Fold 3 - Observation 9 of 20
    Fold 3 - Observation 10 of 20
    Fold 3 - Observation 11 of 20
    Fold 3 - Observation 12 of 20
    Fold 3 - Observation 13 of 20
    Fold 3 - Observation 14 of 20
    Fold 3 - Observation 15 of 20
    Fold 3 - Observation 16 of 20
    Fold 3 - Observation 17 of 20
    Fold 3 - Observation 18 of 20
    Fold 3 - Observation 19 of 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


 
 *** Stating Fold Test #: 4 Train len: 80 Test len: 20 ***
    Fold 4 - Observation 0 of 20
    Fold 4 - Observation 1 of 20
    Fold 4 - Observation 2 of 20
    Fold 4 - Observation 3 of 20
    Fold 4 - Observation 4 of 20
    Fold 4 - Observation 5 of 20
    Fold 4 - Observation 6 of 20
    Fold 4 - Observation 7 of 20
    Fold 4 - Observation 8 of 20
    Fold 4 - Observation 9 of 20
    Fold 4 - Observation 10 of 20
    Fold 4 - Observation 11 of 20
    Fold 4 - Observation 12 of 20
    Fold 4 - Observation 13 of 20
    Fold 4 - Observation 14 of 20
    Fold 4 - Observation 15 of 20
    Fold 4 - Observation 16 of 20
    Fold 4 - Observation 17 of 20
    Fold 4 - Observation 18 of 20
    Fold 4 - Observation 19 of 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


 
 *** Stating Fold Test #: 5 Train len: 80 Test len: 20 ***
    Fold 5 - Observation 0 of 20
    Fold 5 - Observation 1 of 20
    Fold 5 - Observation 2 of 20
    Fold 5 - Observation 3 of 20
    Fold 5 - Observation 4 of 20
    Fold 5 - Observation 5 of 20
    Fold 5 - Observation 6 of 20
    Fold 5 - Observation 7 of 20
    Fold 5 - Observation 8 of 20
    Fold 5 - Observation 9 of 20
    Fold 5 - Observation 10 of 20
    Fold 5 - Observation 11 of 20
    Fold 5 - Observation 12 of 20
    Fold 5 - Observation 13 of 20
    Fold 5 - Observation 14 of 20
    Fold 5 - Observation 15 of 20
    Fold 5 - Observation 16 of 20
    Fold 5 - Observation 17 of 20
    Fold 5 - Observation 18 of 20
    Fold 5 - Observation 19 of 20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
