In [64]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

## Data Pre-Processing
- Read in the labels and convert them to a k-encoding: each node gets a vector of size 1 × K of zeros, with the i-th entry set to 1 for every group i the node belongs to
- Read in the node embeddings, convert them to a dictionary (key: node, value: vector in the embedding space), then convert the dictionary to a DataFrame
- Sort the columns of both DataFrames by node so that they line up. The resulting dimensions are **Labels: K × N, Patterns: D × N**

In [65]:
# Length of each encoded label vector (K). Group ids in the CSV are treated as
# 1-based, so the largest id that fits is NUM_GROUPS.
NUM_GROUPS = 39

# Two header-less columns: column 0 is the node, column 1 is one of its groups.
labels = pd.read_csv('group-edges.csv', header=None)

# Collect every group id listed for each node. Appending to one list per node
# avoids rebuilding the list (and a scalar .iloc lookup) on every row.
label_dict = dict()
for node, group in zip(labels.iloc[:, 0], labels.iloc[:, 1]):
    label_dict.setdefault(node, []).append(group)

# Turn each node's group list into a 0/1 membership vector of length NUM_GROUPS.
labels_encoded = dict()
for node, groups in label_dict.items():
    membership = np.zeros([NUM_GROUPS,])
    # Shift the 1-based group ids to 0-based positions before setting them.
    membership[np.array(groups) - 1] = 1
    labels_encoded[node] = membership

# One column per node (K rows); order the columns by node so they can be
# aligned with the embedding frame.
k_encoded = pd.DataFrame(labels_encoded)
encoded_sorted = k_encoded.reindex(sorted(k_encoded.columns), axis=1)

In [66]:
# Load the node embeddings. The first line is a header whose first field is the
# number of nodes; every following line is "<node> <v1> ... <vD>".
# The file is opened read-only ("r+" would require write permission for no
# benefit) inside a context manager so the handle is always closed.
data = dict()
with open("blog3.emd", "r") as emd_file:
    first_line = emd_file.readline()
    header_fields = first_line.split()
    if not header_fields:
        raise ValueError("blog3.emd is empty: missing header line")
    num_of_nodes = int(header_fields[0])

    for node_num in range(num_of_nodes):
        line = emd_file.readline()
        # split() with no argument ignores repeated spaces and the trailing newline.
        temp_data = [float(val) for val in line.split()]
        if not temp_data:
            raise ValueError(
                "blog3.emd: expected %d embedding rows, found a blank line or "
                "end of file at row %d" % (num_of_nodes, node_num)
            )
        # The first field is the node id, the remainder is its embedding vector.
        data[int(temp_data[0])] = np.array(temp_data[1:])

# One column per node (D rows); order the columns by node so they can be
# aligned with the label frame.
patterns = pd.DataFrame(data)
patterns_sorted = patterns.reindex(sorted(patterns.columns), axis=1)

## Data classification and evaluation
- Produce a train-test split of the patterns and labels, transposing them so that their dimensions become N × D and N × K respectively
- Fit each classifier, then evaluate its Micro and Macro F1 scores

In [67]:
# Classifiers to compare; append further estimators here to extend the run.
# Every estimator that accepts a random_state gets the same seed.
seed = 0
clf_list = [
    DecisionTreeClassifier(random_state=seed),
    KNeighborsClassifier(n_neighbors=3),
    MLPClassifier(max_iter=500, random_state=seed),
    RandomForestClassifier(random_state=seed),
]

In [68]:
def evaluate_clf(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and print its F1 scores on the test split.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``predict(X)``
        The classifier to train; it is fitted in place.
    X_train, y_train
        Training patterns and labels, passed to ``clf.fit``.
    X_test, y_test
        Held-out patterns and labels; ``clf.predict(X_test)`` is scored
        against ``y_test``.

    Returns
    -------
    None
        The macro- and micro-averaged F1 scores are printed, not returned.
    """
    clf.fit(X_train, y_train)
    # Predict once and reuse the result for both averages; running predict a
    # second time only repeats the same work.
    y_pred = clf.predict(X_test)
    print("Macro f1: ", f1_score(y_test, y_pred, average='macro'))
    print("Micro f1: ", f1_score(y_test, y_pred, average='micro'))

In [69]:
# Both frames hold one column per node, so transpose them to get one row per
# node before splitting: patterns become N x D and labels N x K.
patterns_by_node = patterns_sorted.values.T
labels_by_node = encoded_sorted.values.T
X_train, X_test, y_train, y_test = train_test_split(
    patterns_by_node,
    labels_by_node,
    test_size=.20,
    random_state=0,
)

# Train and score every candidate on the same split, labelling each report
# with the classifier's class name.
for clf in clf_list:
    print(type(clf).__name__, "\n")
    evaluate_clf(clf, X_train, X_test, y_train, y_test)
    print("\n\n")

DecisionTreeClassifier 



  'recall', 'true', average, warn_for)


Macro f1:  0.09228766961214697
Micro f1:  0.17308347529812607



KNeighborsClassifier 



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Macro f1:  0.12238584795801344
Micro f1:  0.25735687533440343



MLPClassifier 



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Macro f1:  0.20726621584938756
Micro f1:  0.3400247831474597



RandomForestClassifier 

Macro f1:  0.05712354425418981
Micro f1:  0.1379980563654033





  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
