## 1. Data processing
* How to embed data into digit space?
* Because in the testing phase, there is no data about which project is in used, it's straight-forward to merge four sources of data into one.
* With baseline model, I encode item name with one-hot technique which does not reveal any correlation between different items.
* The length of each item is scaled to [-1, 1].
* Direction is one important feature.
* The output is the most challenging part which requires clear and wise encoding strategy. With baseline model, I define 3 different types of elements: beginning of a group, ending of a group and normal. This encoding will allow model to retrieve the least knowledge.

* Training with cross-validation strategy.

In [171]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import json
from sklearn.preprocessing import OneHotEncoder

In [172]:
def read_data(folder_name):
    ## Read data from folder, merge all sources into a list
    files = os.listdir(folder_name)
    data = []
    for file in files:
        with open(os.path.join(folder_name, file)) as f:
            data += json.load(f)
    return data

In [173]:
data = read_data("Locofy_MLE_Challenge_Groupings/Data")

In [174]:
## One-hot encode items
list_of_items = []
for sample in data:
    list_of_items += [element[0] for element in sample["input"]]

In [175]:
## Scale item length
item_length = []
for sample in data:
    for each in sample["input"]:
        item_length.append(each[1])

In [176]:
# Direction encoding
direction = []
for sample in data:
    direction += [sample['direction']] * len(sample['input'])

direction_encoder = preprocessing.LabelEncoder()
direction_encoding = direction_encoder.fit_transform(direction)

In [177]:
# Output encoding
output_encoding = []
for sample in data:
    encoding = []
    sample_output = sample['output']
    sample_input = np.array([item[0] for item in sample['input']])
    indices = np.where(sample_input == '0')[0]
    for each in sample_output:
        if isinstance(each, list) and len(each) > 1:
            encoding.append("start")
            encoding += ["in"] * (len(each) - 2)
            encoding.append("end")
        else:
            encoding.append("out")
    for each in indices:
        encoding.insert(each, "space")
    output_encoding += encoding

In [178]:
sample_ids = []
for i, sample in enumerate(data):
    sample_ids += [i]*len(sample['input'])

In [179]:
df_data = pd.DataFrame({
    "sample_id": sample_ids,
    "item_id": list_of_items,
    "item_length": item_length,
    "direction": direction_encoding,
    "label": output_encoding
})

In [180]:
item_one_hot = pd.get_dummies(df_data['item_id']).astype(int)

df_data.drop(columns=['item_id'], inplace=True)
df_data = df_data.join(item_one_hot)

In [181]:
label = pd.get_dummies(df_data['label']).astype(int)
df_data.drop(columns=['label'], inplace=True)

In [182]:
scaler = preprocessing.MinMaxScaler()
df_data['item_length'] = df_data['item_length'].astype(float)
df_data['item_length'] = scaler.fit_transform(df_data['item_length'].values.reshape(1, -1)).reshape(-1)

In [183]:
df_data['item_length']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
22581    0.0
22582    0.0
22583    0.0
22584    0.0
22585    0.0
Name: item_length, Length: 22586, dtype: float64

In [184]:
df_data

Unnamed: 0,sample_id,item_length,direction,0,a,b,c,d,e,f,...,o,p,q,r,s,t,u,v,w,x
0,0,0.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22581,2047,0.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22582,2047,0.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
22583,2047,0.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22584,2047,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [185]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers, CRF
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import metrics

In [186]:
sample_id = df_data['sample_id']
df_data.drop(columns=['sample_id'], inplace=True)

In [187]:
X = []
Y = []
for i in range(sample_id.max() + 1):
    X.append(df_data[sample_id == i].values.tolist())
    Y.append(label[sample_id == i].values.tolist())

In [188]:
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [189]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import classification_report, make_scorer

In [194]:
X[0][0]

[0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [191]:
crf.fit(X=X, y=Y)

TypeError: expected bytes, list found

In [190]:
pred = cross_val_predict(estimator=crf, X=X, y=Y, cv=5)

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'