<a href="https://colab.research.google.com/github/chaeyoonyunakim/smart-2021-AT_Answer_Type_Prediction/blob/main/SMART2021_AT_Prediction_Task_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Environment setup

In [1]:
# mount google drive
from google.colab import drive

# authorization
drive.mount('/content/drive')

# locate dataset folder
%ls -l '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_train.json'

Mounted at /content/drive
-rw------- 1 root root 9263124 Sep 29 08:07 '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_train.json'


In [2]:
# make the shared Drive project folder the working directory so that the
# relative paths used further down resolve inside it
import os
os.chdir("/content/drive/My Drive/2021_INM363_SMART/")

In [3]:
# import basics
import pandas as pd
import json
import numpy as np
import pickle
seed = 20211001
import regex as re
import time

In [4]:
# import nlp relevants
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# for bag-of-words (bow)
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
# import scikit-learn tools for modelling and evaluation
from sklearn.model_selection import train_test_split

In [6]:
# import algos
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

## 1. Data Loading & Manipulation

***Strip enclosing apostrophes (`'...'`) from questions***

In [7]:
ap_re = re.compile("(^\')(.*)(\'$)")

def remove_appostrophe(x):
  """Strip one enclosing pair of apostrophes from the string ``x``.

  Returns the text between the quotes when ``ap_re`` matches exactly once
  (the string starts and ends with an apostrophe); otherwise returns ``x``
  unchanged.
  """
  found = ap_re.findall(x)
  # each hit is an (opening quote, inner text, closing quote) tuple
  return found[0][1] if len(found) == 1 else x

***Load Dataset*** 

In [8]:
def load_data(path, train = True):
  """Read a SMART task JSON file into a cleaned dataframe.

  Parameters
  ----------
  path : str
    Location of the JSON file, passed straight to ``pd.read_json``.
  train : bool, default True
    When True, rows lacking ``category`` or with an empty ``type`` list are
    dropped as well; when False only ``id`` and ``question`` are required.

  Returns
  -------
  pandas.DataFrame
    An independent frame (not a view of an intermediate), with the original
    row labels kept, so callers can assign into it safely.
  """
  data = pd.read_json(path)

  # map na strings to nan
  # (assign the result back: an in-place replace on a `.loc` selection is not
  # guaranteed to reach `data`, and never does under pandas Copy-on-Write)
  data = data.assign(question = data["question"].replace("n/a", np.nan))

  # drop na in data
  required = ['id', 'question', 'category'] if train else ['id', 'question']
  data = data.dropna(subset = required)

  # remove apostrophes from the start and end of str
  data = data.assign(question = data["question"].map(remove_appostrophe))

  # for the training data remove rows that have no types
  if train:
    has_types = data["type"].map(lambda x : len(x) != 0)
    data = data[has_types].copy()

  return data

In [9]:
# read the 2020 and 2021 train/test splits and time the whole load
t0 = time.time()

train_data2020 = load_data(path = '/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json', train = True)
test_data2020 = load_data(path = '/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_test.json', train = False)

train_data2021 = load_data(path = '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_train.json', train = True)
test_data2021 = load_data(path = '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_test.json', train = False)

# elapsed wall-clock seconds for the four reads
t1 = time.time() - t0
print(t1)

3.4682600498199463


In [10]:
# check data size
train_data2020.shape, test_data2020.shape, train_data2021.shape, test_data2021.shape

((17482, 4), (4378, 4), (36670, 4), (9104, 2))

In [11]:
train_data2020

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."
...,...,...,...,...
17566,dbpedia_7462,Is the flexural strain at break of the acrylon...,boolean,[boolean]
17567,dbpedia_17610,Where did Hilary Putnam receive their Ph.D.?,resource,"[dbo:University, dbo:EducationalInstitution, d..."
17568,dbpedia_505,Who replaced Charles Evans Hughes as the Chief...,resource,"[dbo:Person, dbo:Agent]"
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,"[dbo:River, dbo:Stream, dbo:BodyOfWater, dbo:N..."


***Merge the two training datasets***

In [12]:
# unify the id format: 2020 ids look like "dbpedia_1177", keep only the part after "_".
# Taking the last "_"-separated piece keeps this cell re-runnable: an id that has
# already been stripped contains no "_" and maps to itself instead of raising IndexError.
train_data2020.loc[:, "id"] = train_data2020["id"].map(lambda x: x.split('_')[-1])

In [13]:
# concat two dataset
all_data = pd.concat([train_data2020, train_data2021], axis = 0)

In [14]:
# pre processing to remove duplicates in type
# transform array to string
all_data.loc[:, "type_str"] = all_data["type"].map(lambda x: ",".join(x))

In [15]:
# remove duplicates when question, category, and type are same
merged_data = all_data.drop_duplicates(subset=["question", "category", "type_str"])

***Tabular representation of the dataset***

In [16]:
merged_data.head(3)

Unnamed: 0,id,question,category,type,type_str
0,1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean],boolean
1,14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]","dbo:Opera,dbo:MusicalWork,dbo:Work"
2,16615,When did Lena Horne receive the Grammy Award f...,literal,[date],date


In [17]:
# the helper column was only needed for de-duplication;
# errors = 'ignore' keeps the cell re-runnable once the column is already gone
merged_data = merged_data.drop(columns = ['type_str'], errors = 'ignore')
merged_data.head()

Unnamed: 0,id,question,category,type
0,1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."


In [18]:
# check data size
all_data.shape, merged_data.shape

((54152, 5), (39556, 4))

## 2. Preprocessing

***Extract, Transform, Load (ETL)***

In [19]:
class ETL:
  """Preprocessing helper: question normalisation, bag-of-words vectorisation
  and label encoding.

  Class attributes hold the shared NLTK tools and the fixed category / literal
  label maps. Resource type maps and fitted vectorizers belong to the instance.

  Parameters
  ----------
  path_to_type_maps : str, optional
    Directory containing ``type{N}_map.json`` files written by ``add_type_maps``.
  path_to_vectorizers : str, optional
    Directory containing ``count_vectorizer.pkl`` / ``tfidf_vectorizer.pkl``
    written by ``save_vectorizers``.
  """

###############################################################################
#   Global Variables     
###############################################################################

  # text normalization - stemming, lemmatization, stopwords
  ps = PorterStemmer()
  wordnet_lemmatizer = WordNetLemmatizer() 
  lst_stopwords = stopwords.words("english")

  # keep wh-terms in the text: take the 8 terms below out of the nltk stopwords
  # ('whose' is not listed here - presumably it is absent from the nltk list; verify)
  wh_list = ['who', 'what', 'when', 'where', 'which', 'whom', 'why', 'how']

  for ele in wh_list:
    # guard the removal so a stopword list lacking a term cannot break the class definition
    if ele in lst_stopwords:
      lst_stopwords.remove(ele)

  # set default feature_extraction parameters
  count_vectorizer = None 
  inv_count_vectorizer_vocab = None
  tfidf_vectorizer = None
  inv_tfidf_vectorizer_vocab = None

  # category maps
  category_map = {"boolean": 0, "resource": 1, "literal": 2}
  inv_category_map = {}

  for label, ind in category_map.items():
    inv_category_map[ind] = label

  # literal maps
  literal_map = {"date": 0, "string": 1, "number": 2}
  inv_literal_map = {}

  for label, ind in literal_map.items():
    inv_literal_map[ind] = label  

  # resource maps (empty defaults only; every instance gets its own dicts in __init__)
  type_maps = {}
  invtype_maps = {}


###############################################################################
#   Main
###############################################################################

  def __init__(self, path_to_type_maps = None, path_to_vectorizers = None):
    """Optionally restore saved type maps and vectorizers from disk.

    Raises
    ------
    NotImplementedError
      If a file in ``path_to_vectorizers`` whose name contains "vectorizer"
      does not start with ``count_`` or ``tfidf_``.
    """

    # give each instance its own maps; mutating the class-level dicts through
    # `self` would silently share (and leak) state between instances
    self.type_maps = {}
    self.invtype_maps = {}

    # load type maps if requested
    if path_to_type_maps is not None:
      base_dir = path_to_type_maps
      paths = [fp for fp in os.listdir(base_dir) if "type" in fp]

      for fp in paths:
        # file names look like "type1_map.json" -> "type1"
        type_name = fp.split("_")[0]
        with open(os.path.join(base_dir, fp), "r") as input_file:
          level_map = json.load(input_file)[type_name]
        self.type_maps[type_name] = level_map
        self.invtype_maps[type_name] = {ind: ontology for ontology, ind in level_map.items()}

    # load data vectorizers if requested
    if path_to_vectorizers is not None:
      base_dir = path_to_vectorizers
      paths = [fp for fp in os.listdir(base_dir) if "vectorizer" in fp]

      for fp in paths:
        vectorizer_name = fp.split("_")[0]
        if vectorizer_name not in ("count", "tfidf"):
          raise NotImplementedError(f"unsupported vectorizer file: {fp!r}")

        # NOTE: pickle.load can execute arbitrary code - only point this at trusted files
        with open(os.path.join(base_dir, fp), "rb") as input_file:
          vectorizer = pickle.load(input_file)

        # rebuild the index -> term lookup that bow_fit creates, when the object is fitted
        inv_vocab = self._invert_vocab(vectorizer) if hasattr(vectorizer, "vocabulary_") else None

        if vectorizer_name == "count":
          self.count_vectorizer = vectorizer
          self.inv_count_vectorizer_vocab = inv_vocab
        else:
          self.tfidf_vectorizer = vectorizer
          self.inv_tfidf_vectorizer_vocab = inv_vocab


  @staticmethod
  def _invert_vocab(vectorizer):
    """Return the index -> term mapping of a fitted vectorizer's ``vocabulary_``."""
    return {ind: term for term, ind in vectorizer.vocabulary_.items()}


  # split training dataset to exclude validation dataset
  # set train:val = 8:2
  def split_data(self, data, val_size = 0.2):
    """Split ``data`` into (train, validation) frames; ``val_size`` is the held-out
    fraction and the module-level ``seed`` fixes the shuffle."""
    df_train, df_test = model_selection.train_test_split(data, test_size = val_size, random_state = seed)
    return df_train, df_test


  # normalization of question sentences
  def _norm_sent(self, sent, rm_stopwords = True, stemming = True, lemmatization = True):
    """Normalise one question string and return it as a space-joined string.

    Steps, in order: tokenise, keep purely alphabetic tokens in lowercase,
    optionally drop stopwords, optionally stem, optionally lemmatise as
    noun, then verb, then adjective.
    """
    # tokenize - convert from string to list
    words = word_tokenize(sent)

    # convert to lowercase and remove punctuations and symbols
    # take if all characters in the string are alphabets and then decapitalize
    sent = [w.lower() for w in words if w.isalpha()] 

    # remove stopwords
    if rm_stopwords:
      sent = [w for w in sent if w not in self.lst_stopwords]    

    # apply stemming 
    if stemming:
      sent = [self.ps.stem(w) for w in sent]

    # apply lemmatization (note: it runs on the output of the stemmer when both are enabled)
    if lemmatization:
      for pos in ("n", "v", "a"):
        sent = [self.wordnet_lemmatizer.lemmatize(w, pos = pos) for w in sent]

    sent = " ".join(sent)
    return sent  


  # add a new column to show how question parsing has done through normalization above
  # for Tabular representation of the dataset
  def norm_data(self, data):   
    """Add a ``question_processed`` column to ``data`` and return it.

    ``data`` itself is modified; the returned object is the same frame.
    """
    data.loc[:, "question_processed"] = data["question"].apply(lambda x: self._norm_sent(x, rm_stopwords = True, lemmatization = True, stemming = True))
    return data



  # vectorization - fit vectorizer to training data
  def bow_fit(self, corpus, type = "tf", max_features = 10000, ngram_range = (1,2)):
    """Fit a bag-of-words vectorizer on ``corpus["question_processed"]``.

    ``type`` selects ``"tf"`` (CountVectorizer) or ``"tfidf"`` (TfidfVectorizer).
    The fitted object and its index -> term lookup are stored on the instance.

    Raises
    ------
    NotImplementedError
      For any other ``type``.
    """

    if type == "tf":
      self.count_vectorizer = feature_extraction.text.CountVectorizer(max_features = max_features, ngram_range = ngram_range)
      self.count_vectorizer.fit(corpus["question_processed"])

      # create a reverse mapping for the vocab
      self.inv_count_vectorizer_vocab = self._invert_vocab(self.count_vectorizer)

    elif type == "tfidf":
      self.tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(max_features = max_features, ngram_range = ngram_range)
      self.tfidf_vectorizer.fit(corpus["question_processed"])
      
      # create a reverse mapping for the vocab
      self.inv_tfidf_vectorizer_vocab = self._invert_vocab(self.tfidf_vectorizer)

    else:
      # raise instead of handing the exception class back to the caller
      raise NotImplementedError(f"unsupported bag-of-words type: {type!r}")


  # transformation
  def bow_transform(self, data, type = "tf"):
    """Vectorise ``data["question_processed"]`` with the fitted ``"tf"`` or ``"tfidf"``
    vectorizer; raises NotImplementedError for any other ``type``."""
    if type == "tf":
      return self.count_vectorizer.transform(data["question_processed"])
    elif type == "tfidf":
      return self.tfidf_vectorizer.transform(data["question_processed"])
    else:
      raise NotImplementedError(f"unsupported bag-of-words type: {type!r}")


  # category maps
  def category_to_int(self, data):
    """Encode the ``category`` column with ``category_map``."""
    return data.category.map(lambda x: self.category_map[x])

  # literal maps
  def literal_to_int(self, data):
    """Encode the first entry of each ``type`` list with ``literal_map``."""
    return data.type.map(lambda x: self.literal_map[x[0]])


  # distribute type by ontology class and encode missing if none
  def type_to_int(self, data, type_no):
    """Encode the ``type_no``-th entry (1-based) of each ``type`` list.

    NOTE(review): rows whose list is shorter than ``type_no`` look up a
    "missing" key, which ``add_type_maps`` never registers, so such rows raise
    KeyError unless the loaded map provides that key. Filter them out first.
    """
    return data.type.map(
        lambda x: self.type_maps[f"type{type_no}"][x[type_no - 1]] 
        if len(x) >= type_no 
        else self.type_maps[f"type{type_no}"]["missing"]
        )


  # resource maps
  def add_type_maps(self, train_data, depth = 6, save = True, path = "resource_types/"):
    """Build ontology -> index maps for the resource rows of ``train_data``.

    One map is built per level ``1 .. depth - 1`` (``depth`` is an exclusive
    upper bound). Indices follow first appearance in the data. With ``save``
    each level is written to ``<path>/type{N}_map.json`` as
    ``{"type{N}": {...}}``, the layout ``__init__`` reads back.
    """

    levels = range(1, depth)
    
    if save:
      os.makedirs(path, exist_ok = True)

    resource_types = train_data[train_data["category"] == "resource"]["type"]
  
    for l in levels:
      type_name = f"type{l}"
      level_map = {}
      inv_level_map = {}

      for types in resource_types:
        # questions with fewer than l types contribute nothing at this level
        if len(types) < l:
          continue
        ontology = types[l-1]
        if (ontology not in level_map) and (ontology != "missing"):
          ind = len(level_map)
          level_map[ontology] = ind
          inv_level_map[ind] = ontology

      self.type_maps[type_name] = level_map
      self.invtype_maps[type_name] = inv_level_map

      if save:
        # write into the requested directory, and only this level's map
        with open(os.path.join(path, f"type{l}_map.json"), "w") as outfile:
          json.dump({type_name: level_map}, outfile)




  # save output
  def save_vectorizers(self, path):
    """Pickle whichever vectorizers have been fitted into the directory ``path``."""

    # make sure directory exists
    os.makedirs(path, exist_ok = True)

    if self.count_vectorizer is not None:
      with open(os.path.join(path, "count_vectorizer.pkl"), "wb") as count_file:
        pickle.dump(self.count_vectorizer, count_file)
    if self.tfidf_vectorizer is not None:
      with open(os.path.join(path, "tfidf_vectorizer.pkl"), "wb") as tfidf_file:
        pickle.dump(self.tfidf_vectorizer, tfidf_file)

In [20]:
etl = ETL()

In [21]:
# split dataset
# for validation 8:2
# df_train, df_val = etl.split_data(merged_data)

In [22]:
# text normalization
# df_train = etl.norm_data(df_train)   # training set without val
df_train = etl.norm_data(merged_data) # training set including val (total)

In [23]:
df_train[['question', 'question_processed']]

Unnamed: 0,question,question_processed
0,Was Jacqueline Kennedy Onassis a follower of M...,jacquelin kennedi onassi follow melkit greek c...
1,What is the name of the opera based on Twelfth...,what name opera base twelfth night
2,When did Lena Horne receive the Grammy Award f...,when lena horn receiv grammi award best jazz v...
3,Do Prince Harry and Prince William have the sa...,princ harri princ william parent
4,What is the subsidiary company working for Leo...,what subsidiari compani work leonard maltin
...,...,...
36665,what kinds of music is played by season's end,what kind music play season end
36666,which asteroid group is 6753 fursenko a member...,which asteroid group fursenko member
36667,What language is azhakiya ravanan filmed in?,what languag azhakiya ravanan film
36668,which position did herby fortunat play in foot...,which posit herbi fortunat play footbal


In [24]:
# vectorization - bag of words model
etl.bow_fit(corpus = df_train, type = "tfidf")

In [25]:
etl.save_vectorizers(path="sklearn_objects")

## 3. Category Prediction Task

In [26]:
# set category prediction dataset
X_train_category = etl.bow_transform(df_train, type = "tfidf")

In [27]:
y_train_category = etl.category_to_int(df_train)

***3 kinds of model in total: 1 for category, 1 for literal, and 1 per type level (5 levels) for resource***

In [28]:
# category classifier: elastic-net logistic regression over the vectorised questions
clf_category = LogisticRegression(
    penalty = 'elasticnet',
    solver = 'saga',
    l1_ratio = 0.2,
    random_state = seed,
    n_jobs = -1,
    verbose = 2,
)
# fit returns the estimator itself, so the name stays bound to the fitted model
clf_category = clf_category.fit(X_train_category, y_train_category)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 20 epochs took 72 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.2min finished


In [29]:
clf_category.score(X_train_category, y_train_category)

0.9665031853574679

## 4-1. Type Prediction Task - Literal

In [30]:
# literal-type classifier, trained on the literal questions only
# boolean mask over df_train marking the rows whose category is "literal"
train_literal_rows = df_train["category"].eq("literal").values
y_train_literal = etl.literal_to_int(df_train[train_literal_rows])

clf_literal = LogisticRegression(
    penalty = 'elasticnet',
    solver = 'saga',
    l1_ratio = 0.5,
    random_state = seed,
    n_jobs = -1,
    verbose = 2,
)
# the same mask selects the matching rows of the feature matrix
clf_literal = clf_literal.fit(X_train_category[train_literal_rows, :], y_train_literal)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 20 epochs took 1 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.9s finished


In [31]:
clf_literal.score(X_train_category[train_literal_rows], y_train_literal)

0.9508356004663816

## 4-2. Type Prediction Task - Resource



***Identify types of resources in the train data***

In [32]:
etl.add_type_maps(df_train)

In [33]:
resource_models = []

for l in range(1, 6):
  # one classifier per type level l, trained only on resource questions
  # whose type list has at least l entries
  train_resource_rows = (
      df_train["category"].eq("resource") & df_train["type"].map(len).ge(l)
  ).values
  y_train_type = etl.type_to_int(df_train[train_resource_rows], type_no = l)

  clf_type = MLPClassifier(
      hidden_layer_sizes = (1000, 500, 300),
      max_iter = 10,
      random_state = seed,
      verbose = 2,
  )
  clf_type = clf_type.fit(X_train_category[train_resource_rows], y_train_type)

  resource_models.append(clf_type)

Iteration 1, loss = 2.60073361
Iteration 2, loss = 1.28514271
Iteration 3, loss = 0.84813352
Iteration 4, loss = 0.56595047
Iteration 5, loss = 0.39063045
Iteration 6, loss = 0.27895499
Iteration 7, loss = 0.21972769
Iteration 8, loss = 0.17741711
Iteration 9, loss = 0.15124181
Iteration 10, loss = 0.13678925




Iteration 1, loss = 2.57460801
Iteration 2, loss = 1.24568463
Iteration 3, loss = 0.78632519
Iteration 4, loss = 0.53269140
Iteration 5, loss = 0.38478882
Iteration 6, loss = 0.29064727
Iteration 7, loss = 0.23438973
Iteration 8, loss = 0.19733377
Iteration 9, loss = 0.17495153
Iteration 10, loss = 0.16067453




Iteration 1, loss = 2.55699905
Iteration 2, loss = 1.38819738
Iteration 3, loss = 0.96441829
Iteration 4, loss = 0.68901965
Iteration 5, loss = 0.51262370
Iteration 6, loss = 0.40982330
Iteration 7, loss = 0.34204085
Iteration 8, loss = 0.29101329
Iteration 9, loss = 0.25413398
Iteration 10, loss = 0.23677822




Iteration 1, loss = 2.80221215
Iteration 2, loss = 1.66363582
Iteration 3, loss = 1.19419448
Iteration 4, loss = 0.86773169
Iteration 5, loss = 0.65286595
Iteration 6, loss = 0.52186810
Iteration 7, loss = 0.42829741
Iteration 8, loss = 0.37436630
Iteration 9, loss = 0.32832508
Iteration 10, loss = 0.29308867




Iteration 1, loss = 2.37953779
Iteration 2, loss = 1.49619754
Iteration 3, loss = 1.12289247
Iteration 4, loss = 0.83189347
Iteration 5, loss = 0.64235405
Iteration 6, loss = 0.50297702
Iteration 7, loss = 0.41489779
Iteration 8, loss = 0.34872976
Iteration 9, loss = 0.31019767
Iteration 10, loss = 0.28594132




In [34]:
clf_type.score(X_train_category[train_resource_rows], y_train_type) 

0.9329049123189195

## 4. Save models 

In [35]:
with open(os.path.join("sklearn_objects", "category_model.pkl"), "wb") as mdl_file:
  pickle.dump(clf_category, mdl_file)

In [36]:
with open(os.path.join("sklearn_objects", "literal_model.pkl"), "wb") as mdl_file:
  pickle.dump(clf_literal, mdl_file)

In [37]:
for l in range(1,6):
  with open(os.path.join("sklearn_objects", f"resource_level_{l}_model.pkl"), "wb") as mdl_file:
    pickle.dump(resource_models[l-1], mdl_file)

## 5. Results & Evaluation

***Load Pre-trained models***

In [38]:
with open("sklearn_objects/category_model.pkl", "rb") as clf_cat_file:
  clf_category = pickle.load(clf_cat_file)

In [39]:
with open("sklearn_objects/literal_model.pkl", "rb") as clf_lit_file:
  clf_literal = pickle.load(clf_lit_file)

In [40]:
resource_models = []
for l in range(1,6):
  with open(f"sklearn_objects/resource_level_{l}_model.pkl", "rb") as res_mdl:
    resource_models.append(pickle.load(res_mdl))

***Load processing class***

In [41]:
etl = ETL(path_to_type_maps="resource_types", path_to_vectorizers="sklearn_objects")

In [42]:
class ModelEvaluation:
  """Chains the category, literal and resource models into one prediction pass.

  Parameters
  ----------
  etl_inst : object providing ``norm_data``, ``bow_transform``, ``category_map``,
    ``inv_literal_map`` and ``invtype_maps``.
  cat_model, lit_model : fitted classifiers exposing ``predict``.
  res_models : sequence of fitted classifiers; the model at position ``i``
    is decoded with ``invtype_maps["type{i+1}"]``.
  """

  def __init__(self, etl_inst, cat_model, lit_model, res_models):
    self.etl_inst = etl_inst
    self.cat_model = cat_model
    self.lit_model = lit_model
    self.res_models = res_models
  
  # X is a df
  def get_predictions(self, X, bow_type = "tf"):
    """Predict a category and a type list for every question in ``X``.

    ``X`` is copied and re-indexed from 0, so the caller's frame is untouched.
    The returned frame carries ``cat_prediction`` (category name) and
    ``type_prediction`` (list of type names), plus whatever columns
    ``etl_inst.norm_data`` adds.
    """

    X = X.copy()

    X.reset_index(inplace = True, drop = True)
    
    X_norm = self.etl_inst.norm_data(X)
    X_vec = self.etl_inst.bow_transform(X_norm, type = bow_type)

    bool_int = self.etl_inst.category_map["boolean"]
    literal_int = self.etl_inst.category_map["literal"]
    resource_int = self.etl_inst.category_map["resource"]

    cat_pred = np.asarray(self.cat_model.predict(X_vec))

    # boolean row masks, one per predicted category
    ind_bool = cat_pred == bool_int
    ind_literal = cat_pred == literal_int
    ind_resource = cat_pred == resource_int

    # Each branch must run only when at least one row falls in that category:
    # len(mask) is the batch size, so it cannot tell an empty selection apart,
    # and the sub-model would then be asked to predict on zero rows.
    if ind_bool.any():
      X.loc[ind_bool, "cat_prediction"] = "boolean"
      X.loc[ind_bool, "type_prediction"] = pd.Series(
          cat_pred[ind_bool], name = "type_prediction")\
          .map(lambda x: ["boolean"]).values

    if ind_literal.any():
      X.loc[ind_literal, "cat_prediction"] = "literal"
      literal_pred = self.lit_model.predict(X_vec[ind_literal])
      X.loc[ind_literal, "type_prediction"] = pd.Series(
          literal_pred, name = "type_prediction")\
          .map(lambda x: [self.etl_inst.inv_literal_map[x]]).values

    if ind_resource.any():
      resource_preds = []
      for ind, type_model in enumerate(self.res_models):
          # decode this level's class indices back to ontology names
          resource_preds.append(
            pd.Series(
                type_model.predict(X_vec[ind_resource]), name = f"type_{ind}").\
                map(lambda x: self.etl_inst.invtype_maps[f"type{ind+1}"][x])
                )
      # one list per question, holding the predicted type of every level in order
      resource_preds = pd.Series(pd.concat(resource_preds, axis = 1).values.tolist(), name = "type_prediction")
      X.loc[ind_resource, "type_prediction"] = resource_preds.values
      X.loc[ind_resource, "cat_prediction"] = "resource"

    # always hand the frame back, whichever categories were predicted
    return X

  def output_predictions(self):
    """Placeholder; not implemented yet."""
    raise NotImplementedError

In [43]:
me = ModelEvaluation(etl, clf_category, clf_literal, resource_models)

***Validate***

In [44]:
# out_val = me.get_predictions(df_val, bow_type = 'tfidf')

# true_output = out_val.loc[:, ["id", "question", "category", "type"]]
# true_output_dict = [pred for ind, pred in true_output.to_dict(orient = "index").items()]

# system_output = out_val.loc[:, ["id", "cat_prediction", "type_prediction"]]
# system_output.columns = ["id", "category", "type"]
# system_output_dict = [pred for ind, pred in system_output.to_dict(orient = "index").items()]

***Run Evaluation***

In [45]:
# os.makedirs("system_output/", exist_ok = True)
# with open(os.path.join("system_output", "ground_truth_json.json"), "w") as gfile:
#   json.dump(true_output_dict, gfile)

# with open(os.path.join("system_output", "system_output_json.json"), "w") as sfile:
#   json.dump(system_output_dict, sfile)

In [46]:
# !python evaluate.py --type_hierarchy_tsv dbpedia_types.tsv  \
#  --ground_truth_json system_output/ground_truth_json.json \
#  --system_output_json system_output/system_output_json.json

***Save Test Output***

In [47]:
# run the full prediction cascade on the 2021 test questions
out_test = me.get_predictions(test_data2021, bow_type = 'tfidf')

#true_output = out_test.loc[:, ["id", "question"]]
#true_output_dict = [pred for ind, pred in true_output.to_dict(orient = "index").items()]

# keep the id and the two prediction columns, renamed to the submission field names
system_output = out_test.loc[:, ["id", "cat_prediction", "type_prediction"]]
system_output.columns = ["id", "category", "type"]

# one dict per question, in row order
system_output_dict = list(system_output.to_dict(orient = "index").values())

In [48]:
os.makedirs("system_output/", exist_ok = True)
# with open(os.path.join("system_output", "ground_truth_json.json"), "w") as gfile:
#   json.dump(true_output_dict, gfile)

with open(os.path.join("system_output", "system_output_json.json"), "w") as sfile:
  json.dump(system_output_dict, sfile)