<a href="https://colab.research.google.com/github/chaeyoonyunakim/smart-2021-AT_Answer_Type_Prediction/blob/main/SMART2021_AT_Prediction_Task_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Environment setup

In [1]:
# mount google drive
from google.colab import drive

# authorization
drive.mount('/content/drive')

# locate dataset folder
%ls -l '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_train.json'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-rw------- 1 root root 9263124 Sep 29 08:07 '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_train.json'


In [2]:
# set working directory
import os
os.chdir(path = "/content/drive/My Drive/2021_INM363_SMART/")

In [3]:
# import basics
import pandas as pd
import json
import numpy as np
import pickle
seed = 20211001
import regex as re

In [4]:
# import nlp relevants
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# for bag-of-words (bow)
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# import scikit-learn tools for modelling and evaluation
from sklearn.model_selection import train_test_split

In [6]:
# import algos
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

## 1. Data Loading & Manipulation

***Strip enclosing apostrophes from questions***

In [7]:
# a string that both starts and ends with an apostrophe; group 2 is the inside
ap_re = re.compile("(^\')(.*)(\'$)")

def remove_appostrophe(x):
  """Return x without its enclosing apostrophes.

  When x starts and ends with an apostrophe the text between them is
  returned; any other string is returned unchanged.
  """
  wrapped = ap_re.search(x)
  if wrapped is None:
    return x
  return wrapped.group(2)

***Load Dataset*** 

In [8]:
def load_data(path, train = True):
  """Read a question dataset from a JSON file and clean it.

  Args:
    path: location of the JSON file, passed to pandas.read_json.
    train: when True the file is treated as labelled data: rows missing
      'id', 'question' or 'category' are dropped and rows without any
      answer type are removed. When False only 'id' and 'question' are
      required.

  Returns:
    A new DataFrame with "n/a" questions removed and apostrophes stripped
    from the start and end of each question.
  """
  data = pd.read_json(path)

  # map na strings to nan; the result is assigned back because an in-place
  # replace on a .loc selection may only modify a temporary copy
  data = data.assign(question = data["question"].replace("n/a", np.nan))

  # drop na in data
  required = ['id', 'question', 'category'] if train else ['id', 'question']
  data = data.dropna(subset = required)

  # remove apostrophes from the start and end of str
  data = data.assign(question = data["question"].map(remove_appostrophe))

  # for the training data remove rows that have no types
  # (a missing / non-sized "type" value counts as "no types" instead of crashing)
  if train:
    has_types = data["type"].map(lambda x: hasattr(x, "__len__") and len(x) != 0)
    data = data[has_types.astype(bool)].copy()

  return data

In [9]:
# read the 2020 and 2021 splits; train files go through the labelled-data
# cleaning path, test files only require an id and a question
train_data2020 = load_data(
    path = '/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json',
    train = True)
test_data2020 = load_data(
    path = '/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_test.json',
    train = False)

train_data2021 = load_data(
    path = '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_train.json',
    train = True)
test_data2021 = load_data(
    path = '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_test.json',
    train = False)

In [10]:
# check data size
train_data2020.shape, test_data2020.shape, train_data2021.shape, test_data2021.shape

((17482, 4), (4378, 4), (36670, 4), (9104, 2))

***Merge the two training datasets***

In [11]:
# unify the id format: keep the segment after the first underscore.
# ids without an underscore are left untouched, so re-running this cell on
# already-converted ids does not raise IndexError
train_data2020.loc[:, "id"] = train_data2020["id"].map(
    lambda x: x.split('_')[1] if isinstance(x, str) and '_' in x else x)

In [12]:
# stack the 2020 rows on top of the 2021 rows (row labels are kept as they are)
all_data = pd.concat(objs = [train_data2020, train_data2021], axis = "index")

In [13]:
# helper column for de-duplication: the "type" sequence of every row joined
# into one comma-separated string
all_data.loc[:, "type_str"] = all_data["type"].map(",".join)

In [14]:
# keep the first occurrence of every (question, category, type) combination
merged_data = all_data.drop_duplicates(
    subset = ["question", "category", "type_str"],
    keep = "first")

***Tabular representation of the dataset***

In [15]:
merged_data.head(3)

Unnamed: 0,id,question,category,type,type_str
0,1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean],boolean
1,14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]","dbo:Opera,dbo:MusicalWork,dbo:Work"
2,16615,When did Lena Horne receive the Grammy Award f...,literal,[date],date


In [16]:
# drop the de-duplication helper column; errors = 'ignore' keeps the cell
# re-runnable once the column has already been removed
merged_data = merged_data.drop(columns = ['type_str'], errors = 'ignore')
merged_data.head()

Unnamed: 0,id,question,category,type
0,1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."


In [17]:
# check data size
all_data.shape, merged_data.shape

((54152, 5), (39556, 4))

## 2. Preprocessing

***Extract, Transform, Load (ETL)***

In [18]:
class ETL:
  """Preprocessing helper for the answer-type prediction pipeline.

  Bundles question normalization (tokenizing, lower-casing, optional
  stop-word removal, lemmatization and stemming), bag-of-words
  vectorization, and the label <-> integer maps used for the category,
  literal and resource-type targets.
  """

###############################################################################
#   Global Variables     
###############################################################################

  # text normalization - stemming, lemmatization, stopwords
  ps = PorterStemmer()
  wordnet_lemmatizer = WordNetLemmatizer() 
  # NOTE(review): words() is called without a language, so this is presumably
  # the stop-word list of every language NLTK ships - confirm that is intended
  s_words = stopwords.words()
  # hashed view of s_words so membership tests do not scan the whole list
  _s_words_lookup = frozenset(s_words)

  # set default feature_extraction parameters
  count_vectorizer = None 
  inv_count_vectorizer_vocab = None
  tfidf_vectorizer = None
  inv_tfidf_vectorizer_vocab = None

  # category maps
  category_map = {"boolean": 0, "resource": 1, "literal": 2}
  inv_category_map = {ind: label for label, ind in category_map.items()}

  # literal maps
  literal_map = {"date": 0, "string": 1, "number": 2}
  inv_literal_map = {ind: label for label, ind in literal_map.items()}

  # resource maps (class-level defaults; __init__ gives every instance its own)
  type_maps = {}
  invtype_maps = {}


###############################################################################
#   Main
###############################################################################

  def __init__(self, path_to_type_maps = None, path_to_vectorizers = None):
    """Optionally restore saved type maps and pickled vectorizers.

    Args:
      path_to_type_maps: directory to scan for files whose name contains
        "type". Each one is read as JSON; the part of the file name before
        the first underscore is the type name and must be a key of the JSON
        object, mapping ontology class -> integer. None skips loading.
      path_to_vectorizers: directory to scan for files whose name contains
        "vectorizer". The part of the file name before the first underscore
        must be "count" or "tfidf". None skips loading.

    Raises:
      NotImplementedError: a vectorizer file has an unsupported prefix.
    """
    # per-instance maps, so filling them here or in add_type_maps does not
    # leak into other ETL instances through the shared class attributes
    self.type_maps = {}
    self.invtype_maps = {}

    # load type maps if requested
    if path_to_type_maps is not None:
      base_dir = path_to_type_maps
      for fp in sorted(os.listdir(base_dir)):
        if "type" not in fp:
          continue
        type_name = fp.split("_")[0]
        with open(os.path.join(base_dir, fp), "r") as input_file:
          level_map = json.load(input_file)[type_name]
        self.type_maps[type_name] = level_map
        self.invtype_maps[type_name] = {ind: ontology for ontology, ind in level_map.items()}

    # load data vectorizers if requested
    if path_to_vectorizers is not None:
      base_dir = path_to_vectorizers
      for fp in sorted(os.listdir(base_dir)):
        if "vectorizer" not in fp:
          continue
        vectorizer_name = fp.split("_")[0]
        if vectorizer_name not in ("count", "tfidf"):
          raise NotImplementedError(f"unsupported vectorizer file: {fp}")

        # NOTE: unpickling can execute arbitrary code - only point this at
        # files written by save_vectorizers
        with open(os.path.join(base_dir, fp), "rb") as input_file:
          vectorizer = pickle.load(input_file)

        # rebuild the reverse vocab the same way bow_fit does (None if unfitted)
        vocab = getattr(vectorizer, "vocabulary_", None)
        inv_vocab = None if vocab is None else {ind: label for label, ind in vocab.items()}

        if vectorizer_name == "count":
          self.count_vectorizer = vectorizer
          self.inv_count_vectorizer_vocab = inv_vocab
        else:
          self.tfidf_vectorizer = vectorizer
          self.inv_tfidf_vectorizer_vocab = inv_vocab


  # split training dataset to exclude validation dataset
  # set train:val = 8:2
  def split_data(self, data, val_size = 0.2):
    """Split data into (train, validation) parts.

    val_size is the validation fraction; the module-level `seed` makes the
    split reproducible.
    """
    df_train, df_test = model_selection.train_test_split(data, test_size = val_size, random_state = seed)
    return df_train, df_test


  # normalization of question sentences
  def _norm_sent(self, sent, rm_stopwords = False, stemming = True, lemmatization = False):
    """Normalize one sentence and return it as a space-joined string.

    The sentence is tokenized, tokens that are not purely alphabetic are
    dropped and the rest lower-cased. Stop-word removal, lemmatization
    (noun, then verb, then adjective) and stemming are then applied in that
    order, each only when its flag is set.
    """
    # tokenize - sentence to word
    words = word_tokenize(sent)
    # take if all characters in the string are alphabets and then decapitalize
    sent = [w.lower() for w in words if w.isalpha()] 

    # remove stopwords
    if rm_stopwords:
      sent = [w for w in sent if w not in self._s_words_lookup]    

    # apply lemmatization 
    if lemmatization:
      for pos in ("n", "v", "a"):
        sent = [self.wordnet_lemmatizer.lemmatize(w, pos = pos) for w in sent]

    # apply stemming 
    if stemming:
      sent = [self.ps.stem(w) for w in sent]

    return " ".join(sent)


  # add a new column to show how question parsing has done through normalization above
  # for Tabular representation of the dataset
  def norm_data(self, data):   
    """Add a "question_processed" column to data and return data.

    Every question is normalized with lemmatization and stemming and without
    stop-word removal. The frame is modified in place; the returned object
    is the same frame that was passed in.
    """
    data.loc[:, "question_processed"] = data["question"].apply(lambda x: self._norm_sent(x, rm_stopwords = False, lemmatization = True, stemming = True))
    return data



  # vectorization - fit vectorizer to training data
  def bow_fit(self, corpus, type = "tf", max_features = 10000, ngram_range = (1,2)):
    """Fit a bag-of-words vectorizer on corpus["question_processed"].

    Args:
      corpus: DataFrame with a "question_processed" column.
      type: "tf" fits a CountVectorizer, "tfidf" a TfidfVectorizer.
      max_features, ngram_range: forwarded to the vectorizer.

    The fitted vectorizer and its index -> term reverse vocabulary are
    stored on the instance.

    Raises:
      NotImplementedError: type is neither "tf" nor "tfidf".
    """
    if type == "tf":
      self.count_vectorizer = feature_extraction.text.CountVectorizer(max_features = max_features, ngram_range = ngram_range)
      self.count_vectorizer.fit(corpus["question_processed"])

      # create a reverse mapping for the vocab
      self.inv_count_vectorizer_vocab = {
          ind: label for label, ind in self.count_vectorizer.vocabulary_.items()}

    elif type == "tfidf": 
      self.tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(max_features = max_features, ngram_range = ngram_range)
      self.tfidf_vectorizer.fit(corpus["question_processed"])

      # create a reverse mapping for the vocab
      self.inv_tfidf_vectorizer_vocab = {
          ind: label for label, ind in self.tfidf_vectorizer.vocabulary_.items()}

    else:
      raise NotImplementedError(f"unsupported bag-of-words type: {type!r}")


  # transformation
  def bow_transform(self, data, type = "tf"):
    """Vectorize data["question_processed"] with the fitted/loaded vectorizer.

    Raises:
      NotImplementedError: type is neither "tf" nor "tfidf".
    """
    if type == "tf":
      return self.count_vectorizer.transform(data["question_processed"])
    elif type == "tfidf":
      return self.tfidf_vectorizer.transform(data["question_processed"])
    else:
      raise NotImplementedError(f"unsupported bag-of-words type: {type!r}")


  # category maps
  def category_to_int(self, data):
    """Encode data.category through category_map."""
    return data.category.map(lambda x: self.category_map[x])

  # literal maps
  def literal_to_int(self, data):
    """Encode the first entry of every data.type value through literal_map."""
    return data.type.map(lambda x: self.literal_map[x[0]])


  # distribute type by ontology class and encode missing if none
  def type_to_int(self, data, type_no):
    """Encode the type_no-th (1-based) entry of every data.type value.

    Rows with fewer than type_no entries are encoded with the map's
    "missing" key.
    NOTE(review): add_type_maps never creates a "missing" key, so such rows
    raise KeyError unless the loaded map defines one - confirm the intended
    fallback.
    """
    return data.type.map(
        lambda x: self.type_maps[f"type{type_no}"][x[type_no - 1]] 
        if len(x) >= type_no 
        else self.type_maps[f"type{type_no}"]["missing"]
        )


  # resource maps
  def add_type_maps(self, train_data, depth = 6, save = True, path = "resource_types/"):
    """Build ontology-class <-> integer maps for every type level.

    Levels run over range(1, depth), i.e. 1 .. depth - 1. For each level the
    classes found at that position of the "type" value of rows whose category
    is "resource" are numbered in order of first appearance.

    Args:
      train_data: DataFrame with "category" and "type" columns.
      depth: exclusive upper bound of the levels to build.
      save: when True each level is written to
        <path>/type<level>_map.json as {"type<level>": {class: index}},
        the layout __init__ reads back.
      path: output directory, created if needed.
    """
    if save:
      os.makedirs(path, exist_ok = True)

    resource_types = train_data[train_data["category"] == "resource"]["type"]

    for l in range(1, depth):
      type_name = f"type{l}"
      level_map = {}
      for types in resource_types:
        # rows that are too shallow contribute nothing at this level
        if len(types) < l:
          continue
        ontology = types[l - 1]
        if (ontology not in level_map) and (ontology != "missing"):
          level_map[ontology] = len(level_map)

      self.type_maps[type_name] = level_map
      self.invtype_maps[type_name] = {ind: ontology for ontology, ind in level_map.items()}

      if save:
        # write into the requested directory, and only this level's map
        with open(os.path.join(path, f"type{l}_map.json"), "w") as outfile:
          json.dump({type_name: level_map}, outfile)




  # save output
  def save_vectorizers(self, path):
    """Pickle the fitted vectorizers into path.

    Writes count_vectorizer.pkl and/or tfidf_vectorizer.pkl for whichever
    vectorizer is set; the directory is created if needed.
    """
    # make sure directory exists
    os.makedirs(exist_ok= True, name=path)

    if self.count_vectorizer is not None:
      with open(os.path.join(path, "count_vectorizer.pkl"), "wb") as count_file:
        pickle.dump(self.count_vectorizer, count_file)
    if self.tfidf_vectorizer is not None:
      with open(os.path.join(path, "tfidf_vectorizer.pkl"), "wb") as tfidf_file:
        pickle.dump(self.tfidf_vectorizer, tfidf_file)

In [19]:
etl = ETL()

In [20]:
# split dataset
# for validation 8:2
# df_train, df_val = etl.split_data(merged_data)

In [21]:
# text normalization
# df_train = etl.norm_data(df_train)   # training set without val
# training set including val (total); norm_data adds the column to
# merged_data itself and returns that same frame
df_train = etl.norm_data(data = merged_data)

In [22]:
df_train[['question', 'question_processed']]

Unnamed: 0,question,question_processed
0,Was Jacqueline Kennedy Onassis a follower of M...,wa jacquelin kennedi onassi a follow of melkit...
1,What is the name of the opera based on Twelfth...,what be the name of the opera base on twelfth ...
2,When did Lena Horne receive the Grammy Award f...,when do lena horn receiv the grammi award for ...
3,Do Prince Harry and Prince William have the sa...,do princ harri and princ william have the same...
4,What is the subsidiary company working for Leo...,what be the subsidiari compani work for leonar...
...,...,...
36665,what kinds of music is played by season's end,what kind of music be play by season end
36666,which asteroid group is 6753 fursenko a member...,which asteroid group be fursenko a member of
36667,What language is azhakiya ravanan filmed in?,what languag be azhakiya ravanan film in
36668,which position did herby fortunat play in foot...,which posit do herbi fortunat play in footbal


In [23]:
# fit the term-count bag-of-words vectorizer on the normalized questions,
# then persist it so it can be reloaded for prediction
etl.bow_fit(df_train, "tf")
etl.save_vectorizers("sklearn_objects")

## 3. Category Prediction Task

In [24]:
# features: bag-of-words matrix; target: integer-encoded category
X_train_category = etl.bow_transform(data = df_train, type = "tf")
y_train_category = etl.category_to_int(data = df_train)

In [25]:
# elastic-net logistic regression for the category target
clf_category = LogisticRegression(
    penalty = 'elasticnet',
    l1_ratio = 0.2,
    solver = 'saga',
    max_iter = 200,
    random_state = seed,
    n_jobs = -1,
    verbose = 2,
)
clf_category = clf_category.fit(X_train_category, y_train_category)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


max_iter reached after 867 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 14.4min finished


In [26]:
clf_category.score(X_train_category, y_train_category)

0.9836687228233391

## 4-1. Type Prediction Task - Literal

***Three kinds of model in total: one for category, one for literal types, and one per type level (five in all) for resource types***

In [27]:
# model for literal classification
# boolean array marking the rows whose category is literal
train_literal_rows = df_train["category"].eq("literal").to_numpy()
y_train_literal = etl.literal_to_int(df_train[train_literal_rows])

clf_literal = LogisticRegression(
    penalty = 'elasticnet',
    l1_ratio = 0.5,
    solver = 'saga',
    max_iter = 200,
    random_state = seed,
    n_jobs = -1,
    verbose = 2,
)
clf_literal = clf_literal.fit(X_train_category[train_literal_rows, :], y_train_literal)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


max_iter reached after 18 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   18.6s finished


In [28]:
clf_literal.score(X_train_category[train_literal_rows], y_train_literal)

0.9790128254955305

## 4-2. Type Prediction Task - Resource



***Identify types of resources in the train data***

In [29]:
etl.add_type_maps(df_train)

In [30]:
etl.type_maps.keys()

dict_keys(['type1', 'type2', 'type3', 'type4', 'type5'])

In [31]:
etl.type_maps.values()

dict_values([{'dbo:Opera': 0, 'dbo:EducationalInstitution': 1, 'dbo:State': 2, 'dbo:Country': 3, 'dbo:AcademicSubject': 4, 'dbo:Name': 5, 'dbo:Organisation': 6, 'dbo:Person': 7, 'dbo:WrittenWork': 8, 'dbo:EthnicGroup': 9, 'dbo:Museum': 10, 'dbo:Writer': 11, 'dbo:AmericanFootballPlayer': 12, 'dbo:ChemicalCompound': 13, 'dbo:OfficeHolder': 14, 'dbo:Magazine': 15, 'dbo:Award': 16, 'dbo:Animal': 17, 'dbo:Activity': 18, 'dbo:NobelPrize': 19, 'dbo:Single': 20, 'dbo:Work': 21, 'dbo:MetroStation': 22, 'dbo:Film': 23, 'dbo:Profession': 24, 'dbo:City': 25, 'dbo:Deity': 26, 'dbo:Galaxy': 27, 'dbo:Company': 28, 'dbo:Gene': 29, 'dbo:Contest': 30, 'dbo:TelevisionEpisode': 31, 'dbo:University': 32, 'dbo:MusicalArtist': 33, 'dbo:Scientist': 34, 'dbo:PoliticalParty': 35, 'dbo:MusicGenre': 36, 'dbo:Disease': 37, 'dbo:Taxon': 38, 'dbo:Book': 39, 'dbo:Settlement': 40, 'dbo:GovernmentAgency': 41, 'dbo:Ship': 42, 'dbo:Drug': 43, 'dbo:Island': 44, 'dbo:Mountain': 45, 'dbo:Village': 46, 'dbo:River': 47, 'dbo:

In [32]:
resource_models = []

# these two do not change between levels, so compute them once
is_resource = df_train["category"] == "resource"
type_depth = df_train["type"].map(len)

for level in range(1, 6):
  # one classifier per type level, trained only on the resource rows that
  # actually have a type at this level
  train_resource_rows = (is_resource & (type_depth >= level)).values
  y_train_type = etl.type_to_int(df_train[train_resource_rows], type_no = level)

  clf_type = MLPClassifier(
      hidden_layer_sizes = (1000, 500, 300),
      max_iter = 10,
      random_state = seed,
      verbose = 2,
  )
  clf_type = clf_type.fit(X_train_category[train_resource_rows], y_train_type)

  resource_models.append(clf_type)

Iteration 1, loss = 2.14002143
Iteration 2, loss = 1.03116504
Iteration 3, loss = 0.62297222
Iteration 4, loss = 0.37859177
Iteration 5, loss = 0.25528809
Iteration 6, loss = 0.19085766
Iteration 7, loss = 0.15787654
Iteration 8, loss = 0.13812974
Iteration 9, loss = 0.12472322
Iteration 10, loss = 0.11284379




Iteration 1, loss = 2.08878833
Iteration 2, loss = 0.99633147
Iteration 3, loss = 0.60163255
Iteration 4, loss = 0.38697955
Iteration 5, loss = 0.27508625
Iteration 6, loss = 0.21432184
Iteration 7, loss = 0.18018812
Iteration 8, loss = 0.15607034
Iteration 9, loss = 0.14235186
Iteration 10, loss = 0.13592778




Iteration 1, loss = 2.13147989
Iteration 2, loss = 1.16462734
Iteration 3, loss = 0.77946541
Iteration 4, loss = 0.54746403
Iteration 5, loss = 0.41258115
Iteration 6, loss = 0.32841607
Iteration 7, loss = 0.27502682
Iteration 8, loss = 0.24973910
Iteration 9, loss = 0.22341338
Iteration 10, loss = 0.21124127




Iteration 1, loss = 2.37021883
Iteration 2, loss = 1.34517080
Iteration 3, loss = 0.91403112
Iteration 4, loss = 0.66878606
Iteration 5, loss = 0.50619872
Iteration 6, loss = 0.40519070
Iteration 7, loss = 0.33815534
Iteration 8, loss = 0.30452070
Iteration 9, loss = 0.29280518
Iteration 10, loss = 0.25941069




Iteration 1, loss = 2.05832300
Iteration 2, loss = 1.21112987
Iteration 3, loss = 0.86577162
Iteration 4, loss = 0.62948037
Iteration 5, loss = 0.46834557
Iteration 6, loss = 0.36960250
Iteration 7, loss = 0.32460028
Iteration 8, loss = 0.27729944
Iteration 9, loss = 0.25256042
Iteration 10, loss = 0.23549953




In [33]:
# training accuracy of the last (level-5) resource model only: clf_type,
# train_resource_rows and y_train_type still hold the values left by the
# final iteration of the training loop
clf_type.score(X_train_category[train_resource_rows], y_train_type) 

0.9422720836510184

## 4. Save models 

In [34]:
# persist the category classifier
category_model_path = os.path.join("sklearn_objects", "category_model.pkl")
with open(category_model_path, "wb") as mdl_file:
  pickle.dump(clf_category, mdl_file)

In [35]:
# persist the literal-type classifier
literal_model_path = os.path.join("sklearn_objects", "literal_model.pkl")
with open(literal_model_path, "wb") as mdl_file:
  pickle.dump(clf_literal, mdl_file)

In [36]:
# persist the five per-level resource classifiers (level l is stored at index l - 1)
for l in range(1, 6):
  resource_model_path = os.path.join("sklearn_objects", f"resource_level_{l}_model.pkl")
  with open(resource_model_path, "wb") as mdl_file:
    pickle.dump(resource_models[l - 1], mdl_file)

## 5. Results & Evaluation

***Load Pre-trained models***

In [37]:
# restore the category classifier written by the save step
with open("sklearn_objects/category_model.pkl", mode = "rb") as clf_cat_file:
  clf_category = pickle.load(clf_cat_file)

In [38]:
# restore the literal-type classifier written by the save step
with open("sklearn_objects/literal_model.pkl", mode = "rb") as clf_lit_file:
  clf_literal = pickle.load(clf_lit_file)

In [39]:
# restore the per-level resource classifiers in level order
resource_models = []
for l in range(1, 6):
  with open(f"sklearn_objects/resource_level_{l}_model.pkl", mode = "rb") as res_mdl:
    level_model = pickle.load(res_mdl)
  resource_models.append(level_model)

***Load processing class***

In [40]:
etl = ETL(path_to_type_maps="resource_types", path_to_vectorizers="sklearn_objects")

In [41]:
class ModelEvaluation:
  """Runs the category -> answer-type prediction cascade on a DataFrame.

  A category model first labels every question as boolean, literal or
  resource. Literal questions are then passed to the literal model and
  resource questions to one model per type level.
  """

  def __init__(self, etl_inst, cat_model, lit_model, res_models):
    """Store the collaborators.

    Args:
      etl_inst: object providing norm_data, bow_transform, category_map,
        inv_literal_map and invtype_maps.
      cat_model: fitted category classifier (needs predict).
      lit_model: fitted literal-type classifier (needs predict).
      res_models: fitted resource-type classifiers; the model at position i
        is decoded with invtype_maps["type<i + 1>"].
    """
    self.etl_inst = etl_inst
    self.cat_model = cat_model
    self.lit_model = lit_model
    self.res_models = res_models
  
  # X is a df
  def get_predictions(self, X, bow_type = "tf"):
    """Predict category and answer types for every question in X.

    Args:
      X: DataFrame with a "question" column. It is copied, so the caller's
        frame is not modified.
      bow_type: vectorizer selector forwarded to etl_inst.bow_transform.

    Returns:
      The copy of X with a fresh 0..n-1 index, the columns added by
      etl_inst.norm_data, and two new columns: "cat_prediction" (category
      name) and "type_prediction" (list of type names). The frame is
      returned whichever categories were predicted, including for empty
      input.
    """
    X = X.copy()

    X.reset_index(inplace = True, drop = True)
    
    X_norm = self.etl_inst.norm_data(X)

    n_rows = len(X)
    cat_out = [None] * n_rows
    type_out = [None] * n_rows

    # the models are only called when there is at least one row for them
    if n_rows > 0:
      X_vec = self.etl_inst.bow_transform(X_norm, type = bow_type)

      bool_int = self.etl_inst.category_map["boolean"]
      literal_int = self.etl_inst.category_map["literal"]
      resource_int = self.etl_inst.category_map["resource"]

      cat_pred = np.asarray(self.cat_model.predict(X_vec))

      ind_bool = cat_pred == bool_int
      ind_literal = cat_pred == literal_int
      ind_resource = cat_pred == resource_int

      for pos in np.flatnonzero(ind_bool):
        cat_out[pos] = "boolean"
        type_out[pos] = ["boolean"]

      # test whether any row matched, not the length of the mask
      if ind_literal.any():
        literal_pred = self.lit_model.predict(X_vec[ind_literal])
        for pos, pred in zip(np.flatnonzero(ind_literal), literal_pred):
          cat_out[pos] = "literal"
          type_out[pos] = [self.etl_inst.inv_literal_map[pred]]

      if ind_resource.any():
        X_res = X_vec[ind_resource]
        # one list of decoded type names per level, aligned with the resource rows
        per_level = []
        for level, type_model in enumerate(self.res_models, start = 1):
          inv_map = self.etl_inst.invtype_maps[f"type{level}"]
          per_level.append([inv_map[pred] for pred in type_model.predict(X_res)])

        for row_no, pos in enumerate(np.flatnonzero(ind_resource)):
          cat_out[pos] = "resource"
          type_out[pos] = [level_preds[row_no] for level_preds in per_level]

    # built as object Series so the per-row lists are stored as single cells
    X["cat_prediction"] = pd.Series(cat_out, index = X.index, dtype = object)
    X["type_prediction"] = pd.Series(type_out, index = X.index, dtype = object)

    return X

  def output_predictions(self):
    """Placeholder; raises NotImplementedError."""
    raise NotImplementedError

In [42]:
me = ModelEvaluation(etl, clf_category, clf_literal, resource_models)

***Validate***

In [43]:
# out_val = me.get_predictions(df_val)

# true_output = out_val.loc[:, ["id", "question", "category", "type"]]
# true_output_dict = [pred for ind, pred in true_output.to_dict(orient = "index").items()]

# system_output = out_val.loc[:, ["id", "cat_prediction", "type_prediction"]]
# system_output.columns = ["id", "category", "type"]
# system_output_dict = [pred for ind, pred in system_output.to_dict(orient = "index").items()]

***Run Evaluation***

In [44]:
# os.makedirs("system_output/", exist_ok = True)
# with open(os.path.join("system_output", "ground_truth_json.json"), "w") as gfile:
#   json.dump(true_output_dict, gfile)

# with open(os.path.join("system_output", "system_output_json.json"), "w") as sfile:
#   json.dump(system_output_dict, sfile)

In [45]:
# !python evaluate.py --type_hierarchy_tsv dbpedia_types.tsv  \
#  --ground_truth_json system_output/ground_truth_json.json \
#  --system_output_json system_output/system_output_json.json

***Save Test Output***

In [46]:
out_test = me.get_predictions(test_data2021)

#true_output = out_test.loc[:, ["id", "question"]]
#true_output_dict = [pred for ind, pred in true_output.to_dict(orient = "index").items()]

# keep the id and the two prediction columns, renamed to the submission field
# names, and turn every row into one dict
system_output = out_test.loc[:, ["id", "cat_prediction", "type_prediction"]]
system_output.columns = ["id", "category", "type"]
system_output_dict = list(system_output.to_dict(orient = "index").values())

In [47]:
os.makedirs("system_output/", exist_ok = True)
# with open(os.path.join("system_output", "ground_truth_json.json"), "w") as gfile:
#   json.dump(true_output_dict, gfile)

# write the list of prediction dicts as one JSON document
system_output_path = os.path.join("system_output", "system_output_json.json")
with open(system_output_path, "w") as sfile:
  json.dump(system_output_dict, sfile)