<a href="https://colab.research.google.com/github/chaeyoonyunakim/smart-2021-AT_Answer_Type_Prediction/blob/main/SMART2021_AT_Prediction_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Environment setup

In [1]:
# mount google drive so the dataset folder is reachable from this runtime
from google.colab import drive

# authorization: attach Drive under /content/drive
drive.mount('/content/drive')

# locate dataset folder: list the training file to confirm the path resolves
%ls -l '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_train.json'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-rw------- 1 root root 9897011 Aug  3 21:52 '/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_train.json'


In [2]:
# make the project folder on Drive the working directory, so the relative
# paths used by later cells (e.g. "sklearn_objects") resolve inside it
import os
os.chdir("/content/drive/My Drive/2021_INM363_SMART/")

In [3]:
# import basics
import pandas as pd
import json
import numpy as np
import pickle
seed = 20211001

In [4]:
# import nlp relevants
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# for bag-of-words (bow)
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

# fetch the NLTK resources used by the tokenizer, stop-word list and lemmatizer
# (the former bare `set(stopwords.words('english'))` expression was discarded
# and had no effect, so it has been dropped)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# import scikit-learn tools for modelling and evaluation
from sklearn.model_selection import train_test_split

In [6]:
# import algos
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

## 1. Data Loading & Manipulation

***Tabular representation of the dataset***

In [7]:
# load dataset
train_data = pd.read_json('/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_train.json')
test_data = pd.read_json('/content/drive/My Drive/2021_INM363_SMART/task1_dbpedia_test.json')

# drop rows with missing required fields in training, test dataset
# (reassign instead of mutating in place so the cell can be re-run safely)
train_data = train_data.dropna(subset=['id', 'question', 'category'])
test_data = test_data.dropna(subset=['id', 'question'])

# drop training rows whose type list is empty; .copy() makes the result an
# independent frame, because later cells add columns to frames derived from it
train_data = train_data[train_data["type"].map(len) != 0].copy()

# check the format of tabular representation
train_data

Unnamed: 0,id,question,category,type
0,0,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,1,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,2,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,3,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,5,Which is the hierarchical BrainInfo ID of the ...,literal,[string]
...,...,...,...,...
40616,50708,what kinds of music is played by season's end,resource,"[dbo:Genre, dbo:TopicalConcept, dbo:MusicGenre]"
40617,50709,which asteroid group is 6753 fursenko a member...,resource,[dbo:Album]
40618,50710,What language is azhakiya ravanan filmed in?,resource,[dbo:Language]
40619,50712,which position did herby fortunat play in foot...,resource,[dbo:Person]


## 1-2. Preprocessing

***Extract, Transform, Load (ETL)***

In [8]:
class ETL:
  """Preprocessing helper for the answer-type prediction pipeline.

  Bundles question-text normalization, bag-of-words vectorization and the
  label <-> integer maps for categories, literal types and per-level resource
  (ontology) types.

  Parameters
  ----------
  path_to_type_maps : str, optional
      Directory to scan for files whose name contains "type" (as written by
      ``add_type_maps``); each one is loaded into ``type_maps`` and
      ``invtype_maps`` under the prefix before the first "_" of its file name.
  path_to_vectorizers : str, optional
      Directory to scan for files whose name contains "vectorizer" (as written
      by ``save_vectorizers``); ``count_*`` and ``tfidf_*`` files are unpickled.

  Raises
  ------
  NotImplementedError
      If a vectorizer file is neither a ``count`` nor a ``tfidf`` vectorizer.
  """

###############################################################################
#   Global Variables     
###############################################################################

  # text normalization - stemming, lemmatization, stopwords
  ps = PorterStemmer()
  wordnet_lemmatizer = WordNetLemmatizer()
  # kept as a set so the membership test in _norm_sent is a hash lookup
  # NOTE(review): words() is called without a language, which presumably
  # covers every stop-word list NLTK ships, not only English - confirm intent
  s_words = set(stopwords.words())

  # set default feature_extraction parameters
  count_vectorizer = None
  inv_count_vectorizer_vocab = None
  tfidf_vectorizer = None
  inv_tfidf_vectorizer_vocab = None

  # category maps (label -> int and int -> label)
  category_map = {"boolean": 0, "resource": 1, "literal": 2}
  inv_category_map = {ind: label for label, ind in category_map.items()}

  # literal maps (label -> int and int -> label)
  literal_map = {"date": 0, "string": 1, "number": 2}
  inv_literal_map = {ind: label for label, ind in literal_map.items()}

  # resource maps; every instance gets its own dicts in __init__
  type_maps = {}
  invtype_maps = {}


###############################################################################
#   Main
###############################################################################

  def __init__(self, path_to_type_maps = None, path_to_vectorizers = None):

    # per-instance dicts: mutating dicts stored on the class would leak the
    # maps of one ETL object into every other one
    self.type_maps = {}
    self.invtype_maps = {}

    # load type maps if requested
    if path_to_type_maps is not None:
      base_dir = path_to_type_maps
      for fp in os.listdir(base_dir):
        if "type" not in fp:
          continue
        # file names look like "type1_map.json" -> map name "type1"
        type_name = fp.split("_")[0]
        with open(os.path.join(base_dir, fp), "r") as input_file:
          level_map = json.load(input_file)[type_name]
        self.type_maps[type_name] = level_map
        self.invtype_maps[type_name] = {ind: ontology for ontology, ind in level_map.items()}

    # load data vectorizers if requested
    if path_to_vectorizers is not None:
      base_dir = path_to_vectorizers
      for fp in os.listdir(base_dir):
        if "vectorizer" not in fp:
          continue
        # file names look like "count_vectorizer.pkl" -> kind "count"
        vectorizer_name = fp.split("_")[0]
        if vectorizer_name not in ("count", "tfidf"):
          raise NotImplementedError(f"unsupported vectorizer file: {fp}")
        with open(os.path.join(base_dir, fp), "rb") as input_file:
          # NOTE: pickle.load can execute arbitrary code - only point this
          # at files you created yourself
          vectorizer = pickle.load(input_file)
        if vectorizer_name == "count":
          self.count_vectorizer = vectorizer
          self.inv_count_vectorizer_vocab = self._invert_vocab(vectorizer)
        else:
          self.tfidf_vectorizer = vectorizer
          self.inv_tfidf_vectorizer_vocab = self._invert_vocab(vectorizer)


  # reverse mapping (feature index -> term) of a vectorizer's vocabulary;
  # None when the vectorizer has no fitted vocabulary_
  @staticmethod
  def _invert_vocab(vectorizer):
    vocab = getattr(vectorizer, "vocabulary_", None)
    if vocab is None:
      return None
    return {ind: term for term, ind in vocab.items()}


  # split training dataset to exclude validation dataset
  # set train:val = 8:2
  def split_data(self, data, val_size = 0.2):
    """Split ``data`` into (train, validation) frames.

    ``val_size`` is the validation fraction; the split is seeded with the
    module-level ``seed``. Independent copies are returned so that adding
    columns later (as norm_data does) does not raise pandas'
    SettingWithCopyWarning.
    """
    df_train, df_test = model_selection.train_test_split(data, test_size = val_size, random_state = seed)
    return df_train.copy(), df_test.copy()


  # normalization of question sentences
  def _norm_sent(self, sent, rm_stopwords = True, stemming = True, lemmatization = True):
    """Return ``sent`` as a space-joined string of normalized tokens.

    Tokens that are not purely alphabetic are dropped and the rest are
    lower-cased; stop-word removal, stemming and lemmatization are then
    applied in that order when their flags are set.
    """
    # tokenize - sentence to word
    words = word_tokenize(sent)
    # take if all characters in the string are alphabets and then decapitalize
    sent = [w.lower() for w in words if w.isalpha()]

    # remove stopwords
    if rm_stopwords:
      sent = [w for w in sent if w not in self.s_words]

    # apply stemming
    if stemming:
      sent = [self.ps.stem(w) for w in sent]

    # apply lemmatization
    if lemmatization:
      sent = [self.wordnet_lemmatizer.lemmatize(w) for w in sent]

    sent = " ".join(sent)
    return sent


  # add a new column to show how question parsing has done through normalization above
  # for Tabular representation of the dataset
  def norm_data(self, data):
    """Add a "question_processed" column to ``data`` in place and return it.

    Questions are lemmatized only: stop words are kept and no stemming is
    applied.
    """
    data.loc[:, "question_processed"] = data["question"].apply(lambda x: self._norm_sent(x, rm_stopwords = False, lemmatization = True, stemming = False))
    return data



  # vectorization - fit vectorizer to training data
  def bow_fit(self, corpus, type = "tf", max_features = 10000, ngram_range = (1,2)):
    """Fit a bag-of-words vectorizer on ``corpus["question_processed"]``.

    ``type`` selects the vectorizer: "tf" (term counts) or "tfidf". The
    fitted vectorizer and its index -> term map are stored on the instance.

    Raises NotImplementedError for any other ``type``.
    """
    if type == "tf":
      self.count_vectorizer = feature_extraction.text.CountVectorizer(max_features = max_features, ngram_range = ngram_range)
      self.count_vectorizer.fit(corpus["question_processed"])
      # create a reverse mapping for the vocab
      self.inv_count_vectorizer_vocab = self._invert_vocab(self.count_vectorizer)

    elif type == "tfidf":
      self.tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(max_features = max_features, ngram_range = ngram_range)
      self.tfidf_vectorizer.fit(corpus["question_processed"])
      # create a reverse mapping for the vocab
      self.inv_tfidf_vectorizer_vocab = self._invert_vocab(self.tfidf_vectorizer)

    else:
      # previously the exception class was returned, which hid the error
      raise NotImplementedError(f"unknown bag-of-words type: {type!r}")


  # transformation
  def bow_transform(self, data, type = "tf"):
    """Vectorize ``data["question_processed"]`` with the fitted vectorizer.

    ``type`` is "tf" or "tfidf"; any other value raises NotImplementedError.
    """
    if type == "tf":
      return self.count_vectorizer.transform(data["question_processed"])
    elif type == "tfidf":
      return self.tfidf_vectorizer.transform(data["question_processed"])
    else:
      # previously the exception class was returned, which hid the error
      raise NotImplementedError(f"unknown bag-of-words type: {type!r}")


  # category maps
  def category_to_int(self, data):
    """Encode ``data.category`` as integers using ``category_map``."""
    return data.category.map(lambda x: self.category_map[x])

  # literal maps
  def literal_to_int(self, data):
    """Encode the first entry of each ``data.type`` list using ``literal_map``."""
    return data.type.map(lambda x: self.literal_map[x[0]])


  # distribute type by ontology class and encode missing if none
  def type_to_int(self, data, type_no):
    """Encode the ``type_no``-th (1-based) entry of each ``data.type`` list.

    NOTE(review): rows with fewer than ``type_no`` types look up a "missing"
    key that add_type_maps never registers, so they raise KeyError; callers
    are expected to filter such rows out beforehand.
    """
    return data.type.map(
        lambda x: self.type_maps[f"type{type_no}"][x[type_no - 1]]
        if len(x) >= type_no
        else self.type_maps[f"type{type_no}"]["missing"]
        )


  # resource maps
  def add_type_maps(self, train_data, depth = 6, save = True, path = "resource_types/"):
    """Build the ontology <-> integer maps for resource type levels.

    Levels 1 .. ``depth`` - 1 are built from the "type" lists of the rows
    whose category is "resource"; ontologies are numbered in order of first
    appearance. With ``save`` set, each level is written to
    ``<path>/type<level>_map.json`` as ``{"type<level>": {ontology: index}}``,
    which is the layout the constructor reads back.
    """
    levels = range(1, depth)

    if save:
      os.makedirs(path, exist_ok = True)

    # the row filter does not depend on the level, so apply it once
    resource_types = train_data[train_data["category"] == "resource"]["type"]

    for l in levels:
      type_name = f"type{l}"
      level_map = {}
      inv_level_map = {}
      for types in resource_types:
        # rows that are too short have no type at this level
        if len(types) < l:
          continue
        ontology = types[l-1]
        if (ontology not in level_map) and (ontology != "missing"):
          ind = len(level_map)
          level_map[ontology] = ind
          inv_level_map[ind] = ontology
      self.type_maps[type_name] = level_map
      self.invtype_maps[type_name] = inv_level_map

      if save:
        # honor ``path`` (the directory used to be hard-coded here)
        with open(os.path.join(path, f"type{l}_map.json"), "w") as outfile:
          json.dump({type_name: level_map}, outfile)




  # save output
  def save_vectorizers(self, path):
    """Pickle whichever vectorizers have been fitted or loaded into ``path``."""
    # make sure directory exists
    os.makedirs(path, exist_ok = True)

    if self.count_vectorizer is not None:
      with open(os.path.join(path, "count_vectorizer.pkl"), "wb") as count_file:
        pickle.dump(self.count_vectorizer, count_file)
    if self.tfidf_vectorizer is not None:
      with open(os.path.join(path, "tfidf_vectorizer.pkl"), "wb") as tfidf_file:
        pickle.dump(self.tfidf_vectorizer, tfidf_file)

In [9]:
etl = ETL()

# split dataset
df_train, df_val = etl.split_data(train_data)

# text normalization
# norm_data adds a column in place, so hand it an independent copy rather
# than the slice produced by the split (avoids SettingWithCopyWarning)
df_train = etl.norm_data(df_train.copy())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [10]:
# preview the training split, now including the "question_processed" column
df_train

Unnamed: 0,id,question,category,type,question_processed
14046,17454,What is the namesake of nobelium used by the d...,resource,"[dbo:Person, dbo:Agent]",what is the namesake of nobelium used by the d...
21427,26717,What is the nationality of emilio comte?,resource,"[dbo:Person, dbo:Place, dbo:Location, dbo:Coun...",what is the nationality of emilio comte
28996,36200,which country was sharon waxman born in,resource,"[dbo:Organisation, dbo:Country, dbo:Person, db...",which country wa sharon waxman born in
38039,47512,What type of music is bobby kildea known for,resource,"[dbo:Genre, dbo:TopicalConcept, dbo:MusicGenre]",what type of music is bobby kildea known for
5748,7189,Is it true that Roman numerals use the Roman a...,boolean,[boolean],is it true that roman numeral use the roman al...
...,...,...,...,...,...
27253,34020,what country made the film gymnoi sto dromo,resource,"[dbo:Organisation, dbo:Agent, dbo:Place, dbo:L...",what country made the film gymnoi sto dromo
7194,8987,How many inventions were fone by Sagittarius A ?,literal,[number],how many invention were fone by sagittarius a
34803,43455,whats was the birthplace of james smith (deleg...,resource,"[dbo:Place, dbo:Location, dbo:Settlement, dbo:...",whats wa the birthplace of james smith delegate
27325,34112,Name a baseball player,resource,"[dbo:Person, dbo:Athlete, dbo:Agent, dbo:Baseb...",name a baseball player


In [11]:
# preview the validation split (not normalized yet at this point)
df_val

Unnamed: 0,id,question,category,type
692,866,What is the medical speciality of the cause of...,resource,[dbo:MedicalSpecialty]
30990,38676,where in the middle east is mohammad-reza hona...,resource,"[dbo:Place, dbo:Location, dbo:City, dbo:Settle..."
32692,40840,which famous person was born in armenia,resource,"[dbo:Person, dbo:Politician, dbo:Agent]"
22765,28411,What is the second level division of douglas c...,resource,"[dbo:Organisation, dbo:Country, dbo:Person, db..."
9646,12041,Which part of Adélie Land has a the second-lev...,resource,"[dbo:Country, dbo:State, dbo:PopulatedPlace, d..."
...,...,...,...,...
12831,15961,How many books have been penned by Miguel de C...,literal,[number]
20445,25491,which country was michael jackson born,resource,"[dbo:Organisation, dbo:Country, dbo:Person, db..."
2858,3565,Which is Academia Brasileira de Letras ID for ...,literal,[string]
958,1198,Name the city with leader as Esther Alder and ...,resource,"[dbo:Settlement, dbo:PopulatedPlace, dbo:Place..."


In [12]:
# vectorization - bag of words model
# fit the term-count vectorizer on the training questions, then persist it
# so the evaluation stage can reload it from disk
etl.bow_fit(df_train, "tf")
etl.save_vectorizers("sklearn_objects")

## 2. Category Prediction Task

In [13]:
# set category prediction dataset:
# term-count features for the questions, integer-encoded category labels
X_train_category = etl.bow_transform(df_train, type = "tf")
y_train_category = etl.category_to_int(df_train)

In [14]:
# model for category classification
# elastic-net regularized logistic regression (saga solver)
clf_category = LogisticRegression(
    penalty = 'elasticnet',
    solver = 'saga',
    l1_ratio = 0.2,
    random_state = seed,
    n_jobs = -1,
    verbose = 2,
)
clf_category = clf_category.fit(X_train_category, y_train_category)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


max_iter reached after 277 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  4.6min finished


In [15]:
# accuracy on the training data itself (not a held-out estimate)
clf_category.score(X_train_category, y_train_category)

0.9865444883499115

## 3-1. Type Prediction Task - Literal

In [16]:
# model for literal classification
# boolean mask selecting the literal questions of the training split
train_literal_rows = df_train["category"].eq("literal").values
y_train_literal = etl.literal_to_int(df_train[train_literal_rows])

# same model family as the category classifier, trained on literal rows only
clf_literal = LogisticRegression(
    penalty = 'elasticnet',
    solver = 'saga',
    l1_ratio = 0.5,
    random_state = seed,
    n_jobs = -1,
    verbose = 2,
)
clf_literal = clf_literal.fit(X_train_category[train_literal_rows, :], y_train_literal)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


max_iter reached after 5 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.4s finished


In [17]:
# accuracy on the literal training rows themselves (not a held-out estimate)
clf_literal.score(X_train_category[train_literal_rows], y_train_literal)

0.9714879714879715

## 3-2. Type Prediction Task - Resource



***Identify types of resources in the train data***

In [18]:
# build the per-level ontology <-> integer maps from the training split;
# with the default arguments they are also written under "resource_types/"
etl.add_type_maps(df_train)

In [19]:
# one classifier per resource type level (levels 1-5), kept in level order
resource_models = []

for l in range(1, 6):
  # model for resource classification
  # keep only resource questions that actually have a type at level l
  train_resource_rows = (
      (df_train["category"] == "resource")
      & (df_train["type"].map(len) >= l)
  ).values
  y_train_type = etl.type_to_int(df_train[train_resource_rows], type_no = l)

  clf_type = MLPClassifier(
      hidden_layer_sizes = (1000, 500, 300),
      max_iter = 10,
      random_state = seed,
      verbose = 2,
  )
  clf_type.fit(X_train_category[train_resource_rows], y_train_type)

  resource_models.append(clf_type)

Iteration 1, loss = 2.29953966
Iteration 2, loss = 1.09426166
Iteration 3, loss = 0.67409338
Iteration 4, loss = 0.41182201
Iteration 5, loss = 0.26817618
Iteration 6, loss = 0.20943149
Iteration 7, loss = 0.16555311
Iteration 8, loss = 0.14239842
Iteration 9, loss = 0.12992806
Iteration 10, loss = 0.11503049




Iteration 1, loss = 2.28284059
Iteration 2, loss = 1.08386417
Iteration 3, loss = 0.64587422
Iteration 4, loss = 0.40470246
Iteration 5, loss = 0.28361956
Iteration 6, loss = 0.22556878
Iteration 7, loss = 0.18703645
Iteration 8, loss = 0.16422379
Iteration 9, loss = 0.14626329
Iteration 10, loss = 0.13530534




Iteration 1, loss = 2.32074891
Iteration 2, loss = 1.23872710
Iteration 3, loss = 0.83656639
Iteration 4, loss = 0.57757583
Iteration 5, loss = 0.43190912
Iteration 6, loss = 0.35050438
Iteration 7, loss = 0.29367616
Iteration 8, loss = 0.26347301
Iteration 9, loss = 0.23719232
Iteration 10, loss = 0.21795415




Iteration 1, loss = 2.54329641
Iteration 2, loss = 1.48033644
Iteration 3, loss = 1.02821009
Iteration 4, loss = 0.72233643
Iteration 5, loss = 0.55701616
Iteration 6, loss = 0.43769394
Iteration 7, loss = 0.36431524
Iteration 8, loss = 0.32122642
Iteration 9, loss = 0.29553595
Iteration 10, loss = 0.27140162




Iteration 1, loss = 2.23569564
Iteration 2, loss = 1.31071397
Iteration 3, loss = 0.97149563
Iteration 4, loss = 0.71956873
Iteration 5, loss = 0.55829689
Iteration 6, loss = 0.45505679
Iteration 7, loss = 0.36935108
Iteration 8, loss = 0.31848114
Iteration 9, loss = 0.28369004
Iteration 10, loss = 0.27001745




In [20]:
# NOTE: clf_type, train_resource_rows and y_train_type hold whatever the
# training loop assigned last, so this is the training accuracy of the final
# (level 5) resource model only
clf_type.score(X_train_category[train_resource_rows], y_train_type) 

0.9355706598697253

## 4. Save models 

In [21]:
# persist the category classifier next to the saved vectorizer
category_model_path = os.path.join("sklearn_objects", "category_model.pkl")
with open(category_model_path, "wb") as mdl_file:
  pickle.dump(clf_category, mdl_file)

In [22]:
# persist the literal-type classifier
literal_model_path = os.path.join("sklearn_objects", "literal_model.pkl")
with open(literal_model_path, "wb") as mdl_file:
  pickle.dump(clf_literal, mdl_file)

In [23]:
# persist the five resource-level classifiers; resource_models[0] is level 1
for l in range(1, 6):
  level_model_path = os.path.join("sklearn_objects", f"resource_level_{l}_model.pkl")
  with open(level_model_path, "wb") as mdl_file:
    pickle.dump(resource_models[l - 1], mdl_file)

## 5. Results & Evaluation

***Load Pre-trained models***

In [24]:
# reload the category classifier from disk
# NOTE: pickle.load can execute arbitrary code - only load trusted files
with open("sklearn_objects/category_model.pkl", "rb") as clf_cat_file:
  clf_category = pickle.load(file = clf_cat_file)

In [25]:
# reload the literal-type classifier from disk
# NOTE: pickle.load can execute arbitrary code - only load trusted files
with open("sklearn_objects/literal_model.pkl", "rb") as clf_lit_file:
  clf_literal = pickle.load(file = clf_lit_file)

In [26]:
# reload the five resource-level classifiers in level order (index 0 = level 1)
# NOTE: pickle.load can execute arbitrary code - only load trusted files
resource_models = []
for l in range(1, 6):
  with open(f"sklearn_objects/resource_level_{l}_model.pkl", "rb") as res_mdl:
    level_model = pickle.load(res_mdl)
  resource_models.append(level_model)

***Load processing class***

In [27]:
# fresh ETL instance that reads the saved type maps and vectorizers back
# from disk instead of reusing the objects fitted earlier in the notebook
etl = ETL(path_to_type_maps="resource_types", path_to_vectorizers="sklearn_objects")

In [28]:
class ModelEvaluation:
  """Runs the category -> type prediction cascade over a frame of questions.

  Parameters
  ----------
  etl_inst : object providing ``norm_data``, ``bow_transform``,
      ``category_map``, ``inv_literal_map`` and ``invtype_maps``
  cat_model : fitted classifier predicting the integer category
  lit_model : fitted classifier predicting the integer literal type
  res_models : sequence of fitted classifiers, one per resource type level;
      position ``i`` is decoded with ``invtype_maps["type<i+1>"]``
  """

  def __init__(self, etl_inst, cat_model, lit_model, res_models):
    self.etl_inst = etl_inst
    self.cat_model = cat_model
    self.lit_model = lit_model
    self.res_models = res_models

  # X is a df
  def get_predictions(self, X, bow_type = "tf"):
    """Return a copy of ``X`` with "cat_prediction" and "type_prediction".

    The copy has a fresh 0..n-1 index and also carries the
    "question_processed" column added during normalization.
    "cat_prediction" is the category label; "type_prediction" is a list:
    ``["boolean"]``, a single literal type, or one ontology per resource
    level model. ``X`` itself is not modified.
    """
    X = X.copy()
    X.reset_index(inplace = True, drop = True)

    # nothing to predict: the models cannot be called on zero rows
    if len(X) == 0:
      X["cat_prediction"] = pd.Series(dtype = object)
      X["type_prediction"] = pd.Series(dtype = object)
      return X

    X_norm = self.etl_inst.norm_data(X)
    X_vec = self.etl_inst.bow_transform(X_norm, type = bow_type)

    bool_int = self.etl_inst.category_map["boolean"]
    literal_int = self.etl_inst.category_map["literal"]
    resource_int = self.etl_inst.category_map["resource"]

    cat_pred = np.asarray(self.cat_model.predict(X_vec))

    # boolean row masks, one per predicted category
    ind_bool = cat_pred == bool_int
    ind_literal = cat_pred == literal_int
    ind_resource = cat_pred == resource_int

    # predictions are collected per row position and attached at the end
    cat_names = [None] * len(X)
    type_preds = [None] * len(X)

    for pos in ind_bool.nonzero()[0]:
      cat_names[pos] = "boolean"
      type_preds[pos] = ["boolean"]

    # the masks always have one entry per row, so test .any() (not len) to
    # avoid calling a model on an empty selection
    if ind_literal.any():
      literal_pred = self.lit_model.predict(X_vec[ind_literal])
      for pos, pred in zip(ind_literal.nonzero()[0], literal_pred):
        cat_names[pos] = "literal"
        type_preds[pos] = [self.etl_inst.inv_literal_map[pred]]

    if ind_resource.any():
      X_res = X_vec[ind_resource]
      # per_level[k][r]: ontology predicted at level k+1 for resource row r
      per_level = []
      for level, type_model in enumerate(self.res_models, start = 1):
        inv_map = self.etl_inst.invtype_maps[f"type{level}"]
        per_level.append([inv_map[pred] for pred in type_model.predict(X_res)])
      for row, pos in enumerate(ind_resource.nonzero()[0]):
        cat_names[pos] = "resource"
        type_preds[pos] = [names[row] for names in per_level]

    X["cat_prediction"] = pd.Series(cat_names, index = X.index, dtype = object)
    X["type_prediction"] = pd.Series(type_preds, index = X.index, dtype = object)

    # returned unconditionally (this used to sit inside the resource branch)
    return X

  def output_predictions(self):
    """Placeholder; not implemented yet."""
    raise NotImplementedError("output_predictions is not implemented")

In [29]:
# wire the reloaded ETL instance and models into the prediction cascade
me = ModelEvaluation(etl, clf_category, clf_literal, resource_models)

***Validate***

In [37]:
# predict category and types for the held-out validation questions
out_val = me.get_predictions(df_val)

# ground truth as a list of per-question dicts
# ("id" is not a valid `orient`; "records" yields the list of row dicts directly)
true_output = out_val.loc[:, ["id", "question", "category", "type"]]
true_output_dict = true_output.to_dict(orient = "records")

# system output in the same layout, with the prediction columns renamed to
# the field names used by the ground truth
system_output = out_val.loc[:, ["id", "cat_prediction", "type_prediction"]].rename(
    columns = {"cat_prediction": "category", "type_prediction": "type"})
system_output_dict = system_output.to_dict(orient = "records")



***Run Evaluation***

In [38]:
# write the validation ground truth and the system predictions as the two
# JSON files handed to the evaluation script
os.makedirs("system_output/", exist_ok = True)

for file_name, records in (
    ("ground_truth_json.json", true_output_dict),
    ("system_output_json.json", system_output_dict),
):
  with open(os.path.join("system_output", file_name), "w") as json_file:
    json.dump(records, json_file)

In [39]:
# score the predictions against the ground truth written by the previous
# cell; evaluate.py and dbpedia_types.tsv are looked up in the working directory
!python evaluate.py --type_hierarchy_tsv dbpedia_types.tsv  \
 --ground_truth_json system_output/ground_truth_json.json \
 --system_output_json system_output/system_output_json.json

Loading type hierarchy from dbpedia_types.tsv... 761 types loaded (max depth: 7)
Loading ground truth from system_output/ground_truth_json.json... 
   7340 questions loaded
Loading system predictions from system_output/system_output_json.json... 
   7340 predictions loaded


Evaluation results:
-------------------
Category prediction (based on 7340 questions)
  Accuracy: 0.973
Type ranking (based on 7340 questions)
  NDCG@5:  0.732
  NDCG@10: 0.649


***Test***

In [40]:
# predict category and types for the test questions; only the system output
# is built here (no ground-truth frame is produced for the test set)
out_test = me.get_predictions(test_data)

system_output = out_test.loc[:, ["id", "cat_prediction", "type_prediction"]]
system_output.columns = ["id", "category", "type"]

# one dict per question, in row order
system_output_dict = system_output.to_dict(orient = "records")

In [41]:
# write the test-set predictions
# NOTE: this is the same file the validation step wrote, so it is overwritten
os.makedirs("system_output/", exist_ok = True)

test_output_path = os.path.join("system_output", "system_output_json.json")
with open(test_output_path, "w") as sfile:
  json.dump(system_output_dict, sfile)