## 0. Environment setup

In [1]:
# import basics
import pandas as pd
import json
import numpy as np
import os
import pickle
seed = 20211001

In [2]:
# import nlp relevants
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# for bag-of-words (bow)
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chaey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chaey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chaey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# import scikit-learn tools for modelling and evaluation
from sklearn.model_selection import train_test_split

In [4]:
# import algos
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

## 1. Data Loading & Manipulation

***Tabular representation of the dataset***

In [5]:
# load dataset
train_data = pd.read_json('C:/Users/chaey/INM713-python-main/smart-2021-dataset-main/smart2021-AT_Answer_Type_Prediction/dbpedia/task1_dbpedia_train.json')
test_data = pd.read_json('C:/Users/chaey/INM713-python-main/smart-2021-dataset-main/smart2021-AT_Answer_Type_Prediction/dbpedia/task1_dbpedia_test.json')

# drop na in training dataset
train_data.dropna(subset=['id', 'question', 'category'], inplace=True)

# check the format of tabluar representation
train_data

Unnamed: 0,id,question,category,type
0,0,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,1,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,2,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,3,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,5,Which is the hierarchical BrainInfo ID of the ...,literal,[string]
...,...,...,...,...
40616,50708,what kinds of music is played by season's end,resource,"[dbo:Genre, dbo:TopicalConcept, dbo:MusicGenre]"
40617,50709,which asteroid group is 6753 fursenko a member...,resource,[dbo:Album]
40618,50710,What language is azhakiya ravanan filmed in?,resource,[dbo:Language]
40619,50712,which position did herby fortunat play in foot...,resource,[dbo:Person]


## 1. Preprocessing

***Extract, Transform, Load (ETL)***

In [6]:
class ETL:

    # split training dataset to exclude validation dataset
    # set train:val = 8:2
    def split_data(self, data, val_size = 0.2):
        df_train, df_test = model_selection.train_test_split(data, test_size = val_size, random_state = seed)
        return df_train, df_test


    # text normalization - stemming, lemmatization, stopwords
    ps = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer() 
    s_words = stopwords.words()

    # normalization of question sentences
    def _norm_sent(self, sent, rm_stopwords = True, stemming = True, lemmatization = True):
        # tokenize - sentence to word
        words = word_tokenize(sent)
        # take if all characters in the string are alphabets and then decapitalize
        sent = [w.lower() for w in words if w.isalpha()] 

        # remove stopwords
        if rm_stopwords:
            sent = [w for w in sent if w not in self.s_words]

        # apply stemming 
        if stemming:
            sent = [self.ps.stem(w) for w in sent]

        # apply lemmatization 
        if lemmatization:
            sent = [self.wordnet_lemmatizer.lemmatize(w) for w in sent]

        sent = " ".join(sent)
        return sent  
  
  # add a new column to show how question parsing has done through normalization above
  # for Tabular representation of the dataset
    def norm_data(self, data):   
        data.loc[:, "question_processed"] = data["question"].apply(lambda x: self._norm_sent(x, rm_stopwords = False, lemmatization = True, stemming = False))
        return data



    # transform a given text into a vector
    def __init__(self, path_to_type_maps = None, path_to_vectorizers = None):

        # load type maps if requested
        self.type_maps = {}
        self.inv_type_maps = {}
        if path_to_type_maps != None:
            paths = [fp for fp in os.listdir("./") if "type" in fp]
            for fp in paths:
                type_name = fp.split("_")[0]
                with open(fp, "r") as input_file:
                    self.type_maps[type_name] = json.load(input_file)
                    self.inv_type_maps[type_name] = {}
                    for ontology, ind in self.type_maps[type_name].items():
                        self.inv_type_maps[type_name][ind] = ontology 

        # load data vectorizers if requested
        if path_to_vectorizers != None:
            paths = [fp for fp in os.listdir("./") if "vectorizer" in fp]
            for fp in paths:
                vectorizer_name = fp.split("_")[0]
                with open(fp, "rb") as input_file:
                    if vectorizer_name == "count":
                        self.count_vectorizer = pickle.load(input_file)
                    elif vectorizer_name == "tfidf":
                        self.tfidf_vectorizer = pickle.load(input_file)
                    else:
                        NotImplementedError



    # set default feature_extraction parameters
    count_vectorizer = None 
    inv_count_vectorizer_vocab = None
    tfidf_vectorizer = None
    inv_tfidf_vectorizer_vocab = None

    # vectorization
    def bow_fit(self, corpus, type = "tf", max_features = 10000, ngram_range = (1,2)):

        if type == "tf":
            self.count_vectorizer = feature_extraction.text.CountVectorizer(max_features = max_features, ngram_range = ngram_range)
            self.count_vectorizer.fit(corpus["question_processed"])

            # create a reverse mapping for the vocab
            self.inv_count_vectorizer_vocab = {}
            for label, ind in self.count_vectorizer.vocabulary_.items():
                self.inv_count_vectorizer_vocab[ind] = label

        elif type == "tfidf": 
            self.tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(max_features = max_features, ngram_range = ngram_range)
            self.tfidf_vectorizer.fit(corpus["question_processed"])

            # create a reverse mapping for the vocab
            self.inv_tfidf_vectorizer_vocab = {}
            for label, ind in self.tfidf_vectorizer.vocabulary_.items():
                self.inv_tfidf_vectorizer_vocab[ind] = label

        else:
            return NotImplementedError



    # transformation
    def bow_transform(self, data, type = "tf"):
        if type == "tf":
            return self.count_vectorizer.transform(data["question_processed"])
        elif type == "tfidf":
            return self.tfidf_vectorizer.transform(data["question_processed"])
        else:
            return NotImplementedError


    # mapping category column's value to integer
    category_map = {"boolean": 0, "resource": 1, "literal": 2}
    inv_category_map = {}

    for label, ind in category_map.items():
        inv_category_map[ind] = label

    def category_to_int(self, data):
        return data.category.map(lambda x: self.category_map[x])


    # mapping type-literal value to integer
    literal_map = {"date": 0, "string": 1, "number": 2}
    inv_literal_map = {}

    for label, ind in literal_map.items():
        inv_literal_map[ind] = label

    def literal_to_int(self, data):
        return data.type.map(lambda x: self.literal_map[x[0]])


    # distribute type by ontology class and encode missing if none
    def type_to_int(self, data, type_no):
        return data.type.map(
            lambda x: self.type_maps[f"type{type_no}"][x[type_no - 1]] 
            if len(x) >= type_no 
            else self.type_maps[f"type{type_no}"]["missing"]
            )


    # save output
    def save_vectorizers(self):
        if self.count_vectorizer != None:
            with open("count_vectorizer.pkl", "wb") as count_file:
                pickle.dump(self.count_vectorizer, count_file)
        if self.tfidf_vectorizer != None:
            with open("tfidf_vectorizer.pkl", "wb") as tfidf_file:
                pickle.dump(self.tfidf_vectorizer, tfidf_file)

In [7]:
etl = ETL(path_to_type_maps="./", path_to_vectorizers="./")

# split dataset
df_train, df_val = etl.split_data(train_data)

# text normalization
df_train = etl.norm_data(df_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [8]:
df_train

Unnamed: 0,id,question,category,type,question_processed
23782,29658,What film did zhang yimou direct?,resource,"[dbo:Film, dbo:Work]",what film did zhang yimou direct
18007,22426,who is the governor of hawaii now,resource,"[dbo:Person, dbo:Politician, dbo:Agent, dbo:Of...",who is the governor of hawaii now
19818,24691,where was rihanna born and raised,resource,"[dbo:Place, dbo:Location, dbo:Settlement, dbo:...",where wa rihanna born and raised
10011,12483,Which is the authority for the congress of the...,resource,"[dbo:Meeting, dbo:SocietalEvent, dbo:Event]",which is the authority for the congress of the...
23520,29329,in what german city did hermann beckh die,resource,"[dbo:Place, dbo:Location, dbo:Settlement, dbo:...",in what german city did hermann beckh die
...,...,...,...,...,...
26865,33519,what is a lower classification of bear,resource,"[dbo:Mammal, dbo:Mammal, dbo:Animal, dbo:Insec...",what is a lower classification of bear
7188,8979,What is the natural abundance of helium-3?,literal,[number],what is the natural abundance of
34415,42977,who is sanjay khan's child?,resource,"[dbo:Person, dbo:Agent]",who is sanjay khan child
26937,33609,what type of music does does it look like im h...,resource,"[dbo:Genre, dbo:TopicalConcept, dbo:MusicGenre]",what type of music doe doe it look like im her...


In [9]:
df_val

Unnamed: 0,id,question,category,type
22014,27463,What type of object is 3809 amici,resource,[dbo:Planet]
18208,22675,what college did matt schaub play for,resource,[]
37635,47013,What country made horses of god,resource,"[dbo:Place, dbo:Location, dbo:ArchitecturalStr..."
23306,29074,What company produced the film changeling?,resource,"[dbo:Company, dbo:Film, dbo:Organisation, dbo:..."
1802,2251,The Maurya Empire covered which modern-day cou...,resource,"[dbo:Person, dbo:Agent]"
...,...,...,...,...
18340,22838,where is mallorca,resource,"[dbo:Place, dbo:Location, dbo:Sea, dbo:BodyOfW..."
13342,16578,Count the number of first drivers in all the G...,literal,[number]
13426,16685,Is the individual tax rate in Sweden 25%?,boolean,[boolean]
26658,33265,who wrote a wizard abroad?,resource,"[dbo:Person, dbo:Writer, dbo:Agent]"


In [10]:
# vectorization - bag of words model
etl.bow_fit(corpus = df_train, type = "tf")
etl.save_vectorizers()

## 2. Category Prediction Task

In [11]:
# set category prediction dataset
X_train_category = etl.bow_transform(df_train)
y_train_category = etl.category_to_int(df_train)

In [12]:
# model for category classification
clf_category = LogisticRegression(
    random_state=seed, penalty = 'elasticnet', solver = 'saga',
    l1_ratio = 0.2, n_jobs = -1, verbose = 2)\
    .fit(X_train_category, y_train_category)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 177 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  2.9min finished


In [13]:
clf_category.score(X_train_category, y_train_category)

0.9858765631846833

## 3-1. Type Prediction Task - Literal

In [14]:
# model for literal classification
# get which rows are for literal only  
train_literal_rows = (df_train["category"] == "literal").values
y_train_literal = etl.literal_to_int(df_train[train_literal_rows])

clf_literal = LogisticRegression(
    random_state=seed, penalty = 'elasticnet', solver = 'saga',
    l1_ratio = 0.5, n_jobs = -1, verbose = 2
    )\
    .fit(X_train_category[train_literal_rows, :], y_train_literal)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 3 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.4s finished


In [15]:
clf_literal.score(X_train_category[train_literal_rows], y_train_literal)

0.972822910578609

## 3-2. Type Prediction Task - Resource



***Identify types of resources in the train data***

In [16]:
levels = range(1, 6)

for l in levels:
    ind = 0
    temp_dict = {}
    temp_df = df_train[df_train["category"] == "resource"]["type"].map(lambda x: x[l-1] if len(x) >= l else "missing").to_frame(f"type{l}")
    for ontology in temp_df[f"type{l}"]:
        if ontology not in temp_dict:
            temp_dict[ontology] = ind 
            ind += 1
    with open(f"type{l}_map.json", "w") as outfile:
        temp_json_obj = json.dump(temp_dict, outfile)

In [17]:
resource_models = []

for l in range(1, 6):
    # model for literal classification
    # get which rows are for literal only  
    train_resource_rows = (df_train["category"] == "resource").values
    y_train_type = etl.type_to_int(df_train[train_resource_rows], type_no=l)

    clf_type = MLPClassifier(
      random_state=seed, max_iter=10, hidden_layer_sizes=(1000, 500, 300)
      , verbose = 2).\
    fit(X_train_category[train_resource_rows], y_train_type)

    resource_models.append(clf_type)

Iteration 1, loss = 2.26369687
Iteration 2, loss = 1.07734467
Iteration 3, loss = 0.65226379
Iteration 4, loss = 0.40591306
Iteration 5, loss = 0.27326443
Iteration 6, loss = 0.20030961
Iteration 7, loss = 0.16221546
Iteration 8, loss = 0.14199448
Iteration 9, loss = 0.12467818
Iteration 10, loss = 0.11484542




Iteration 1, loss = 2.08586444
Iteration 2, loss = 1.02620969
Iteration 3, loss = 0.63665499
Iteration 4, loss = 0.41644748
Iteration 5, loss = 0.29253741
Iteration 6, loss = 0.23276518
Iteration 7, loss = 0.19529822
Iteration 8, loss = 0.17462429
Iteration 9, loss = 0.16215116
Iteration 10, loss = 0.14617416




Iteration 1, loss = 1.81027879
Iteration 2, loss = 0.95233640
Iteration 3, loss = 0.64082126
Iteration 4, loss = 0.43626876
Iteration 5, loss = 0.32557114
Iteration 6, loss = 0.26302589
Iteration 7, loss = 0.21794966
Iteration 8, loss = 0.19488477
Iteration 9, loss = 0.17331330
Iteration 10, loss = 0.16427897




Iteration 1, loss = 1.50571014
Iteration 2, loss = 0.82552376
Iteration 3, loss = 0.56220561
Iteration 4, loss = 0.39355751
Iteration 5, loss = 0.29124314
Iteration 6, loss = 0.23135278
Iteration 7, loss = 0.19357294
Iteration 8, loss = 0.17457585
Iteration 9, loss = 0.15960374
Iteration 10, loss = 0.14643053




Iteration 1, loss = 0.98857775
Iteration 2, loss = 0.53605357
Iteration 3, loss = 0.38768069
Iteration 4, loss = 0.27935119
Iteration 5, loss = 0.21317966
Iteration 6, loss = 0.17591220
Iteration 7, loss = 0.15072062
Iteration 8, loss = 0.13822981
Iteration 9, loss = 0.12344304
Iteration 10, loss = 0.11715094




In [18]:
clf_type.score(X_train_category[train_resource_rows], y_train_type) 

0.971252399036302

## 4. Save models 

In [19]:
with open("category_model.pkl", "wb") as mdl_file:
    pickle.dump(clf_category, mdl_file)

In [20]:
with open("literal_model.pkl", "wb") as mdl_file:
    pickle.dump(clf_literal, mdl_file)

In [21]:
for l in range(1,6):
    with open(f"resource_level_{l}_model.pkl", "wb") as mdl_file:
        pickle.dump(resource_models[l-1], mdl_file)

## 5. Results & Evaluation

In [22]:
class ModelEvaluation:
    def __init__(self, etl_inst, cat_model, lit_model, res_models):
        self.etl_inst = etl_inst
        self.cat_model = cat_model
        self.lit_model = lit_model
        self.res_models = res_models
  
    # X is a df
    def get_predictions(self, X, bow_type = "tf"):

        X = X.copy()

        X.reset_index(inplace = True, drop = True)

        X_norm = self.etl_inst.norm_data(X)
        X_vec = self.etl_inst.bow_transform(X_norm, type = bow_type)

        bool_int = self.etl_inst.category_map["boolean"]
        literal_int = self.etl_inst.category_map["literal"]
        resource_int = self.etl_inst.category_map["resource"]

        cat_pred = self.cat_model.predict(X_vec)

        ind_bool = cat_pred == bool_int
        ind_literal = cat_pred == literal_int
        ind_resource = cat_pred == resource_int

        if len(ind_bool) > 0:
            X.loc[ind_bool, "cat_prediction"] = "boolean"
            X.loc[ind_bool, "type_prediction"] = pd.Series(
              cat_pred[ind_bool], name = "type_prediction")\
              .map(lambda x: ["boolean"]).values

        if len(ind_literal) > 0:
            X.loc[ind_literal, "cat_prediction"] = "literal"
            literal_pred = self.lit_model.predict(X_vec[ind_literal])
            X.loc[ind_literal, "type_prediction"] = pd.Series(
              literal_pred, name = "type_prediction")\
              .map(lambda x: [self.etl_inst.inv_literal_map[x]]).values

        if len(ind_resource) > 0:
            resource_preds = []
            for ind, type_model in enumerate(self.res_models):
                resource_preds.append(
                pd.Series(
                    type_model.predict(X_vec[ind_resource]), name = f"type_{ind}").\
                    map(lambda x: self.etl_inst.inv_type_maps[f"type{ind+1}"][x])
                    )
            resource_preds = pd.Series(pd.concat(resource_preds, axis = 1).values.tolist(), name = "type_prediction")
            X.loc[ind_resource, "type_prediction"] = resource_preds.values
            X.loc[ind_resource, "cat_prediction"] = "resource"
      
        return X

    def output_predictions(self):
        return NotImplementedError

In [23]:
me = ModelEvaluation(etl, clf_category, clf_literal, resource_models)

In [24]:
out_val = me.get_predictions(df_val)

In [25]:
output_format_val = out_val.loc[:, ["id", "cat_prediction", "type_prediction"]].to_dict(orient = "index")

In [26]:
output_format_val = [pred for ind, pred in output_format_val.items()]

In [27]:
output_format_val[:5]

[{'id': 27463,
  'cat_prediction': 'resource',
  'type_prediction': ['dbo:Planet',
   'missing',
   'missing',
   'missing',
   'missing']},
 {'id': 22675,
  'cat_prediction': 'resource',
  'type_prediction': ['dbo:EducationalInstitution',
   'dbo:Organisation',
   'dbo:Agent',
   'dbo:University',
   'missing']},
 {'id': 47013,
  'cat_prediction': 'resource',
  'type_prediction': ['dbo:Organisation',
   'dbo:Country',
   'dbo:Place',
   'dbo:Place',
   'dbo:Location']},
 {'id': 29074,
  'cat_prediction': 'resource',
  'type_prediction': ['dbo:Company',
   'dbo:Organisation',
   'dbo:Agent',
   'missing',
   'missing']},
 {'id': 2251,
  'cat_prediction': 'resource',
  'type_prediction': ['dbo:Country',
   'dbo:PopulatedPlace',
   'dbo:Place',
   'dbo:Place',
   'missing']}]

***Write output to file***

In [28]:
with open('system_output.json', 'w') as outfile:
    json.dump(output_format_val, outfile)