## 0. Environment setup

In [1]:
# basics
import pandas as pd
import json
import numpy as np
from numpy import mean
from numpy import std
np.random.seed(20211001)
import time

In [2]:
# nltk imports
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
#set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chaey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chaey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chaey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Algos
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [4]:
# scikit-learn Tools for modelling
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report, accuracy_score

## 1. Data Loading & Manipulation

### Tabular representation of the training dataset

In [5]:
df_train = pd.read_json('C:/Users/chaey/INM713-python-main/smart-dataset-master/datasets/DBpedia/smarttask_dbpedia_train.json')
df_test = pd.read_json('C:/Users/chaey/INM713-python-main/smart-dataset-master/datasets/DBpedia/smarttask_dbpedia_test.json')
df_train.head(3)

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]


In [6]:
df_train.shape, df_test.shape

((17571, 4), (4381, 4))

In [7]:
df_train.isnull().sum()

id           0
question    43
category     0
type         0
dtype: int64

In [8]:
df_train.dropna(subset=['id', 'question', 'category'], inplace=True)
df_train.shape

(17528, 4)

In [9]:
df_test.head(10)

Unnamed: 0,id,question,category,type
0,dbpedia_16015,How many ingredients are in the grain} ?,literal,[number]
1,dbpedia_3885,Is the case fatality rate of Fournier gangrene...,boolean,[boolean]
2,dbpedia_12907,Does the shelf life of spinach equal 8?,boolean,[boolean]
3,dbpedia_7955,What sound does a pig make in the French langu...,literal,[string]
4,dbpedia_2376,When was Fergie completed his record label in ...,literal,[date]
5,dbpedia_4197,Which are the coordinates of easternmost point...,literal,[string]
6,dbpedia_22599,Where did the war take place where one of the ...,resource,"[dbo:Country, dbo:PopulatedPlace, dbo:Place]"
7,dbpedia_5469,Was Michael Bloomberg a residence in Medford a...,boolean,[boolean]
8,dbpedia_687,How many platforms does Tomb Raider have?,literal,[number]
9,dbpedia_19677,Which mountains are contained in Inyo National...,resource,"[dbo:Mountain, dbo:NaturalPlace, dbo:Place]"


In [10]:
df_test.iloc[6, 3][:]

['dbo:Country', 'dbo:PopulatedPlace', 'dbo:Place']

## 1. Preprocessing the data

***Question Parsing***

In [11]:
text_train = df_train.question.values
text_test = df_test.question.values
text_test

array(['How many ingredients are in the grain} ?',
       'Is the case fatality rate of Fournier gangrene fewer than 9.0?',
       'Does the shelf life of spinach equal 8?', ...,
       'What is the location of Edmonton',
       'In which department does Raymond Baldwin work?',
       'What is Actorenregister ID for Utrecht University?'], dtype=object)

***Clean the corpus***

In [None]:
# training dataset
vocab_train = []
tokens_train = []

for sent in text_train:
    x = word_tokenize(sent) # tokenization (strip sentences by word)
    sentence = [w.lower() for w in x if w.isalpha()] # lower alphabets (filtering non-string characters and then decapitalization)
 
    for word in sentence:
        if word not in vocab_train:
            vocab_train.append(word) # remove duplicates
            
for word in vocab_train:
    if word not in stopwords.words(): # filter stopwords out
        tokens_train.append(word)

In [None]:
# test dataset
vocab_test = []
tokens_test = []

for sent in text_test:
    x = word_tokenize(sent)
    sentence = [w.lower() for w in x if w.isalpha()]
 
    for word in sentence:
        if word not in vocab_test:
            vocab_test.append(word)
            
for word in vocab_test:
    if word in tokens_train:
        if word not in stopwords.words():
            tokens_test.append(word)

***Text normalization***

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

def normalize_word(w):
    word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
    word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
    word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
    word = ps.stem(word3)
    return word


## Define the Bag of Words model function
def create_bow(word_list):
    ind = 0 
    bow = {}
    for w in word_list:
        _w = normalize_word(w)
        if _w not in bow:
            bow[_w] = ind 
            ind += 1 
    return bow

bow_train = create_bow(tokens_train)
bow_test = create_bow(tokens_test)

***Vectorization***

In [None]:
## Assign an index to the word
label_map = {"boolean": 1, "literal":2, "resource":3}

def map_to_vec(df, bow):
    # add 1 for now just for the category
    # requires additional cols for literals and sub resources for later
    ncols = len(bow) + 1 
    data = np.zeros(shape = (df.shape[0], ncols))
  
    for i in range(df.shape[0]):
        # set the label
        data[i, -1] = label_map[df.iloc[i, 2]]
        # parse the sentence
        que = df.iloc[i, 1]
        for w in word_tokenize(que):
            w = w.lower()
            if w.isalpha():
                # normalize word
                w_norm = normalize_word(w)
                if w_norm in bow:
                    # print(f"({i}, {w_norm})")
                    data[i, bow[w_norm]] += 1 
    return data

vec_train = map_to_vec(df_train, bow_train)
vec_test = map_to_vec(df_test, bow_train)

In [None]:
# Index dictionary to assign an index to each type in dataset

def sub_to_vec(df, vec):
    target = np.zeros(shape = (df.shape[0], 1))
    
    for i in range(df.shape[0]):
        if vec[i,-1] <= 2:
            # set the label
            type_map = {"boolean": 1, "date":2, "number":3, "string":4}
            target[i, -1] = type_map[df.iloc[i, 3][0]]
    return target

y_train = sub_to_vec(df_train, vec_train)
y_test = sub_to_vec(df_test, vec_test)

## 2. Train & Test classification models

In [None]:
X_train = vec_train[:,:-1]
y_train = y_train

X_test = vec_test[:,:-1]
y_test = y_test

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

***Build a Classification Model 1***

In [110]:
training_time = []
test_time = []
test_acc = []

# define the support vector machine model
clf = svm.SVC(kernel = 'linear', random_state = 0, probability=True)

# fit the model on the whole dataset
t0 = time.time()
clf.fit(X_train, y_train)
t1 = time.time() - t0
training_time.append(t1)

# predict the class label
t0 = time.time()
pred_clf = clf.predict(X_test)
t1 = time.time() - t0
test_time.append(t1)

# classification accuracy
test_acc.append(accuracy_score(y_test, pred_clf))

  return f(**kwargs)


***Build a Classification Model 2***

In [111]:
# define the multinomial logistic regression model
lrc = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# fit the model on the whole dataset
t0 = time.time()
lrc.fit(X_train, y_train)
t1 = time.time() - t0
training_time.append(t1)

# predict the class label
t0 = time.time()
pred_lrc = lrc.predict(X_test)
t1 = time.time() - t0
test_time.append(t1)

# classification accuracy
test_acc.append(accuracy_score(y_test, pred_lrc))

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


***Build a Classification Model 3***

In [112]:
# define the multi layer perceptrons model
mlpc = MLPClassifier(hidden_layer_sizes = (11, 11, 11), max_iter = 500)

# fit the model on the whole dataset
t0 = time.time()
mlpc.fit(X_train, y_train)
t1 = time.time() - t0
training_time.append(t1)

# predict the class label
t0 = time.time()
pred_mlpc = mlpc.predict(X_test)
t1 = time.time() - t0
test_time.append(t1)

# classification accuracy
test_acc.append(accuracy_score(y_test, pred_mlpc))

  return f(**kwargs)


## 3. Results & Evaluation

In [113]:
# results
print(classification_report(y_test, pred_clf))

              precision    recall  f1-score   support

         0.0       0.89      0.94      0.91      2445
         1.0       0.86      0.86      0.86       688
         2.0       0.77      0.70      0.73       316
         3.0       0.89      0.78      0.83       407
         4.0       0.93      0.80      0.86       525

    accuracy                           0.88      4381
   macro avg       0.87      0.82      0.84      4381
weighted avg       0.88      0.88      0.88      4381



In [114]:
print(classification_report(y_test, pred_lrc))

              precision    recall  f1-score   support

         0.0       0.89      0.95      0.92      2445
         1.0       0.85      0.89      0.87       688
         2.0       0.81      0.72      0.76       316
         3.0       0.92      0.75      0.83       407
         4.0       0.96      0.78      0.86       525

    accuracy                           0.89      4381
   macro avg       0.89      0.82      0.85      4381
weighted avg       0.89      0.89      0.89      4381



In [115]:
print(classification_report(y_test, pred_mlpc))

              precision    recall  f1-score   support

         0.0       0.86      0.91      0.89      2445
         1.0       0.83      0.75      0.79       688
         2.0       0.64      0.59      0.61       316
         3.0       0.76      0.74      0.75       407
         4.0       0.84      0.78      0.81       525

    accuracy                           0.83      4381
   macro avg       0.79      0.76      0.77      4381
weighted avg       0.83      0.83      0.83      4381



In [116]:
training_time, test_time, test_acc

([14734.663571357727, 73.42798662185669, 178.94628405570984],
 [743.3649921417236, 1.3010668754577637, 0.7436397075653076],
 [0.8801643460397169, 0.8886099064140607, 0.8326866012325953])

In [117]:
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate the model and collect the scores
n_scores1 = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
n_scores2 = cross_val_score(lrc, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
n_scores3 = cross_val_score(mlpc, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)

MemoryError: Unable to allocate 498. MiB for an array with shape (3943, 16546) and data type float64

In [None]:
# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores1), std(n_scores1)))
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores2), std(n_scores2)))
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores3), std(n_scores3)))