<a href="https://colab.research.google.com/github/chaeyoonyunakim/NLP-IR-QA/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Environment setup

In [1]:
from google.colab import drive # mount google drive
drive.mount('/content/drive') # authorization
%ls -l "/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-rw------- 1 root root 3601280 Feb 23  2021 '/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json'


In [2]:
!nvidia-smi

Sat Sep  4 13:16:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8    27W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import pandas as pd
import json
import numpy as np
from numpy import mean
from numpy import std
import time

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
#set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support

## 1. Data Loading & Manipulation

***Tabular representation of the training dataset***

In [6]:
df_train = pd.read_json('/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json')
df_test = pd.read_json('/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_test.json')
df_train.head(3)

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]


In [7]:
df_train.shape, df_test.shape

((17571, 4), (4381, 4))

In [8]:
df_test.isnull().sum()

id          0
question    0
category    0
type        0
dtype: int64

In [9]:
df_train.dropna(subset=['id', 'question', 'category'], inplace=True)
df_train.shape

(17528, 4)

In [10]:
text_train = df_train.question.values
text_test = df_test.question.values
text_test

array(['How many ingredients are in the grain} ?',
       'Is the case fatality rate of Fournier gangrene fewer than 9.0?',
       'Does the shelf life of spinach equal 8?', ...,
       'What is the location of Edmonton',
       'In which department does Raymond Baldwin work?',
       'What is Actorenregister ID for Utrecht University?'], dtype=object)

## 1. Preprocessing the data

***Clean the corpus***

In [11]:
# training dataset
sentences_train = []
vocab_train = []
tokens_train = []

for sent in text_train[:5000]: # take sample 5000 to run a small batch
    x = word_tokenize(sent) # tokenization (strip sentences by word)
    sentence = [w.lower() for w in x if w.isalpha()] # lower alphabets (filtering non-string characters and then decapitalization)
    sentences_train.append(sentence)
 
    for word in sentence:
        if word not in vocab_train:
            vocab_train.append(word) # remove duplicates
            
for word in vocab_train:
    if word not in stopwords.words(): # filter stopwords out
        tokens_train.append(word)

In [12]:
# test dataset
sentences_test = []
vocab_test = []
tokens_test = []

for sent in text_test:
    x = word_tokenize(sent)
    sentence = [w.lower() for w in x if w.isalpha()]
    sentences_test.append(sentence)
 
    for word in sentence:
        if word not in vocab_test:
            vocab_test.append(word)
            
for word in vocab_test:
    if word not in stopwords.words():
        tokens_test.append(word)

In [13]:
tokens_train[:5]

['jacqueline', 'kennedy', 'onassis', 'follower', 'melkite']

***Text normalization***

In [14]:
wordnet_lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

def normalize_word(w):
   word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
   word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
   word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
   word = ps.stem(word3)
   return word


## Define the Bag of Words model function
def create_bow(word_list):
  ind = 0 
  bow = {}
  for w in word_list:
    _w = normalize_word(w)
    if _w not in bow:
      bow[_w] = ind 
      ind += 1 
  return bow

bow_train = create_bow(tokens_train)
bow_test = create_bow(tokens_test)

***Vectorization***

In [15]:
## Assign an index to the word
label_map = {"boolean": 1, "literal":2, "resource":3}

def map_to_vec(df, bow):
  # add 1 for now just for the category
  # requires additional cols for literals and sub resources for later
  ncols = len(bow) + 1 
  data = np.zeros(shape = (df.shape[0], ncols ))
  
  for i in range(df.shape[0]):
    # set the label
    data[i, -1] = label_map[df.iloc[i, 2]]
    # parse the sentence
    que = df.iloc[i, 1]
    for w in word_tokenize(que):
      w = w.lower()
      if w.isalpha():
        # normalize word
        w_norm = normalize_word(w)
        if w_norm in bow:
          # print(f"({i}, {w_norm})")
          data[i, bow[w_norm]] += 1 
  return data

X_train = map_to_vec(df_train, bow_train)
X_test = map_to_vec(df_test, bow_test)

In [16]:
X_test

array([[1., 1., 1., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 1., 1., 2.]])

## 2. Train a classification model

In [17]:
X1 = X_train[:,:-1]
y1 = X_train[:,-1]

X2 = X_test[:,:-1]
y2 = X_test[:,-1]

In [21]:
X2.shape, y2.shape

((4381, 7078), (4381,))

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X1, y1, test_size=0.33, random_state=42)

***Build a Classifier 1***

In [19]:
classifier1 = SVC(kernel = 'linear', random_state = 0)
model1 = classifier1.fit(X_train, y_train)
y_hat = model1.predict(X_val)
precision_recall_fscore_support(y_val, y_hat, average='macro')

(0.8603667791385817, 0.8711753659616533, 0.8638245824194781, None)

***Build a Classifier 2***

In [24]:
# define the multinomial logistic regression model
classifier2 = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# fit the model on the whole dataset
model2 = classifier2.fit(X_train, y_train)

# predict the class label
y_hat = model2.predict(X_val)

precision_recall_fscore_support(y_val, y_hat, average='macro')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.8732083848898715, 0.8742628808802082, 0.8717894933007084, None)

## 3. Results & Evaluation 

In [26]:
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate the model and collect the scores
n_score1 = cross_val_score(model1, X2, y2, scoring='accuracy', cv=cv, n_jobs=-1)
n_score2 = cross_val_score(model2, X2, y2, scoring='accuracy', cv=cv, n_jobs=-1)

In [27]:
# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_score1), std(n_score1)))
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_score2), std(n_score2)))

Mean Accuracy: 0.804 (0.014)
Mean Accuracy: 0.817 (0.011)
