<a href="https://colab.research.google.com/github/chaeyoonyunakim/NLP-IR-QA/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Environment setup

In [1]:
from google.colab import drive # mount google drive
drive.mount('/content/drive') # authorization
%ls -l "/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-rw------- 1 root root 3601280 Feb 23  2021 '/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json'


In [2]:
!nvidia-smi

Thu Sep  9 14:08:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# basics
import pandas as pd
import json
import numpy as np
from numpy import mean
from numpy import std
np.random.seed(20211001)
import time

In [4]:
# nltk imports
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
#set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Algos
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [25]:
# scikit-learn Tools for modelling
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report, accuracy_score

In [7]:
# torch imports
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
from torchvision.utils import make_grid

In [8]:
# Check CPU / GPU environment
if torch.cuda.is_available():
    device = torch.device("cuda:0") # Training on GPU
    print("GPU is available")
else:
    device = torch.device("cpu") # Training on CPU
    print("GPU not available, CPU used")

GPU is available


## 1. Data Loading & Manipulation

***Tabular representation of the training dataset***

In [9]:
df_train = pd.read_json('/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json')
df_test = pd.read_json('/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_test.json')
df_train.head(3)

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]


In [10]:
df_train.shape, df_test.shape

((17571, 4), (4381, 4))

In [11]:
df_test.isnull().sum()

id          0
question    0
category    0
type        0
dtype: int64

In [12]:
df_train.dropna(subset=['id', 'question', 'category'], inplace=True)
df_train.shape

(17528, 4)

In [13]:
text_train = df_train.question.values
text_test = df_test.question.values
text_test

array(['How many ingredients are in the grain} ?',
       'Is the case fatality rate of Fournier gangrene fewer than 9.0?',
       'Does the shelf life of spinach equal 8?', ...,
       'What is the location of Edmonton',
       'In which department does Raymond Baldwin work?',
       'What is Actorenregister ID for Utrecht University?'], dtype=object)

## 1. Preprocessing the data

***Clean the corpus***

In [14]:
# training dataset
vocab_train = []
tokens_train = []

for sent in text_train[:100]: # take sample 9000 to run a small batch
    x = word_tokenize(sent) # tokenization (strip sentences by word)
    sentence = [w.lower() for w in x if w.isalpha()] # lower alphabets (filtering non-string characters and then decapitalization)
 
    for word in sentence:
        if word not in vocab_train:
            vocab_train.append(word) # remove duplicates
            
for word in vocab_train:
    if word not in stopwords.words(): # filter stopwords out
        tokens_train.append(word)

In [15]:
# test dataset
vocab_test = []
tokens_test = []

for sent in text_test:
    x = word_tokenize(sent)
    sentence = [w.lower() for w in x if w.isalpha()]
 
    for word in sentence:
        if word not in vocab_test:
            vocab_test.append(word)
            
for word in vocab_test:
    if word in tokens_train:
        if word not in stopwords.words():
            tokens_test.append(word)

***Text normalization***

In [16]:
wordnet_lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

def normalize_word(w):
   word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
   word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
   word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
   word = ps.stem(word3)
   return word


## Define the Bag of Words model function
def create_bow(word_list):
  ind = 0 
  bow = {}
  for w in word_list:
    _w = normalize_word(w)
    if _w not in bow:
      bow[_w] = ind 
      ind += 1 
  return bow

bow_train = create_bow(tokens_train)
bow_test = create_bow(tokens_test)

***Vectorization***

In [17]:
## Assign an index to the word
label_map = {"boolean": 1, "literal":2, "resource":3}

def map_to_vec(df, bow):
  # add 1 for now just for the category
  # requires additional cols for literals and sub resources for later
  ncols = len(bow) + 1 
  data = np.zeros(shape = (df.shape[0], ncols))
  
  for i in range(df.shape[0]):
    # set the label
    data[i, -1] = label_map[df.iloc[i, 2]]
    # parse the sentence
    que = df.iloc[i, 1]
    for w in word_tokenize(que):
      w = w.lower()
      if w.isalpha():
        # normalize word
        w_norm = normalize_word(w)
        if w_norm in bow:
          # print(f"({i}, {w_norm})")
          data[i, bow[w_norm]] += 1 
  return data

vec_train = map_to_vec(df_train, bow_train)
vec_test = map_to_vec(df_test, bow_train)

## 2. Train & Test classification models

In [18]:
X_train = vec_train[:,:-1]
y_train = vec_train[:,-1]

X_test = vec_test[:,:-1]
y_test = vec_test[:,-1]

In [19]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((17528, 435), (17528,), (4381, 435), (4381,))

***Build a Classification Model 1***

In [20]:
training_time = []
test_time = []
test_acc = []

# define the support vector machine model
clf = svm.SVC(kernel = 'linear', random_state = 0, probability=True)

# fit the model on the whole dataset
t0 = time.time()
clf.fit(X_train, y_train)
t1 = time.time() - t0
training_time.append(t1)

# predict the class label
t0 = time.time()
pred_clf = clf.predict(X_test)
t1 = time.time() - t0
test_time.append(t1)

# classification accuracy
test_acc.append(accuracy_score(y_test, pred_clf))

NameError: ignored

In [21]:
# classification accuracy
test_acc.append(accuracy_score(y_test, pred_clf))

***Build a Classification Model 2***

In [22]:
# define the multinomial logistic regression model
lrc = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# fit the model on the whole dataset
t0 = time.time()
lrc.fit(X_train, y_train)
t1 = time.time() - t0
training_time.append(t1)

# predict the class label
t0 = time.time()
pred_lrc = lrc.predict(X_test)
t1 = time.time() - t0
test_time.append(t1)

# classification accuracy
test_acc.append(accuracy_score(y_test, pred_lrc))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


***Build a Classification Model 3***

In [23]:
# define the multi layer perceptrons model
mlpc = MLPClassifier(hidden_layer_sizes = (11, 11, 11), max_iter = 500)

# fit the model on the whole dataset
t0 = time.time()
mlpc.fit(X_train, y_train)
t1 = time.time() - t0
training_time.append(t1)

# predict the class label
t0 = time.time()
pred_mlpc = mlpc.predict(X_test)
t1 = time.time() - t0
test_time.append(t1)

# classification accuracy
test_acc.append(accuracy_score(y_test, pred_mlpc))

## 3. Results & Evaluation 

In [26]:
# results
print(classification_report(y_test, pred_mlpc))

              precision    recall  f1-score   support

         1.0       0.74      0.51      0.60       688
         2.0       0.82      0.62      0.71      1248
         3.0       0.75      0.91      0.82      2445

    accuracy                           0.77      4381
   macro avg       0.77      0.68      0.71      4381
weighted avg       0.77      0.77      0.76      4381



In [27]:
training_time, test_time, test_acc

([475.8613905906677, 5.536408424377441, 75.04947209358215],
 [17.699882984161377, 0.008712291717529297, 0.01678323745727539],
 [0.7671764437343073, 0.7701438027847524, 0.7655786350148368])

In [29]:
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate the model and collect the scores
n_scores1 = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
n_scores2 = cross_val_score(lrc, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
n_scores3 = cross_val_score(mlpc, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)

In [30]:
# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores1), std(n_scores1)))
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores2), std(n_scores2)))
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores3), std(n_scores3)))

Mean Accuracy: 0.760 (0.015)
Mean Accuracy: 0.766 (0.014)
Mean Accuracy: 0.734 (0.018)
