<a href="https://colab.research.google.com/github/chaeyoonyunakim/NLP-IR-QA/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem so the dataset stored there
# can be read by path.
from google.colab import drive

# Prompts for authorization when needed; the mount point is /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Sanity check: list the training file on the mounted Drive.
%ls -l "/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json"

-rw------- 1 root root 3601280 Feb 23  2021 '/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json'


In [3]:
import pandas as pd
import json
import numpy as np
from numpy import mean
from numpy import std

In [4]:
from nltk.tokenize import word_tokenize
from collections import defaultdict

In [5]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: the stop-word lists and the 'punkt'
# tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')
# For reference: set(stopwords.words('english')) gives the English-only stop words.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus for the WordNetLemmatizer imported above.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## Tabular representation of the training dataset

In [8]:
# Read the training questions from the mounted Drive and preview the first rows.
train_path = '/content/drive/My Drive/2021_INM363_SMART/smarttask_dbpedia_train.json'
df = pd.read_json(train_path)
df.head(3)

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]


In [9]:
# (rows, columns) of the training frame as loaded, before any cleaning
df.shape

(17571, 4)

In [10]:
# Number of missing values in each column
df.isnull().sum()

id           0
question    43
category     0
type         0
dtype: int64

In [11]:
# Discard rows that lack an id or a question, then re-check the frame size.
df = df.dropna(subset=['id', 'question'])
df.shape

(17528, 4)

In [12]:
# Raw question strings, one entry per remaining row
text = df['question'].values

## 1. Preprocessing the data

In [13]:
# clean the corpus
# sentences: one list of lower-cased, purely alphabetic tokens per question
# vocab:     every distinct token, in order of first appearance
# tokens:    vocab with stop words removed (same order)
sentences = []
vocab = []

# Set of words already placed in `vocab`; gives O(1) membership checks while
# `vocab` itself preserves first-seen order.
seen_words = set()

for sent in text:
    sentence = [w.lower() for w in word_tokenize(sent) if w.isalpha()]
    sentences.append(sentence)

    for word in sentence:
        if word not in seen_words:
            seen_words.add(word)
            vocab.append(word)

# Build the stop-word set once, outside the filtering step, so the corpus
# reader is not invoked for every vocabulary word.
# NOTE(review): stopwords.words() with no argument returns the stop words of
# every language shipped with the NLTK corpus, not only English. Kept as-is to
# preserve the resulting vocabulary; confirm whether
# stopwords.words('english') was intended.
stop_words = set(stopwords.words())
tokens = [word for word in vocab if word not in stop_words]

In [14]:
wordnet_lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

def normalize_word(w):
  """Return the normalised form of the word `w`.

  The word is lemmatised three times in sequence, as a noun ("n"), then as a
  verb ("v"), then as an adjective ("a"), each pass working on the output of
  the previous one. The final lemma is reduced with the Porter stemmer.
  """
  lemma = w
  for pos_tag in ("n", "v", "a"):
    lemma = wordnet_lemmatizer.lemmatize(lemma, pos=pos_tag)
  return ps.stem(lemma)


## Define the Bag of Words model function
def create_bow(word_list):
  """Map each distinct normalised word to a column index.

  Words are normalised with normalize_word; indices start at 0 and are handed
  out in order of first appearance, so several input words that normalise to
  the same form share a single index.

  Returns a dict {normalised_word: index}.
  """
  bow = {}
  for raw_word in word_list:
    key = normalize_word(raw_word)
    # An already-known key keeps its index; a new key receives the next free
    # index, which always equals the current size of the dict.
    bow.setdefault(key, len(bow))
  return bow
bow = create_bow(tokens)


## Numeric class label for each question category
label_map = {"boolean": 1, "resource":2, "literal":3}

def map_to_vec(df, bow):
  """Turn the questions in `df` into a bag-of-words count matrix.

  The question text is read from the second column of `df` (position 1) and
  the category from the third column (position 2).

  Returns a float array with one row per row of `df` and len(bow) + 1
  columns: column bow[word] holds how often that normalised word occurs in
  the question, and the last column holds the category label from label_map.
  Only the top-level category is encoded; literal types and resource
  sub-types would need additional columns.
  """
  n_rows = df.shape[0]
  data = np.zeros(shape=(n_rows, len(bow) + 1))

  questions = df.iloc[:, 1]
  categories = df.iloc[:, 2]
  for row, (que, category) in enumerate(zip(questions, categories)):
    # class label goes in the trailing column
    data[row, -1] = label_map[category]
    # count every alphabetic token whose normalised form is in the vocabulary
    for token in word_tokenize(que):
      token = token.lower()
      if not token.isalpha():
        continue
      col = bow.get(normalize_word(token))
      if col is not None:
        data[row, col] += 1
  return data

data = map_to_vec(df, bow)

In [15]:
# Preview the matrix: word-count columns followed by the label in the last column
data

array([[1., 1., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 3.],
       ...,
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 1., 2.],
       [0., 0., 0., ..., 0., 0., 3.]])

## 2. Train a classification model

In [16]:
# Every column but the last holds word counts (features); the last holds the label.
X, y = data[:, :-1], data[:, -1]

In [17]:
# Hold out 33% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

***Build a Classifier***

In [18]:
# define the multinomial logistic regression model
# With the lbfgs solver and more than two classes, LogisticRegression fits a
# multinomial model by default, so the multi_class argument is not passed:
# it is deprecated since scikit-learn 1.5 and scheduled for removal.
model = LogisticRegression(solver='lbfgs')

# define the model evaluation procedure: stratified 10-fold cross-validation,
# repeated 3 times with a fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate the model and collect the accuracy of every fold (n_jobs=-1 uses all cores)
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

In [19]:
# report the model performance: mean and standard deviation of the fold accuracies
mean_accuracy = mean(n_scores)
accuracy_std = std(n_scores)
print('Mean Accuracy: %.3f (%.3f)' % (mean_accuracy, accuracy_std))

Mean Accuracy: 0.880 (0.008)
