# Answer Evaluation

## Import the modules

In [89]:
import os
import numpy as np
import re
from importlib import reload
import logging
from logging.config import dictConfig
from sklearn.model_selection import train_test_split
from nlp_utils import read_dataset
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from nltk import download
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

## Define global variables/settings

In [98]:
# Configure logging: every record goes both to a rotating file and to the stream handler.
reload(logging)  # NOTE(review): presumably resets logging state left by an earlier run of this cell - confirm

_rotating_file_handler = {
    'class': 'logging.handlers.RotatingFileHandler',
    'filename': 'nlp_answer_evaluation.log',
    'formatter': 'default',
    'maxBytes': 1024 * 1024 * 20,  # 20 MiB per log file
    'backupCount': 3,
}
_stream_handler = {
    'class': 'logging.StreamHandler',
    'formatter': 'default',
}
dictConfig({
    'version': 1,
    'formatters': {
        'default': {'format': '[%(asctime)s] %(levelname)s: %(message)s'},
    },
    'handlers': {
        'log_file': _rotating_file_handler,
        'log_stream': _stream_handler,
    },
    'root': {
        'level': 'INFO',
        'handlers': ['log_file', 'log_stream'],
    },
})


# Reproducibility and NLP helpers
np.random.seed(1)  # Fixed seed for numpy's random generator
stemmer = LancasterStemmer()  # Stemmer shared by the preprocessing step
download('stopwords')  # Fetch the NLTK stop word corpus
stop_words = set(stopwords.words('english'))  # English stop words

# Locations and caching behaviour
dataset_dir = 'dataset'  # Directory the datasets are read from
models_dir = 'models'  # Directory where trained models are saved/loaded
vectorizer_dir = 'vectorizer'  # Directory where fitted vectorizers are saved/loaded
file_cache = False  # When True, questions that already have a saved model are skipped

## Define common functions

In [99]:
def save_object(directory, filename, data):
    '''
    Saves an object to file using joblib.

    The target directory (including missing parents) is created first when
    it does not exist yet.

    Args:
        directory (str): Path to the directory. An empty string means the
            file is written relative to the current working directory.
        filename (str): Name of file.
        data : Object/data to be saved.
    '''
    file_to_save = os.path.join(directory, filename)
    if directory:
        # exist_ok=True replaces the former exists()-then-makedirs() pair, which
        # could fail if the directory appeared between the check and the create.
        # The guard skips makedirs('') because that call raises on an empty path.
        os.makedirs(directory, exist_ok=True)
    joblib.dump(data, file_to_save)
    logging.debug('Saved data to file %s', file_to_save)

def load_object(directory, filename):
    '''
    Loads a previously saved object from file using joblib.

    Args:
        directory (str): Path to the directory.
        filename (str): Name of file.

    Returns:
        data : Object/data that's loaded from given file.
    '''
    source_path = os.path.join(directory, filename)
    # NOTE(review): joblib persistence is pickle-based, so only files produced
    # by this project (or another trusted source) should be loaded here.
    loaded = joblib.load(source_path)
    logging.debug('Loaded file {}'.format(source_path))
    return loaded

## Load Data

In [100]:
try:
    datasets = read_dataset(dataset_dir)  # Read the datasets from directory
except Exception:
    # Nothing below can run without the datasets, so the error still propagates;
    # it is recorded (with traceback) in the configured log handlers first.
    logging.exception('Unable to read datasets from directory {}'.format(dataset_dir))
    raise

## Preprocess Data

In [101]:
def preprocess_dataset(sentence_list):
    '''
    Normalizes and stems every sentence of a text dataset.

    Each sentence is lowercased, its non-word characters are replaced by
    spaces, whitespace runs are collapsed, and every remaining token is
    stemmed with the module-level stemmer.

    Args:
        sentence_list (list): List of the sentences.

    Returns:
        stemmed_sentences (list): Sentences after stemming the words.
    '''
    def stem_sentence(text):
        # Lowercase, then turn punctuation/other non-word characters into spaces
        text = re.sub(r'\W', ' ', text.lower())
        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text, flags=re.I)
        # Stem token by token and glue the stems back into one string
        return ' '.join(stemmer.stem(token) for token in text.split())

    return [stem_sentence(sentence) for sentence in sentence_list]

## Transform data (vectorize)

In [102]:
def build_sentence_vectors(question_id, sentence_list):
    '''
    Creates a vector representation of the sentences using bag of words approach.

    A binary CountVectorizer is fitted on the sentences and saved under
    the vectorizer directory with the question ID as file name. Saving is best
    effort: a failure is logged and the vectors are still returned.

    Args:
        question_id (str): ID of the Question
        sentence_list (list): List of the sentences.

    Returns:
        sentence_vector_list: Document-term matrix returned by
            CountVectorizer.transform, one row per sentence.
    '''
    # Pass the stop words as a list: scikit-learn documents `stop_words` as
    # 'english', a list or None, and recent releases reject a set. Sorting keeps
    # the persisted vectorizer identical between runs.
    # NOTE(review): train() stems the sentences before calling this, while
    # stop_words holds unstemmed words - confirm the filter matches as intended.
    count_vector = CountVectorizer(binary=True, stop_words=sorted(stop_words))
    count_vector.fit(sentence_list)  # Learn the vocabulary from the sentences
    try:
        save_object(vectorizer_dir, question_id, count_vector)  # Save the vectorizer to file
    except Exception:
        # Keep going without the saved file, but record the traceback so the
        # cause of the failed save is not lost.
        logging.exception('Unable to save vectorizer to file for question {}'.format(question_id))
    return count_vector.transform(sentence_list)  # Generate vectors for sentences

## Train the model

In [103]:
def train(dataset, test_dataset_size=0.2):
    '''
    Trains a linear SVM classifier on one question's dataset.

    The reference answer is added to the student answers as a sample of class
    'correct'; all answers are preprocessed, vectorized, split into train and
    test parts, and the classifier is scored on the held-out part.

    Args:
        dataset (dict): The dataset to process.
        test_dataset_size (float): Size of test dataset while splitting (0 to 1).

    Returns:
        answer_classifier (model): The model that's trained using dataset (using SVM classifier).
        score (float): Accuracy score for the model (0 to 1).
    '''
    # Inputs: the reference answer first, followed by every provided answer
    answers = [dataset['reference_answer']['text'], *dataset['answers']['sentences']]
    # Targets: the reference answer is 'correct', the rest keep their given class
    output_data = ['correct', *dataset['answers']['classes']]

    cleaned_answers = preprocess_dataset(answers)
    question_id = dataset['question']['id']
    answers_matrix = build_sentence_vectors(question_id, cleaned_answers)

    # Hold out part of the data for scoring; random_state is fixed for a repeatable split
    x_train, x_test, y_train, y_test = train_test_split(
        answers_matrix,
        output_data,
        test_size=test_dataset_size,
        random_state=0,
    )

    answer_classifier = SVC(kernel='linear')
    answer_classifier.fit(x_train, y_train)
    score = accuracy_score(y_test, answer_classifier.predict(x_test))
    logging.debug('Score for dataset with question id {} is {}'.format(dataset['question']['id'], score))
    return answer_classifier, score

## Generate model for datasets

In [104]:
# Train and save one model per dataset, collecting each model's accuracy
accuracy_list = []
for dataset in datasets:
    dataset_name = dataset['question']['id']  # The question ID doubles as the model file name
    model_file = os.path.join(models_dir, dataset_name)
    # Train unless caching is enabled and a saved model already exists
    if not (file_cache and os.path.isfile(model_file)):
        model, score = train(dataset)
        save_object(models_dir, dataset_name, model)
        accuracy_list.append(score)

# Report the mean accuracy (as a percentage) of the models trained in this run
if accuracy_list:
    summary = "The average accuracy over all datasets trained now is {}".format(np.mean(accuracy_list)*100)
else:
    summary = "No new dataset to process! Please set file_cache to False to reprocess the datasets."
logging.info(summary)

[2019-07-09 12:58:07,834] INFO: The average accuracy over all datasets trained now is 82.51851851851852


## Classify new data

In [None]:
def classify(question_id, answer):
    '''
    Classify an answer as correct or incorrect, given its question ID.

    The model and the vectorizer saved for the question are loaded, the answer
    is preprocessed the same way as the training data and then classified.

    Args:
        question_id (str): The ID of the question for which answer is provided.
        answer (str): Answer to be classified.

    Returns:
        predicted_class (str): Predicted class of the answer (correct or incorrect).
        None : If the model/vectorizer file is missing or any error occurred
            (the reason is written to the log).
    '''
    model_to_load = os.path.join(models_dir, question_id)
    vectorizer_to_load = os.path.join(vectorizer_dir, question_id)
    if not (os.path.isfile(model_to_load) and os.path.isfile(vectorizer_to_load)):
        logging.warning('No saved model/vectorizer found for question {}'.format(question_id))
        return None
    # NOTE(review): question_id may come straight from user input and is used
    # to build the path of a file that joblib unpickles - make sure only
    # trusted IDs/files can reach this point.
    try:
        model = load_object(models_dir, question_id)
        cleaned_answer = preprocess_dataset([answer])
        vectorizer = load_object(vectorizer_dir, question_id)
        answer_vector = vectorizer.transform(cleaned_answer)
        return model.predict(answer_vector)[0]
    except Exception:
        # Callers still get None, but the traceback is logged instead of being
        # discarded, and KeyboardInterrupt/SystemExit are no longer swallowed.
        logging.exception('Unable to classify answer for question {}'.format(question_id))
        return None

# Ask for a question ID and an answer, then report the predicted class
question_id = input('Please enter Question ID :').lower()  # The entered ID is lowercased before lookup
answer = input('Please enter answer : ')
answer_class = classify(question_id, answer)
if answer_class is not None:
    print("The answer is predicted as : {}".format(answer_class))
else:
    # classify() returns None when it could not produce a prediction
    print("Unable to load model/vectorizer data. Please check if the question ID is valid.")