In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
import yaml

In [2]:
def read_yaml(yaml_file):
    '''read yaml file 
    inpput - config file name and path'''
    data_dict = None
    with open(yaml_file) as f:
        data_dict = yaml.safe_load(f)
    return data_dict

In [3]:
# read the data config file
yaml_file_name = 'data_config.yaml'
data_dict = read_yaml(yaml_file_name)

In [4]:
# save the config params for data
test_percent = int(data_dict['test_percent'])
author_data_choice = data_dict['author_data_choice']
data_augmentation = data_dict['data_augmentation']

In [5]:
# print the config received for data
print(" ====== data config ======= ")
print("test percent : ", test_percent , type(test_percent))
print("author_data_choice : ", author_data_choice, type(author_data_choice))
print("data_augmentation : ", data_augmentation, type(data_augmentation))

test percent :  30 <class 'int'>
author_data_choice :  True <class 'bool'>
data_augmentation :  True <class 'bool'>


In [6]:
# read the SVM config file
yaml_file_name = 'SVM_classifier_config.yaml'
data_dict = read_yaml(yaml_file_name)


In [7]:
# save the config params for SVM
kernel_name_SVM = data_dict['kernel_name_SVM']
balanced_SVM = data_dict['balanced_SVM']


In [8]:
# print the config received for SVM
print("======== SVM Classifier config ==========")
print("kernel_name_SVM : ", kernel_name_SVM, type(kernel_name_SVM))
print("balanced_SVM : ", balanced_SVM, type(balanced_SVM))

kernel_name_SVM :  sigmoid <class 'str'>
balanced_SVM :  False <class 'bool'>


In [9]:
# read the MLP config file
yaml_file_name = 'MLP_classifier_config.yaml'
data_dict = read_yaml(yaml_file_name)

In [10]:
# save the config params for MLP, as number of neruons in layer mentioned as (,,) we need to convert them to list to give it to further MLP function
hidden_layer_sizes_MLP = []
layer_sizes = data_dict['hidden_layer_sizes_MLP'].split(",") # split on ","
for i in range(len(layer_sizes)):
    size = layer_sizes[i]
    if i ==0:
        hidden_layer_sizes_MLP.append(int(size[1:])) # for first remove ( in (no_neuron and append to list
    elif i == len(layer_sizes)-1:
        hidden_layer_sizes_MLP.append(int(size[:-1])) # for last remove ) in no_neuron) and append to list
    else:
         hidden_layer_sizes_MLP.append(int(size))  #  append no_neuron to list
hidden_layer_sizes_MLP = tuple(hidden_layer_sizes_MLP)

In [11]:
# save the config hyperparams for MLP
learning_rate_init_MLP=data_dict['learning_rate_init_MLP']
max_iter_MLP=data_dict['max_iter_MLP']


In [12]:
# print the config received for MLP
print("========= MLP classifier config =========== ")
print("hidden_layer_sizes_MLP : " , hidden_layer_sizes_MLP, type(hidden_layer_sizes_MLP))
print("learning_rate_init_MLP : ", learning_rate_init_MLP, type(learning_rate_init_MLP))
print("max_iter_MLP : ", max_iter_MLP, type(max_iter_MLP))

hidden_layer_sizes_MLP :  (64, 32) <class 'tuple'>
learning_rate_init_MLP :  0.01 <class 'float'>
max_iter_MLP :  100 <class 'int'>


In [13]:
# read yaml config file for test 
yaml_file_name = 'test_config.yaml'
data_dict = read_yaml(yaml_file_name)

In [14]:
# save the config params for test
row_from_author_data = data_dict['row_from_author_data']
test_question = data_dict['test_question']


In [15]:
# print the params for test 
print("===== test config ========" )
print("row_from_author_data : ", row_from_author_data , type(row_from_author_data))
print("test_question : ", test_question, type(test_question))

row_from_author_data :  [5, 15, 150] <class 'list'>
test_question :  how bad is covid pandemic <class 'str'>


In [16]:
def SVM_classifier(kernel_name, class_balanced = False):
    '''invokes appropriate SVM classfier
    input - kernel to use, whether to balance class'''
    if class_balanced == True:
        # SVC with particular kernal and class balance = True
        SVC1 = SVC(kernel=kernel_name, class_weight='balanced')
        return SVC1
    else:
        # SVC with particular kernal and class balance = False(default)
        SVC1 = SVC(kernel=kernel_name)
        return SVC1

In [17]:
def get_accuracy_score(X,y, classifier='SVC'):
    '''get accuracy for mentioned classifier 
    input - input, correct output, classifier (default - SVC) - SVC or MLP'''
    if (classifier=='SVC'):
        # predict using SVC and input
        predict = svcclassifier.predict(X)
        # return accuracy score
        return accuracy_score(y, predict)
    if (classifier=='MLP'):
         # predict using MLP and input
        predict = mlpclassifier.predict(X)
        # return accuracy score
        return accuracy_score(y, predict)
        

In [18]:
def test_on_data(sample, classifier='SVC'):
    '''return prediction for test question using classifier 
    input - test sample question, classifier'''
    if (classifier=='SVC'):
        # fit count vectoriser and tf-idf vectorizer to input sentence
        test_ex_count = count_vect.transform([sample])
        test_ex_tfidf = tfidf_transformer.transform(test_ex_count)
        # return the prediction of class using SVC
        return svcclassifier.predict(test_ex_tfidf)[0]
    if (classifier=='MLP'):
        # fit count vectoriser and tf-idf vectorizer to input sentence
        test_ex_count = count_vect.transform([sample])
        test_ex_tfidf = tfidf_transformer.transform(test_ex_count)
        # return the prediction of class using MLP
        return svcclassifier.predict(test_ex_tfidf)[0]

In [19]:
def get_trained_data(augmented=False):
    '''get data for training
    input - augmented param as boolean'''
    if (augmented==True):
        # read augmented trained data if augmented = True 
        df_train = pd.read_csv("data_generated/train_data_augmented.csv")
        print("train data shape : ", df_train.shape)
        return df_train
    else:
         # read original non-augmented trained data if augmented = False 
        df_train = pd.read_csv("data_generated/train_data.csv")
        print("train data shape : ", df_train.shape)
        return df_train

In [20]:
def get_test_data(author=True):
    '''get test data, source generated to author generated
    input - author generated boolean'''
    if (author==True):
        # if author = True get both test data and author generated test data
        df_test = pd.read_csv("./data_generated/test_data.csv")
        df_test_author = pd.read_csv("./data_generated/test_data_author_generated.csv")
        return df_test, df_test_author
    else:
        # if author = False get only test data
        df_test = pd.read_csv("./data_generated/test_data.csv")
        return df_test, None

In [21]:
def print_results_on_data(row_from_author_data, classifier='SVC'):
    '''print the predictions for data 
    input - row numbers from author generated data, classifier to use'''
    for i in row_from_author_data:
        # if row more than number of rows in author generated data set it max row number
        if (i > 248):
            i = 248
        # if author choice true , get data for that row number and predict using given classifier, default, SVC
        if author_data_choice == True:
            sample = df_test_author['Question'].iloc[i]
            print("Question : ", df_test_author['Question'].iloc[i])
            print("predicted : ", test_on_data(sample , classifier))
            print("correct : " , df_test_author['Category'].iloc[i])
        

## tf-idf feature generation 

In [22]:
# get trained data
df_train = get_trained_data(augmented=data_augmentation)

train data shape :  (2887, 3)


In [23]:
# count vectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df_train['Question'])
# X_train_counts.shape

In [24]:
# tf-idf transform on count vectorizer data
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape

In [25]:
y_train = df_train['Category']
# y_train
# df_train

In [26]:
print("feature shapes for train data : ", X_train_tfidf.shape, y_train.shape)

feature shapes for train data :  (2887, 3840) (2887,)


In [27]:
# get test data, print shapes according to choice of test dsata - author test data or only test data
df_test, df_test_author = get_test_data(author_data_choice)
print("test data shape : ", df_test.shape)
if author_data_choice == True:
    print("author test data shape : ", df_test_author.shape)


test data shape :  (691, 2)
author test data shape :  (249, 2)


In [28]:
# saved count vectorizer and tf-idf transform on test data
test_count = count_vect.transform(df_test['Question'])
X_test_tfidf = tfidf_transformer.transform(test_count)
y_test = df_test['Category']

In [29]:
print("feature shapes for test data : ", X_test_tfidf.shape, y_test.shape)

feature shapes for test data :  (691, 3840) (691,)


In [30]:
# saved count vectorizer and tf-idf transform on author test data if we are using it
X_test_author_tfidf = None
y_test_author = None
if (author_data_choice == True):
    df_test_author['Question'] = df_test_author['Question'].astype(str)
    test_author_count = count_vect.transform(df_test_author['Question'])
    X_test_author_tfidf = tfidf_transformer.transform(test_author_count)
    y_test_author = df_test_author['Category']
    print("feature shapes for author test data : ", X_test_author_tfidf.shape, y_test_author.shape)

feature shapes for author test data :  (249, 3840) (249,)


## SVM classifier

In [31]:
# initialize classifier using kernel name and class balance
svcclassifier = SVM_classifier(kernel_name_SVM, balanced_SVM )

In [32]:
# fit the classifier
svcclassifier.fit(X_train_tfidf, y_train)

SVC(kernel='sigmoid')

In [33]:
# get training accuracy 
train_accuracy = get_accuracy_score(X_train_tfidf, y_train)
print("train accuracy : ", train_accuracy)

train accuracy :  0.9954970557672325


In [34]:
# get testing accuracy 

test_accuracy = get_accuracy_score(X_test_tfidf, y_test)
print("test accuracy : ", test_accuracy)

test accuracy :  0.48625180897250364


In [35]:
# get testing accuracy on author generated text data

if author_data_choice == True:
    test_author_accuracy = get_accuracy_score(X_test_author_tfidf, y_test_author)
    print("author test accuracy : ", test_author_accuracy)

author test accuracy :  0.4819277108433735


In [36]:
# print results on data for mentined rows in test config file
print_results_on_data(row_from_author_data)

Question :  is covid the end of humanity
predicted :  Societal Effects 
correct :  Societal Effects 
Question :  how is covid prevented
predicted :  Testing 
correct :  Prevention 
Question :  how can i tell if i have the flu or covid
predicted :  Comparison 
correct :  Comparison 


In [37]:
# test on given test_question
pred = test_on_data(test_question)
print("Question : ", test_question)
print("predicted : ", pred)

Question :  how bad is covid pandemic
predicted :  Economic Effects 


## MLP classifier

In [38]:
# initialize MLP classifier using number of layers
mlpclassifier = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes_MLP, learning_rate_init=learning_rate_init_MLP, max_iter=max_iter_MLP)

In [39]:
# fit the classifier
mlpclassifier.fit(X_train_tfidf, y_train)

MLPClassifier(hidden_layer_sizes=(64, 32), learning_rate_init=0.01,
              max_iter=100)

In [40]:
# get training accuracy 
train_accuracy = get_accuracy_score(X_train_tfidf, y_train, 'MLP')
print("train accuracy : ", train_accuracy)

train accuracy :  0.9996536196744025


In [41]:
# get accuracy on test data
test_accuracy = get_accuracy_score(X_test_tfidf, y_test,'MLP')
print("test accuracy : ", test_accuracy)

test accuracy :  0.5224312590448625


In [42]:
# get accuracy on author generated test data
if author_data_choice == True:  
    test_author_accuracy = get_accuracy_score(X_test_author_tfidf, y_test_author,'MLP')
    print("author test accuracy : ", test_author_accuracy)

author test accuracy :  0.4457831325301205


In [43]:
# print results on data for mentined rows in test config file
print_results_on_data(row_from_author_data,'MLP')

Question :  is covid the end of humanity
predicted :  Societal Effects 
correct :  Societal Effects 
Question :  how is covid prevented
predicted :  Testing 
correct :  Prevention 
Question :  how can i tell if i have the flu or covid
predicted :  Comparison 
correct :  Comparison 


In [44]:
# test on given test_question
pred = test_on_data(test_question,'MLP')
print("Question : ", test_question)
print("predicted : ", pred)

Question :  how bad is covid pandemic
predicted :  Economic Effects 
