In [1]:
import pandas as pd, math, csv
from sklearn.model_selection import KFold
from csv import reader
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# import training data from trg.csv
def import_train_data(path="trg.csv"):
    """Load the labelled training set and split it into labels and text.

    Columns are picked by position, not by header name: the second column
    of the CSV is returned as the class labels and the third column as the
    abstract text. The first column is not used.

    Args:
        path: Location of the training CSV. Defaults to ``"trg.csv"`` in
            the current working directory, the file the script reads when
            called without arguments.

    Returns:
        A ``(target, abstract)`` tuple of pandas Series, in file row order.

    Raises:
        IndexError: If the file has fewer than three columns.
    """
    data = pd.read_csv(path)
    # Positional selection keeps the loader independent of header spelling.
    target = data.iloc[:, 1]
    abstract = data.iloc[:, 2]
    return target, abstract

# import data to be predicted from tst.csv
def import_predict_data(path="tst.csv"):
    """Load the unlabelled abstracts that the model should classify.

    The abstract text is taken from the second column of the CSV, selected
    by position rather than by header name. The first column is not used.

    Args:
        path: Location of the CSV to read. Defaults to ``"tst.csv"`` in the
            current working directory, the file the script reads when
            called without arguments.

    Returns:
        A pandas Series holding the abstract text, in file row order.

    Raises:
        IndexError: If the file has fewer than two columns.
    """
    data = pd.read_csv(path)
    # Positional selection keeps the loader independent of header spelling.
    return data.iloc[:, 1]

# train model using abstract and target values
def train(abstract_train, target_train, stop_words=None, lemmatize=None):
    """Collect the counts needed by a multinomial Naive Bayes classifier.

    Each abstract is split on whitespace and lower-cased; stop words and
    all-digit tokens are dropped and the remaining tokens are lemmatized.
    For every surviving word the function counts how often it occurs in
    abstracts of each class ("A", "B", "E", "V").

    Args:
        abstract_train: pandas Series of abstract strings.
        target_train: pandas Series of class labels, positionally aligned
            with ``abstract_train``.
        stop_words: Optional iterable of words to ignore. When omitted the
            module-level ``stopwordList`` is used.
        lemmatize: Optional callable mapping a word to its root form. When
            omitted the module-level ``lemmatizer.lemmatize`` is used.

    Returns:
        A dict with the keys

        * ``"wordDict"``: ``{word: {"A": n, "B": n, "E": n, "V": n}}``
        * ``"probA"`` .. ``"probV"``: fraction of abstracts in each class
        * ``"numWordsA"`` .. ``"numWordsV"``: total counted words per class

    Raises:
        ValueError: If there are no abstracts to train on, or a label is
            not one of "A", "B", "E", "V".
    """
    classes = ("A", "B", "E", "V")

    # Fall back to the resources the script creates at module level.
    if stop_words is None:
        stop_words = stopwordList
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize
    # A set turns the per-token stop-word check into an O(1) lookup.
    stop_words = frozenset(stop_words)

    abstract_count = abstract_train.size
    if abstract_count == 0:
        raise ValueError("cannot train on an empty set of abstracts")

    class_counts = dict.fromkeys(classes, 0)
    word_totals = dict.fromkeys(classes, 0)
    word_dict = {}

    for i in range(abstract_count):
        target = target_train.iloc[i]
        if target not in class_counts:
            # An unknown label would otherwise dilute the class priors or
            # fail later with an opaque KeyError.
            raise ValueError("unexpected class label: {!r}".format(target))
        class_counts[target] += 1

        for word in abstract_train.iloc[i].split():
            word = word.lower()
            # Filtering happens before lemmatization, on the raw token.
            if word in stop_words or word.isdigit():
                continue
            word = lemmatize(word)
            if word not in word_dict:
                word_dict[word] = dict.fromkeys(classes, 0)
            word_dict[word][target] += 1
            # Keep the running per-class total so no second pass is needed.
            word_totals[target] += 1

    return {
        "wordDict": word_dict,
        "probA": class_counts["A"] / abstract_count,
        "probB": class_counts["B"] / abstract_count,
        "probE": class_counts["E"] / abstract_count,
        "probV": class_counts["V"] / abstract_count,
        "numWordsA": word_totals["A"],
        "numWordsB": word_totals["B"],
        "numWordsE": word_totals["E"],
        "numWordsV": word_totals["V"],
    }

# predict target values of abstracts
def predict(abstract, train_results, stop_words=None, lemmatize=None):
    """Assign one of the classes "A", "B", "E", "V" to every abstract.

    Each class is scored as ``log P(c)`` plus, for every token found in the
    training vocabulary, ``log((count(w, c) + 1) / (count(c) + |V|))``
    (add-one smoothing). Tokens are prepared as in training: split on
    whitespace, lower-cased, stop words and all-digit tokens dropped, then
    lemmatized. Words absent from the vocabulary are ignored.

    Args:
        abstract: pandas Series of abstract strings.
        train_results: Dict produced by ``train`` (keys ``"wordDict"``,
            ``"probA"`` .. ``"probV"``, ``"numWordsA"`` .. ``"numWordsV"``).
        stop_words: Optional iterable of words to ignore. When omitted the
            module-level ``stopwordList`` is used.
        lemmatize: Optional callable mapping a word to its root form. When
            omitted the module-level ``lemmatizer.lemmatize`` is used.

    Returns:
        A list of predicted class labels, one per abstract, in order. Ties
        go to the earliest class in the order A, B, E, V.
    """
    classes = ("A", "B", "E", "V")

    # Fall back to the resources the script creates at module level.
    if stop_words is None:
        stop_words = stopwordList
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize
    # A set turns the per-token stop-word check into an O(1) lookup.
    stop_words = frozenset(stop_words)

    word_dict = train_results["wordDict"]
    # |V|: number of distinct words seen in training.
    vocab_size = len(word_dict)

    log_priors = []
    denominators = []
    for label in classes:
        prior = train_results["prob" + label]
        # A class missing from the training data has prior 0; math.log(0)
        # raises ValueError, so score such a class as impossible instead.
        log_priors.append(math.log(prior) if prior > 0 else float("-inf"))
        denominators.append(train_results["numWords" + label] + vocab_size)

    predictions = []
    for i in range(abstract.size):
        # Tokenise once per abstract instead of once per class.
        known_counts = []
        for word in abstract.iloc[i].split():
            word = word.lower()
            if word in stop_words or word.isdigit():
                continue
            counts = word_dict.get(lemmatize(word))
            if counts is not None:
                known_counts.append(counts)

        scores = []
        for idx, label in enumerate(classes):
            score = log_priors[idx]
            for counts in known_counts:
                score += math.log((counts[label] + 1) / denominators[idx])
            scores.append(score)

        # list.index returns the first maximum, which fixes the tie-break.
        predictions.append(classes[scores.index(max(scores))])

    return predictions
    
# test predicted values with expected values of abstracts
def test(abstract_test, target_test, train_results):
    """Measure how often the model's prediction matches the true label.

    Args:
        abstract_test: pandas Series of abstract strings to classify.
        target_test: pandas Series of expected labels, positionally aligned
            with ``abstract_test``.
        train_results: Trained model dict, passed straight to ``predict``.

    Returns:
        The fraction of abstracts whose predicted class equals the expected
        class (a value between 0 and 1).
    """
    predicted = predict(abstract_test, train_results)
    total = abstract_test.size

    # Compare position by position; labels are read with iloc so the
    # Series index values do not matter.
    hits = sum(
        1
        for position in range(total)
        if predicted[position] == target_test.iloc[position]
    )
    return hits / total

# apply ten fold cross validation to evaluate accuracy of model
def ten_fold_cv(target, abstract):
    """Estimate model accuracy with 10-fold cross validation.

    The data is split into ten consecutive folds (``KFold`` is used without
    shuffling). For each fold the model is trained on the other nine folds
    and scored on the held-out one.

    Args:
        target: pandas Series of class labels.
        abstract: pandas Series of abstract strings, positionally aligned
            with ``target``.

    Returns:
        The mean accuracy over the ten folds, as a percentage (0-100).
    """
    kf = KFold(n_splits=10)
    accuracies = []

    # KFold only needs the number of samples and yields *positional* index
    # arrays, so rows are selected with iloc. Label-based loc would only be
    # right when the Series carry a default 0..n-1 index.
    for train_index, test_index in kf.split(abstract):
        target_train, target_test = target.iloc[train_index], target.iloc[test_index]
        abstract_train, abstract_test = abstract.iloc[train_index], abstract.iloc[test_index]

        train_results = train(abstract_train, target_train)
        accuracies.append(test(abstract_test, target_test, train_results))

    # Average the per-fold accuracies and convert to a percentage.
    avg_accuracy = sum(accuracies) / len(accuracies) * 100
    return avg_accuracy
        
if __name__ == '__main__':
    # Script entry point: evaluate the model with 10-fold cross validation,
    # then retrain on all training data and write predictions for tst.csv.

    # get training data
    target_train, abstract_train = import_train_data()

    # train() and predict() read `lemmatizer` and `stopwordList` as module
    # globals, so both must stay at module level under these exact names.
    lemmatizer = WordNetLemmatizer()
    # Stored as a set: every token is tested for membership, and a set
    # lookup is O(1) where scanning the original list is O(n).
    stopwordList = set(stopwords.words('english'))

    # apply 10-fold cross validation to evaluate model accuracy
    tenFoldAccuracy = ten_fold_cv(target_train, abstract_train)
    print(f"Ten-fold Cross Validation Accuracy: {round(tenFoldAccuracy, 2)}%")

    # retrain on the full training set and classify the unlabelled abstracts
    train_results = train(abstract_train, target_train)
    abstract_predict = import_predict_data()
    classPredictions = predict(abstract_predict, train_results)

    # newline='' lets the csv module control line endings itself
    with open('gng276.csv', 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(["id", "class"])
        # ids are 1-based row numbers in prediction order
        writer.writerows(enumerate(classPredictions, start=1))
    print("Finished writing to gng276.csv")

Ten-fold Cross Validation Accuracy: 94.95%
Finished writing to gng276.csv
