In [None]:
from gensim.models import Word2Vec, KeyedVectors
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 

In [None]:
import os
import xml.etree.ElementTree
import nltk
from nltk.tokenize import word_tokenize
from sklearn import svm
from numpy import array
import statistics

In [None]:
# Load the pretrained word2vec vectors from the binary file; `limit` caps the
# vocabulary at the first 400,000 entries read from it.
model = KeyedVectors.load_word2vec_format(
    r"\GoogleNews-vectors-negative300.bin",
    binary=True,
    limit=400000,
)

In [None]:
# Part-of-speech tag names, grouped by the word class each one is counted as.
VERBS = [
    'VB', 'VBD', 'VBG',
    'VBN', 'VBP', 'VBZ',
]
ADJECTIVE = [
    'JJ', 'JJR', 'JJS',
]
# NOTE(review): 'NNPS' is not listed here -- confirm that is intended.
NOUN = [
    'NN', 'NNP', 'NNS',
]

In [None]:
# POS-tag the word tokens of a sentence
def make_tags(sentence):
    """Tokenize ``sentence`` and return the result of ``nltk.pos_tag`` on it.

    Tokens are the runs of word characters matched by ``\\w+``, so punctuation
    and whitespace never reach the tagger.
    """
    words = RegexpTokenizer(r"\w+").tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    return tagged_words

In [None]:
# verb check for a (word, tag) pair
def is_verb(tup):
    """Return the entry of VERBS equal to ``tup[1]``, or False if there is none.

    Callers only rely on the truthiness of the result.
    """
    return next((verb_tag for verb_tag in VERBS if tup[1] == verb_tag), False)

# noun check for a (word, tag) pair
def is_noun(tup):
    """Return the entry of NOUN equal to ``tup[1]``, or False if there is none.

    Callers only rely on the truthiness of the result.
    """
    return next((noun_tag for noun_tag in NOUN if tup[1] == noun_tag), False)

# adjective check for a (word, tag) pair
def is_adj(tup):
    """Return the entry of ADJECTIVE equal to ``tup[1]``, or False if there is none.

    Callers only rely on the truthiness of the result.
    """
    return next((adj_tag for adj_tag in ADJECTIVE if tup[1] == adj_tag), False)

In [None]:
# extract feature vector for schema

def extract_feature(sentence1, sentence2, candidate1, candidate2, c_1, c_2):
    """Build the 4-element similarity feature vector for one schema.

    Two keywords are picked from the POS-tagged words of ``sentence1`` and
    ``sentence2`` (words that belong to either candidate phrase are skipped),
    then each candidate word is compared with each keyword through the
    module-level ``model``.

    Parameters
    ----------
    sentence1, sentence2 : str
        The two text fragments of the schema.
    candidate1, candidate2 : str
        Single words representing the two candidates; these are the words
        passed to ``model.similarity``.
    c_1, c_2 : str
        The full candidate phrases. Every word in them is excluded from the
        keyword pool.

    Returns
    -------
    list
        ``[sim(candidate1, k1), sim(candidate1, k2),
        sim(candidate2, k1), sim(candidate2, k2)]``.

    Raises
    ------
    ValueError
        If fewer than two keywords can be found in the sentences.
    Exception
        Anything raised by ``model.similarity`` propagates unchanged
        (presumably a KeyError for out-of-vocabulary words -- verify against
        the gensim version in use).
    """
    tags = make_tags(sentence1) + make_tags(sentence2)

    # Collect the individual words of both candidate phrases. The previous
    # code extended a list with a concatenated *string*, which produced single
    # characters and therefore never filtered out the candidate words.
    # The same r"\w+" tokenization as make_tags is used so the words line up.
    word_tokenizer = RegexpTokenizer(r"\w+")
    candidate_words = set(word_tokenizer.tokenize(c_1))
    candidate_words.update(word_tokenizer.tokenize(c_2))

    sentence_tokens = [tagged for tagged in tags if tagged[0] not in candidate_words]

    # Scan back to front so that words nearer the end of the text are
    # preferred when keywords are picked below.
    reversed_tokens = sentence_tokens[::-1]
    sent_verb = [tagged[0] for tagged in reversed_tokens if is_verb(tagged)]
    sent_adj = [tagged[0] for tagged in reversed_tokens if is_adj(tagged)]
    sent_noun = [tagged[0] for tagged in reversed_tokens if is_noun(tagged)]

    # Each round takes one noun, then one verb, and falls back to an adjective
    # only while fewer than two keywords have been collected.
    final_keywords = []
    while len(final_keywords) < 2:
        if not (sent_noun or sent_verb or sent_adj):
            # Nothing left to pick from: without this guard the loop would
            # spin forever on sentences with fewer than two usable words.
            raise ValueError(
                "could not find two keywords (noun/verb/adjective) in the sentences"
            )

        if sent_noun:
            final_keywords.append(sent_noun.pop(0))

        if sent_verb:
            final_keywords.append(sent_verb.pop(0))

        if len(final_keywords) >= 2:
            break

        if sent_adj:
            final_keywords.append(sent_adj.pop(0))

    k_1 = final_keywords[0]
    k_2 = final_keywords[1]

    ftr1 = model.similarity(candidate1, k_1)
    ftr2 = model.similarity(candidate1, k_2)
    ftr3 = model.similarity(candidate2, k_1)
    ftr4 = model.similarity(candidate2, k_2)

    return [ftr1, ftr2, ftr3, ftr4]
    


In [None]:
if __name__ == '__main__':
    # Train an SVM on the similarity features of the first 228 schemas read
    # from WSC-285.xml and report its accuracy on the remaining ones.

    print("==== 80:20 TRAIN:VAL ...TRAIN=228 VAL=57 SCHEMAS====")

    sentences = []
    sent1 = []
    sent2 = []
    conjs = []
    prons = []
    answer = []

    candidates = []

    candidate1 = []
    candidate2 = []

    # Built with os.path.join: the former literal "datasets\WSC-285.xml"
    # depended on "\W" being an unrecognised escape sequence and only
    # resolved on Windows.
    filepath = os.path.join("datasets", "WSC-285.xml")

    xml_data = xml.etree.ElementTree.parse(filepath).getroot()

    size = 0

    for schema in xml_data.findall('schema'):

        # schema[0] holds three children: text before the pronoun, the
        # pronoun itself, and the text after it.
        sent1.append(schema[0][0].text.lower().strip())
        sent2.append(schema[0][2].text.lower().strip())
        prons.append(schema[0][1].text)

        # Reduce each of the two candidate phrases in schema[2] to one word:
        # the first token left after dropping one occurrence of "the".
        mylist = []
        for i in range(0, 2):
            cand = schema[2][i].text.lower()
            cand_text_tokens = word_tokenize(cand)
            if "the" in cand_text_tokens:
                cand_text_tokens.remove("the")
            mylist.append(cand_text_tokens[0])

        candidates.append(mylist)

        # The full (lower-cased) candidate phrases are kept as well; they are
        # handed to extract_feature alongside the single-word forms.
        c_1 = schema[2][0].text.lower()
        c_2 = schema[2][1].text.lower()
        candidate1.append(c_1)
        candidate2.append(c_2)

        # Label: +1 when the answer element reads 'A', -1 otherwise.
        ans = schema[3].text.strip()
        if ans == 'A':
            answer.append(1)
        else:
            answer.append(-1)

        sentences.append(sent1[size] + ' ' + prons[size] + ' ' + sent2[size])

        size += 1

    print("Start training the model...")

    total = len(sentences)
    train_size = 228
    train_feature = []
    train_target = []

    train_count = 0
    train_sentences_present = []
    train_sentences_absent = []

    for i in range(train_size):
        s1 = sent1[i]
        s2 = sent2[i]
        can1 = candidates[i][0]
        can2 = candidates[i][1]
        try:
            feature = extract_feature(s1, s2, candidates[i][0], candidates[i][1], candidate1[i], candidate2[i])
            train_target.append(answer[i])
            train_feature.append(feature)
            train_count += 1
            train_sentences_present.append(sentences[i])

        except Exception:
            # Schemas whose features cannot be computed are set aside
            # (presumably a word is missing from the embedding vocabulary).
            # `Exception` rather than a bare except, so that Ctrl-C still
            # interrupts the run.
            train_sentences_absent.append(sentences[i])

    clf = svm.SVC(gamma="scale")
    clf.fit(train_feature, train_target)

    test_size = total - train_size
    test_feature = []

    val_sentences_present = []
    val_sentences_absent = []
    val_count = 0

    val_target_answers = []

    for i in range(test_size):
        s1 = sent1[i + train_size]
        s2 = sent2[i + train_size]
        can1 = candidates[i + train_size][0]
        can2 = candidates[i + train_size][1]
        c_1 = candidate1[i + train_size]
        c_2 = candidate2[i + train_size]
        try:
            feature = extract_feature(s1, s2, can1, can2, c_1, c_2)
            val_target_answers.append(answer[i + train_size])
            test_feature.append(feature)
            val_count += 1
            val_sentences_present.append(sentences[i + train_size])
        except Exception:
            val_sentences_absent.append(sentences[i + train_size])

    test_answer = clf.predict(test_feature)

    print("Start caculating accuracy...")

    cor = []
    wrong = []

    # val_sentences_present is appended in step with test_feature, so index i
    # of the predictions refers to val_sentences_present[i]. (The code used to
    # read an undefined name, `in_vocab_val`, here.)
    correct = 0
    for i in range(len(test_answer)):
        if test_answer[i] == val_target_answers[i]:
            correct += 1
            cor.append(val_sentences_present[i])
        else:
            wrong.append(val_sentences_present[i])

    print('Accuracy of SVM:', round(float(correct) / len(test_answer) * 100,3),  "%")




    print("\n------------------------------------------------------------------\n\n")