In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import svm
from numpy import array
import statistics
import csv
from gensim.summarization import keywords
import requests
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# Fetch every NLTK resource this notebook relies on, not only the stopword list:
# word_tokenize() needs the 'punkt' models and nltk.pos_tag() needs the
# 'averaged_perceptron_tagger' model. Without them a fresh environment fails
# with LookupError in later cells. Downloads are skipped when already up to date.
# NOTE(review): recent NLTK releases look these up under different ids
# ('punkt_tab', 'averaged_perceptron_tagger_eng') - confirm against the
# installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

In [None]:
# Load pretrained word2vec vectors from the binary GoogleNews file.
# `limit` caps how many vectors are read, so the vocabulary holds at most
# 400,000 entries.
model = KeyedVectors.load_word2vec_format(
    r"\GoogleNews-vectors-negative300.bin",
    binary=True,
    limit=400000,
)

In [None]:
# Stopword list: NLTK's English stopwords followed by extra tokens that must
# not count as content words (names such as 'occupation' / 'participant' and
# the '$' sign that precedes them in the dataset sentences).
all_stopwords = stopwords.words('english') + [
    'participant',
    'nom_pronoun',
    'occupation',
    '$',
    'acc_pronoun',
    'poss_pronoun',
]

In [None]:
# loading WinoGender dataset
#
# The rows are read eagerly into a list so that:
#   * the file handle is closed as soon as the data is in memory, and
#   * `wg` can be iterated more than once (a csv.reader is exhausted after the
#     first pass, so re-running a later cell would silently see zero rows).
# A forward slash is used in the path: it works on Windows as well as POSIX and
# avoids the invalid "\W" escape sequence of the original literal.
# newline='' is what the csv module documentation recommends for files handed
# to csv.reader.
with open('datasets/WinoGender.tsv', newline='', encoding='utf-8') as wGender:
    wg = list(csv.reader(wGender, delimiter="\t"))

In [None]:
# Part-of-speech tag groups, compared against the tag strings produced by make_tags().
# NOTE(review): 'NNPS' is not part of NOUN - confirm whether that is intentional.
VERBS = 'VB VBD VBG VBN VBP VBZ'.split()
ADJECTIVE = 'JJ JJR JJS'.split()
NOUN = 'NN NNP NNS'.split()

In [None]:
# POS-tag an already tokenized sentence

def make_tags(tokenized):
    """Run ``nltk.pos_tag`` over ``tokenized`` and return its result unchanged."""
    tagged = nltk.pos_tag(tokenized)
    return tagged


In [None]:
# verb test for a (word, tag) pair
def is_verb(tup):
    """Return the entry of VERBS equal to the tag ``tup[1]``, or False when none matches."""
    return next((verb_tag for verb_tag in VERBS if tup[1] == verb_tag), False)

# noun test for a (word, tag) pair
def is_noun(tup):
    """Return the entry of NOUN equal to the tag ``tup[1]``, or False when none matches."""
    return next((noun_tag for noun_tag in NOUN if tup[1] == noun_tag), False)

# adjective test for a (word, tag) pair
def is_adj(tup):
    """Return the entry of ADJECTIVE equal to the tag ``tup[1]``, or False when none matches."""
    return next((adj_tag for adj_tag in ADJECTIVE if tup[1] == adj_tag), False)

In [None]:
# extract feature vector for schema

def extract_feature(sentence1, sentence2, candidate1, candidate2,c_1,c_2):
    """Build the four word-similarity features for one schema.

    Both token lists are POS-tagged with ``make_tags`` and concatenated. Tokens
    equal to either candidate string are dropped, and two keywords are chosen
    from the remaining words, scanning from the end of the text backwards.
    Each selection round takes the next noun, then the next verb; an adjective
    is only taken when fewer than two keywords have been collected by then.

    Args:
        sentence1: list of word tokens; passed to ``make_tags``.
        sentence2: list of word tokens appended after ``sentence1``.
        candidate1: word compared against both keywords via ``model.similarity``.
        candidate2: second word compared against both keywords.
        c_1: string form of the first candidate; surrounding spaces are
            stripped and matching tokens are excluded from keyword selection.
        c_2: same as ``c_1`` for the second candidate.

    Returns:
        ``[sim(candidate1, k1), sim(candidate1, k2),
           sim(candidate2, k1), sim(candidate2, k2)]`` where ``k1``/``k2`` are
        the first two selected keywords.

    Raises:
        ValueError: fewer than two keywords could be selected. (The previous
            implementation looped forever in that situation.)
        Whatever ``model.similarity`` raises for words it cannot look up
        (presumably ``KeyError`` for out-of-vocabulary words - confirm against
        the installed gensim version).
    """
    tagged = make_tags(sentence1) + make_tags(sentence2)

    # Exclude the candidate words themselves. The earlier code extended a list
    # with the *characters* of the concatenated candidate strings, which never
    # filtered the candidates and instead dropped unrelated one-letter tokens.
    excluded = {c_1.strip(" "), c_2.strip(" ")}

    # Reverse once so that words closest to the end of the text come first.
    remaining = [pair for pair in reversed(tagged) if pair[0] not in excluded]

    sent_verb = [pair[0] for pair in remaining if is_verb(pair)]
    sent_adj = [pair[0] for pair in remaining if is_adj(pair)]
    sent_noun = [pair[0] for pair in remaining if is_noun(pair)]

    final_keywords = []
    while len(final_keywords) < 2:
        # Nothing left to draw from: fail loudly instead of spinning forever.
        if not (sent_noun or sent_verb or sent_adj):
            raise ValueError(
                "could not select two keywords (found %d)" % len(final_keywords)
            )

        if sent_noun:
            final_keywords.append(sent_noun.pop(0))

        if sent_verb:
            final_keywords.append(sent_verb.pop(0))

        # Adjectives are a fallback only.
        if len(final_keywords) >= 2:
            break

        if sent_adj:
            final_keywords.append(sent_adj.pop(0))

    k_1 = final_keywords[0]
    k_2 = final_keywords[1]

    return [
        model.similarity(candidate1, k_1),
        model.similarity(candidate1, k_2),
        model.similarity(candidate2, k_1),
        model.similarity(candidate2, k_2),
    ]
    


In [None]:
if __name__ == '__main__':
    # Driver: read the WinoGender rows, build one feature vector per sentence,
    # train an SVM on the first rows and report accuracy on the remainder.

    #list of all occupations
    occupations=[]

    #list of all participants
    participants=[]

    #tokenized_sentences
    sentence_tokens=[]

    #target answers
    answers=[]

    #whole sentences
    sentences=[]
    
    # for every schema in dataset extract occupation, participant, true answer and keywords
    for row in wg:
        # Blank or truncated lines have no sentence column; skip them instead
        # of failing with IndexError.
        if len(row) < 4:
            continue
        text=row[3].lower()
        text_tokens = word_tokenize(text)
        tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
        tokens=make_tags(tokens_without_sw)
        sentence_tokens.append(tokens)
        occupations.append(row[0])
        participants.append(row[1])
        if(row[2]=='1'):
            answers.append(1)
        else:
            answers.append(-1)

        text=text.replace("$occupation",row[0].lower())
        text=text.replace("$participant",row[1].lower())
        sentences.append(text)

    # The first collected entry comes from the header row; drop it everywhere.
    occupations=occupations[1:]
    participants=participants[1:]
    answers=answers[1:]
    sentence_tokens=sentence_tokens[1:]
    sentences=sentences[1:]
    
    
    print("==== WINOGENDER SCHEMAS : 120 TOTAL ... TRAIN : 96 VAL : 24====")
    
    
    print("Start training the model...")

    total = len(sentences)
    # Never index past the end of the data if fewer rows than expected were read.
    train_size = min(96, total)
    train_feature = []
    train_target=[]
    
    train_count=0
    train_sentences_present=[]
    train_sentences_absent=[]
    
    
    for i in range(train_size):
        sent_tokens=sentence_tokens[i]
        occ=occupations[i]
        part=participants[i]
        # extract_feature() takes six arguments and POS-tags its own input, so
        # pass the bare words (no second sentence) and use the occupation and
        # participant both as similarity candidates and as words to exclude.
        # The earlier three-argument call always raised TypeError, which the
        # bare `except` hid, leaving the training set empty.
        # NOTE(review): argument mapping inferred from how extract_feature uses
        # its parameters - confirm.
        words=[word for word, _tag in sent_tokens]
        try:
            feature = extract_feature(words,[],occ,part,occ,part)
            train_target.append(answers[i])
            train_feature.append(feature)
            train_count+=1
            train_sentences_present.append(sentences[i])

        # Only skip sentences whose words cannot be looked up in the embedding
        # model or that yield too few keywords; real bugs must surface.
        except (KeyError, ValueError):
            train_sentences_absent.append(sentences[i])
            
    if not train_feature:
        raise ValueError("no training features could be extracted")

    #SVM classifier        
    clf = svm.SVC(gamma="scale")
    clf.fit(train_feature, train_target)



    test_size = total - train_size
    test_feature = []
    
    val_sentences_present=[]
    val_sentences_absent=[]
    val_count=0
    
    val_target_answers=[]
                
    for i in range(test_size):
        sent_tokens=sentence_tokens[i+train_size]
        occ=occupations[i+train_size]
        part=participants[i+train_size]
        # Same call convention as in the training loop above.
        words=[word for word, _tag in sent_tokens]
        try:
            feature = extract_feature(words,[],occ,part,occ,part)
            val_target_answers.append(answers[i+train_size])
            test_feature.append(feature)
            val_count+=1
            val_sentences_present.append(sentences[i+train_size])

        except (KeyError, ValueError):
            val_sentences_absent.append(sentences[i+train_size])
        
    if not test_feature:
        raise ValueError("no validation features could be extracted")

    test_answer = clf.predict(test_feature)

    print("Start caculating accuracy...")
    
    val_correct_sent=[]
    val_wrong_sent=[]

    # Compare predictions with the gold answers of the sentences that produced features.
    correct = 0
    for i in range(len(test_answer)):
        if test_answer[i] == val_target_answers[i]:
            correct += 1
            val_correct_sent.append(val_sentences_present[i])
        else:
            val_wrong_sent.append(val_sentences_present[i])

    print('Accuracy of SVM:', round(float(correct) / len(test_answer) * 100,3),  "%")
    
    
    
    
    print("\n------------------------------------------------------------------\n\n")
    

        


    