In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import svm
from numpy import array
import statistics
import csv
from gensim.summarization import keywords
import requests

In [None]:
# Fetch every NLTK resource this notebook relies on so that it also runs on a
# fresh environment: 'stopwords' feeds the stop-word list, while word_tokenize
# needs the Punkt sentence tokenizer data ('punkt' on older NLTK releases,
# 'punkt_tab' on newer ones). Resources that are already installed are skipped.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Stop-word list used to filter schema tokens before keyword extraction:
# NLTK's English stop words plus the pieces of the template placeholders
# ("$occupation", "$participant" and, presumably, the pronoun slots) that
# remain after tokenization.
all_stopwords = stopwords.words('english') + [
    'participant',
    'nom_pronoun',
    'occupation',
    '$',
    'acc_pronoun',
    'poss_pronoun',
]

In [None]:
# loading WinoGender dataset
#
# A forward-slash path is portable across operating systems and avoids the
# invalid "\W" escape sequence that a backslash produces inside a normal string
# literal. newline='' is what the csv module asks for when opening files.
# The rows are read eagerly into a list so that the file handle is closed right
# away and the cells that iterate over `wg` can be re-run without finding an
# already exhausted reader.
with open('datasets/WinoGender.tsv', newline='') as wGender:
    wg = list(csv.reader(wGender, delimiter="\t"))

In [None]:
# returns relatedness value between two words - word1 & word2

def relatedness(word1,word2):
    """Return the ConceptNet relatedness value between two English terms.

    Sends a GET request to the ConceptNet ``/relatedness`` endpoint for the
    nodes ``/c/en/<word1>`` and ``/c/en/<word2>`` and returns the ``value``
    field of the JSON response.

    Args:
        word1: first term (string).
        word2: second term (string).

    Returns:
        The ``value`` entry of the decoded JSON response.

    Raises:
        requests.RequestException: on connection problems, on timeout, or when
            the server answers with an HTTP error status.
        ValueError: if the response body is not valid JSON.
        KeyError: if the JSON response has no ``value`` field.
    """
    from urllib.parse import quote

    # Percent-encode the terms so that spaces or reserved characters
    # (&, ?, #, /) cannot corrupt the query string.
    link = (
        'http://api.conceptnet.io/relatedness?node1=/c/en/' + quote(word1, safe='')
        + '&node2=/c/en/' + quote(word2, safe='')
    )

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(link, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error payload.
    response.raise_for_status()

    return response.json()['value']


In [None]:
# extract feature vector for a sentence
# each component is the relatedness value between a role word
# (occupation or participant) and one of the two sentence keywords

def extract_feature(sentence_tokens,occupation,participant):
    """Build the 4-component feature vector of one schema.

    Args:
        sentence_tokens: indexable whose first two items are the keywords
            of the sentence.
        occupation: occupation term of the schema.
        participant: participant term of the schema.

    Returns:
        A list ``[occ~kw1, occ~kw2, part~kw1, part~kw2]`` where ``a~b`` is
        ``relatedness(a, b)``.
    """
    first_kw, second_kw = sentence_tokens[0], sentence_tokens[1]

    # The outer loop runs over the roles and the inner loop over the keywords,
    # so the resulting order is occupation first, participant second.
    return [
        relatedness(role, kw)
        for role in (occupation, participant)
        for kw in (first_kw, second_kw)
    ]


In [None]:
# main + read input data

def _collect_features(indices, sentence_tokens, occupations, participants, answers, sentences):
    """Build feature vectors for the schemas at the given positions.

    For every index in ``indices`` a feature vector is requested from
    ``extract_feature``. Extraction is best effort: when it raises, the schema
    is skipped and its sentence is recorded instead of aborting the run.

    Returns:
        A tuple ``(features, targets, present, absent)`` where ``features`` and
        ``targets`` are aligned lists of feature vectors and answers,
        ``present`` holds the sentences that produced a feature vector and
        ``absent`` the sentences that were skipped.
    """
    features, targets, present, absent = [], [], [], []
    for idx in indices:
        try:
            feature = extract_feature(sentence_tokens[idx], occupations[idx], participants[idx])
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt and
            # SystemExit still stop the notebook.
            absent.append(sentences[idx])
            continue
        features.append(feature)
        targets.append(answers[idx])
        present.append(sentences[idx])
    return features, targets, present, absent


if __name__ == '__main__':
    #list of all occupations
    occupations=[]

    #list of all participants
    participants=[]

    #tokenized_sentences (the two keywords kept for every schema)
    sentence_tokens=[]

    #target answers (1 when the answer column is '1', otherwise -1)
    answers=[]

    #whole sentences with the occupation/participant placeholders filled in
    sentences=[]

    # for every schema in dataset extract occupation, participant, true answer and keywords
    for row in wg:
        # Blank or truncated lines have no sentence column; skip them.
        if len(row) < 4:
            continue

        text=row[3].lower()
        text_tokens=word_tokenize(text)

        tokens_without_sw = [word for word in text_tokens if word not in all_stopwords]
        my_sentence=" ".join(tokens_without_sw)

        # extract two keywords from schema
        ans=keywords(my_sentence,words=2,split=False,lemmatize = True)
        li = ans.split(" ")

        # When the result contains no space, the keywords appear to be
        # separated by newlines instead, so split on those.
        if len(li)==1 and "\n" in li[0]:
            li=li[0].split("\n")

        # Only schemas that yield exactly two tokens are usable.
        if len(li)!=2:
            continue

        sentence_tokens.append(li)
        occupations.append(row[0])
        participants.append(row[1])
        answers.append(1 if row[2]=='1' else -1)

        text=text.replace("$occupation",row[0].lower())
        text=text.replace("$participant",row[1].lower())
        sentences.append(text)

    # Drop the first collected entry.
    # NOTE(review): presumably this removes the TSV header row; if the header
    # never yields two keywords it is not collected at all and this discards
    # the first real schema instead - verify against the dataset.
    occupations=occupations[1:]
    participants=participants[1:]
    answers=answers[1:]
    sentence_tokens=sentence_tokens[1:]
    sentences=sentences[1:]

    print("==== WINOGENDER SCHEMAS -> TRAIN: 88 VAL : 32 ====")


    print("Start training the model...")

    total = len(sentences)
    # Never request more training schemas than were actually collected.
    train_size = min(88, total)

    (train_feature, train_target,
     train_sentences_present, train_sentences_absent) = _collect_features(
        range(train_size), sentence_tokens, occupations, participants, answers, sentences)
    train_count = len(train_feature)
    if train_sentences_absent:
        print("Skipped", len(train_sentences_absent), "training schemas (feature extraction failed)")

    clf = svm.SVC(gamma="scale")
    clf.fit(train_feature, train_target)

    # Everything after the training slice is used for validation.
    test_size = total - train_size

    (test_feature, val_target_answers,
     val_sentences_present, val_sentences_absent) = _collect_features(
        range(train_size, total), sentence_tokens, occupations, participants, answers, sentences)
    val_count = len(test_feature)
    if val_sentences_absent:
        print("Skipped", len(val_sentences_absent), "validation schemas (feature extraction failed)")

    # predict() cannot be called without samples.
    test_answer = clf.predict(test_feature) if test_feature else []


    val_correct_sent=[]
    val_wrong_sent=[]

    print("Start caculating accuracy...")

    correct = 0
    for predicted, expected, sentence in zip(test_answer, val_target_answers, val_sentences_present):
        if predicted == expected:
            correct += 1
            val_correct_sent.append(sentence)
        else:
            val_wrong_sent.append(sentence)

    if len(test_answer):
        print('Accuracy of SVM:', round(float(correct) / len(test_answer) * 100,3),  "%")
    else:
        # Avoid a division by zero when no validation sample is available.
        print('Accuracy of SVM: not available (no validation features were extracted)')




    print("\n------------------------------------------------------------------\n\n")
    

    