# CS 224N Final Project - Evaluating on WinoDict Dataset
By: Christopher Pondoc, Joseph Guman, and Joseph O'Brien

In [1]:
import torch
# Report whether PyTorch can see a CUDA device in this session.
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


## Load in GPT-2 Model
Using HuggingFace Transformers

In [2]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel
# Load the tokenizer and the GPT-2 LM-head model from local checkpoint directories.
# NOTE(review): the tokenizer path ends in "-T" while the model path does not —
# confirm these two directories are meant to be used together.
tokenizer = AutoTokenizer.from_pretrained("weights/G2G-Finetuned-2-Epochs-T")
model = GPT2LMHeadModel.from_pretrained("weights/G2G-Finetuned-2-Epochs")

## Load in WinoDict Dataset
Loaded with pandas from the local CSV file `winodict/prob1_of_5.csv`

In [3]:
import pandas as pd
# Read one WinoDict CSV file into a DataFrame; each row is later passed to
# evaluate_winodict as a single example.
first_set = pd.read_csv("winodict/prob1_of_5.csv")

## Evaluating on One Example
Writing a function that is reusable and works for one example

In [4]:
def evaluate_winodict(example):
    """Score a single WinoDict example with the module-level `model`/`tokenizer`.

    The blank (`_`) in ``example['sentence']`` is filled with each of
    ``example['option1']`` and ``example['option2']``, the definition is
    prepended, and the language-model loss is computed only over the tokens
    that follow the filled-in option. The option with the lower loss is the
    model's prediction.

    Args:
        example: mapping (e.g. a DataFrame row) providing the keys
            'sentence', 'option1', 'option2', 'definition' and 'label'.

    Returns:
        True if the prediction matches ``example['label']``, False if it does
        not, or None when the sentence contains no `_` to fill.
    """
    sentence = example['sentence']
    if '_' not in sentence:
        # Nothing to substitute; signal "not scorable" to the caller.
        return None

    blank_at = sentence.index('_')
    context = example['definition'] + " " + sentence[:blank_at]
    continuation = sentence[blank_at + 1:]

    losses = []
    for option in (example['option1'], example['option2']):
        # Change a leading 'The' to lowercase before substituting it in.
        if option.startswith("The "):
            option = "the " + option[4:]

        prefix = context + option
        inputs = tokenizer(prefix + continuation, return_tensors="pt")

        # Build the labels from the very ids fed to the model so both tensors
        # always have the same shape, then mask the prefix with -100 so only
        # the continuation contributes to the loss.
        prefix_len = tokenizer(prefix, return_tensors="pt")["input_ids"].shape[1]
        labels = inputs["input_ids"].clone()
        labels[:, :prefix_len] = -100

        # Pure evaluation: no autograd graph is needed.
        with torch.no_grad():
            losses.append(model(**inputs, labels=labels).loss.item())

    first_loss, second_loss = losses

    # Lower loss wins; ties fall through to the second option.
    # NOTE(review): assumes label 0 means option1 and label 1 means option2 —
    # confirm against the CSV's label encoding.
    if first_loss < second_loss:
        return int(example['label']) == 0
    return int(example['label']) == 1

## Evaluating WinoDict on GPT-2
Looking specifically at `WinoDict`: the first set of generated examples is scored with the definition prepended to each sentence and each candidate option substituted for the blank.

In [5]:
# Tally accuracy over every scorable row of the loaded WinoDict file.
correct, total = 0, 0
for index, row in first_set.iterrows():
    # Skip rows whose 'lemma' cell is the literal column name
    # (presumably header lines repeated inside the CSV — verify).
    if row['lemma'] == "lemma":
        continue

    outcome = evaluate_winodict(row)
    if outcome is None:
        # The sentence had no blank to fill, so it cannot be scored;
        # leave it out of both the numerator and the denominator.
        continue

    total += 1
    correct += outcome
    print(correct)

# Guard the division so an empty or fully-skipped file reports cleanly.
if total:
    print("GPT-2 Medium achieved a score of: " + str(float(correct) / float(total)))
else:
    print("No WinoDict examples were evaluated.")

0
1
2
3
3
4
4
5
5
6
6
7
7
8
8
8
8
8
9
9
10
10
11
12
13
13
14
14
14
14
15
16
17
18
18
19
19
20
21
22
23
23
23
24
24
24
25
25
25
26
26
27
28
28
28
29
29
29
30
31
32
33
33
33
33
34
34
35
36
37
38
38
38
38
39
40
41
41
41
42
42
43
44
44
44
45
45
46
47
47
48
48
48
49
50
51
51
52
53
53
53
54
55
55
56
57
57
57
57
58
59
60
60
60
61
61
62
62
62
63
63
64
65
66
67
68
69
69
70
70
70
71
72
72
72
72
73
73
74
74
75
75
76
76
77
77
77
77
77
77
78
79
79
80
80
81
81
81
82
83
84
85
86
86
87
88
89
89
89
90
90
91
91
92
93
94
95
96
97
98
99
100
101
102
102
103
103
104
105
105
106
106
107
108
109
110
111
111
112
112
113
113
114
114
115
115
115
116
117
117
117
118
119
119
119
120
120
120
121
122
122
123
123
123
123
123
123
123
124
124
125
125
126
126
127
128
129
130
130
130
130
130
130
130
130
131
131
131
132
132
132
133
134
134
134
135
136
136
136
136
137
137
138
138
138
138
138
138
139
139
140
141
141
141
141
141
142
142
143
144
144
145
146
146
147
147
147
148
148
148
149
149
149
149
149
149
149
149
150
150
1