data_loader.py
213 lines (162 loc) · 7.02 KB
import string
import re
import torch
import nltk
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm
from torchtext import vocab
from torch.utils.data import Dataset, DataLoader
# for progress bar during pandas ops
tqdm.pandas()
# requires the NLTK 'stopwords' corpus (download once with nltk.download('stopwords'))
stop_words = set(nltk.corpus.stopwords.words('english') + list(string.punctuation))
# -------------- Helper Functions --------------
def preprocess(text):
    # Add spaces around /, # and newlines so they become separate tokens
    text = re.sub(r'([/#\n])', r' \1 ', text)
    # Collapse runs of multiple spaces into one
    text = re.sub(' {2,}', ' ', text)
    # Cap any character repeated more than twice at exactly two (e.g. "soooo" -> "soo")
    text = re.sub(r'(.)\1+', r'\1\1', text)
    # Remove numbers and any words with digits mixed in
    text = re.sub(r'\w*\d\w*', '', text).strip()
    # Remove possessive "'s" and hyphens
    return text.replace("'s", "").replace("-", "")
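# Illustrative example of the expected behavior (an assumption, not executed at import;
# output spacing is approximate):
#   preprocess("soooo good 10/10")  ->  roughly "soo good /"
#   the repeated "o"s are capped at two and the digit tokens are dropped.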
def tokenize(text):
    '''
    :param text: a doc with multiple sentences, type: str
    :return: a word list, type: list
    https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    e.g.
    Input: 'It is a nice day. I am happy.'
    Output (after stop word and punctuation filtering): ['nice', 'day', 'happy']
    '''
    tokens = []
    for word in nltk.casual_tokenize(text, preserve_case=False):
        # skip stop words, punctuation, and purely numeric tokens
        if word not in stop_words and not word.isnumeric():
            tokens.append(word)
    return tokens
def get_sequence(data, seq_length, vocab_dict):
    '''
    :param data: a list of word lists (one per document), type: list
    :param seq_length: the length of sequences, type: int
    :param vocab_dict: a dict from words to indices, type: dict
    :return: a dense sequence matrix whose elements are indices of words,
             padded with 0 and truncated to seq_length
    '''
    data_matrix = np.zeros((len(data), seq_length), dtype=int)
    for i, doc in enumerate(data):
        for j, word in enumerate(doc):
            # truncate documents longer than seq_length
            if j == seq_length:
                break
            word_idx = vocab_dict.get(word, 1)  # 1 means the unknown word
            data_matrix[i, j] = word_idx
    return data_matrix
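# Illustrative example (assumed toy vocabulary, not executed at import):
#   with vocab_dict = {'<pad>': 0, '<unk>': 1, 'good': 2, 'food': 3} and seq_length = 4,
#   get_sequence([['good', 'food'], ['great', 'food', 'good']], 4, vocab_dict)
#   -> [[2, 3, 0, 0],
#       [1, 3, 2, 0]]   # 'great' is out of vocabulary, so it maps to 1 (<unk>)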
def build_vocab(sentence_list, threshold, vocab=None):
    """
    :param sentence_list: an iterable of word lists (one per document), type: iterable
    :param threshold: a word must appear strictly more than `threshold` times to be included, type: int
    :param vocab: an existing Vocab object to extend, or None to build a new one, type: Vocab
    :return: a Vocab object mapping words to indices
    """
    counter = Counter()
    for sentence in sentence_list:
        counter.update(sentence)
    # sort by most common
    word_count = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    # exclude words at or below the frequency threshold
    words = [word for word, count in word_count if count > threshold]
    if vocab is None:
        vocab = Vocab()
        vocab.add_word('<pad>')  # 0 means the padding signal
        vocab.add_word('<unk>')  # 1 means the unknown word
    # add the remaining words to the vocab
    for word in words:
        vocab.add_word(word)
    return vocab
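# Illustrative example (toy input, not executed at import):
#   build_vocab([['good', 'food'], ['good']], threshold=1)
#   keeps only words with count > 1, so the resulting word2idx is
#   {'<pad>': 0, '<unk>': 1, 'good': 2}; 'food' (count 1) is excluded.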
class Vocab(object):
    def __init__(self):
        self.word2idx = dict()
        self.vocab_size = 0

    def __len__(self):
        return len(self.word2idx)

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.vocab_size
            self.vocab_size += 1

    def get_embedding(self, name, embedding_dim):
        if name == 'glove':
            pretrained_type = vocab.GloVe(name='42B', dim=embedding_dim)
        elif name == 'fasttext':
            if embedding_dim != 300:
                raise ValueError("Got embedding dim {}, expected size 300".format(embedding_dim))
            pretrained_type = vocab.FastText('en')
        embedding_len = len(self)
        weights = np.zeros((embedding_len, embedding_dim))
        words_found = 0
        for word, index in self.word2idx.items():
            try:
                # index stoi directly so that a missing word raises KeyError
                # (torchtext's __getitem__ would silently return a zero vector instead)
                weights[index] = pretrained_type.vectors[pretrained_type.stoi[word]]
                words_found += 1
            except KeyError:
                if index == 0:
                    continue  # keep the <pad> row as all zeros
                weights[index] = np.random.normal(scale=0.6, size=(embedding_dim,))
        print(embedding_len - words_found, "words missing from pretrained")
        return torch.from_numpy(weights).float()
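# Hypothetical usage sketch (an assumption, not part of this module; it presumes torchtext
# can download the GloVe 42B vectors, which are only distributed with dim=300):
#   vocab_obj = build_vocab(token_lists, threshold=5)
#   weights = vocab_obj.get_embedding('glove', 300)
#   embedding = torch.nn.Embedding.from_pretrained(weights, freeze=False)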
# ----------------- End of Helper Functions -----------------
class SentimentDataset(Dataset):
    """
    Defines a dataset composed of sentiment text and labels

    Attributes:
        df (DataFrame): DataFrame read from the CSV at the given path
        vocab (Vocab): the vocabulary object mapping words to indices for this dataset
        samples_weight (Tensor, shape (len(labels),)): samples_weight[i] is the weight of the ith sample
        data (list[(int, ndarray, int)]): (label, sequence, length) tuples, one per sample
    """

    def __init__(self, path, fix_length, threshold, vocab=None):
        df = pd.read_csv(path)
        self.df = df
        # preprocess
        df["text"] = df["text"].progress_apply(preprocess)
        # tokenize
        df['words'] = df["text"].progress_apply(tokenize)
        # record each document's length, capped at fix_length
        df['lengths'] = df['words'].apply(lambda x: fix_length if len(x) > fix_length else len(x))
        # filter out rows with no tokens left
        df = df.loc[df['lengths'] >= 1]
        # build vocab (or extend the one passed in)
        self.vocab = build_vocab(df['words'], threshold, vocab)
        # shift star ratings to class indices 0 - 4
        labels = df["stars"].apply(int) - 1
        # pad to fix_length & numericalize
        seqs = get_sequence(df['words'], fix_length, self.vocab.word2idx)
        # compute sample weights from inverse class frequencies
        class_sample_count = np.unique(labels, return_counts=True)[1]
        weight = 1. / class_sample_count
        self.samples_weight = torch.from_numpy(weight[labels])
        self.data = list(zip(labels, seqs, df["lengths"]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]
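# Hypothetical use of samples_weight (not wired up in this file): it could feed a
# torch.utils.data.WeightedRandomSampler to rebalance classes during training, e.g.
#   sampler = torch.utils.data.WeightedRandomSampler(dataset.samples_weight,
#                                                    num_samples=len(dataset))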
def get_loader(fix_length, vocab_threshold, batch_size):
    """Build train/valid/test DataLoaders; the vocabulary built on the train set is passed to the valid and test sets."""
    train_dataset = SentimentDataset("data/train.csv", fix_length, vocab_threshold)
    vocab = train_dataset.vocab
    valid_dataset = SentimentDataset("data/valid.csv", fix_length, vocab_threshold, vocab)
    test_dataset = SentimentDataset("data/test.csv", fix_length, vocab_threshold, vocab)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=4)
    valid_dataloader = DataLoader(dataset=valid_dataset,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  num_workers=4)
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=4)
    return train_dataloader, valid_dataloader, test_dataloader, vocab
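
# Minimal usage sketch, assuming data/train.csv, data/valid.csv and data/test.csv exist
# with "text" and "stars" columns; the hyperparameter values below are illustrative only.
if __name__ == "__main__":
    train_loader, valid_loader, test_loader, shared_vocab = get_loader(
        fix_length=100, vocab_threshold=5, batch_size=32)
    print("vocab size:", len(shared_vocab))
    # each batch collates to (labels, sequences, lengths) tensors
    labels, seqs, lengths = next(iter(train_loader))
    print("batch shapes:", labels.shape, seqs.shape, lengths.shape)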