### 作業目的: 熟練使用PyTorch的Dataset與DataLoader進行資料讀取

本次作業主要會使用[IMDB](http://ai.stanford.edu/~amaas/data/sentiment/)資料集，利用PyTorch的Dataset與DataLoader進行
客製化資料讀取。
下載後的資料有分成train與test，因為這份作業目的在讀取資料，所以我們取用train部分來進行練習。
(請同學先行至IMDB下載資料)

### 載入套件

In [1]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
# Fetch the NLTK resources this notebook relies on.
nltk.download('stopwords')  # stopword list used to prune the vocabulary
nltk.download('wordnet')  # corpus that WordNetLemmatizer.lemmatize looks words up in
nltk.download('punkt')  # tokenizer data required by word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eating\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eating\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 探索資料與資料前處理
在train資料中，有分成pos(positive)與neg(negative)，分別為正評價與負評價，此評價即為label。

In [2]:
import os
import re
from nltk.corpus import stopwords

# English stopwords held in a set; used for `not in` filtering of tokens.
stoplist = set(stopwords.words("english"))

def rm_tags(test):
    """Return *test* with every ``<...>`` markup tag removed.

    A tag is a ``<``, followed by one or more characters other than ``>``,
    followed by ``>``. Text outside tags is left unchanged.
    """
    tag_pattern = r'<[^>]+>'
    return re.sub(tag_pattern, '', test)

# Collect the path of every training review (positive first, then negative).
# The vocabulary built from these files holds every word found in the reviews.
pos_path = "aclImdb_v1/aclImdb_v1/aclImdb/train/pos/"
neg_path = "aclImdb_v1/aclImdb_v1/aclImdb/train/neg/"
vocab = []

# os.listdir returns entries in arbitrary order; sorting keeps the file order
# (and therefore any vocabulary indices derived from it) reproducible.
file_list = [pos_path + f for f in sorted(os.listdir(pos_path))]
file_list += [neg_path + f for f in sorted(os.listdir(neg_path))]

print(len(file_list))

# Tokens already stored in `vocab`, kept in a set so each membership test is
# O(1) instead of a linear scan of the list; `vocab` keeps first-seen order.
_seen_tokens = set(vocab)

for fi in file_list:
    with open(fi, encoding='utf8') as file_input:
        s = rm_tags(file_input.readlines()[0]).lower()  # first line only, lower-cased
    tokens = nltk.tokenize.word_tokenize(s)  # split the string into word tokens
    tokens = [t for t in tokens if len(t) > 5]  # drop short tokens
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]  # reduce to base form
    for t in tokens:
        if t not in _seen_tokens:
            _seen_tokens.add(t)
            vocab.append(t)

# Prune the vocabulary with the NLTK stopwords: filler words carry little
# useful information and may also hurt model training. Entries containing a
# digit (for example the "3rd" in "3rd edition") are dropped in the same pass.
print(f"vocab length before removing stopwords: {len(vocab)}")
vocab = [
    word
    for word in vocab
    if word not in stoplist and not any(ch.isdigit() for ch in word)
]
print(f"vocab length after removing stopwords: {len(vocab)}")

# Turn the vocabulary list into a word -> position dictionary.
vocab_dic = dict(zip(vocab, range(len(vocab))))
count = len(vocab)  # number of positions handed out

vocab_dic

25000
vocab length before removing stopwords: 108471
vocab length after removing stopwords: 105852


{'bromwell': 0,
 'cartoon': 1,
 'comedy': 2,
 'program': 3,
 'school': 4,
 'teacher': 5,
 'teaching': 6,
 'profession': 7,
 'believe': 8,
 'satire': 9,
 'closer': 10,
 'reality': 11,
 'scramble': 12,
 'survive': 13,
 'financially': 14,
 'insightful': 15,
 'student': 16,
 'pathetic': 17,
 'pettiness': 18,
 'situation': 19,
 'remind': 20,
 'episode': 21,
 'repeatedly': 22,
 'immediately': 23,
 'recalled': 24,
 'classic': 25,
 'inspector': 26,
 'welcome': 27,
 'expect': 28,
 'adult': 29,
 'fetched': 30,
 'homelessness': 31,
 'houselessness': 32,
 'george': 33,
 'carlin': 34,
 'stated': 35,
 'street': 36,
 'considered': 37,
 'everything': 38,
 'matter': 39,
 'people': 40,
 'homeless': 41,
 'worrying': 42,
 'thing': 43,
 'racism': 44,
 'pressuring': 45,
 'succeed': 46,
 'technology': 47,
 'election': 48,
 'inflation': 49,
 'streets.but': 50,
 'without': 51,
 'luxury': 52,
 'entertainment': 53,
 'bathroom': 54,
 'picture': 55,
 'computer': 56,
 'treasure': 57,
 'goddard': 58,
 'lesson.mel': 

In [3]:
# Pack the data into (x, y) pairs: x is the review's file path and y is 1 for
# a positive review or 0 for a negative one.
# x stays a path so that the reviews need not all be read at once; when memory
# allows, loading everything up front reduces I/O time during training.
# The label is taken from the name of the file's parent directory: a substring
# test over the whole path would mislabel negative reviews whenever any other
# path component happens to contain "pos".
review_pairs = [
    (fi, 1 if os.path.basename(os.path.dirname(fi)) == 'pos' else 0)
    for fi in file_list
]
print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('aclImdb_v1/aclImdb_v1/aclImdb/train/pos/0_9.txt', 1), ('aclImdb_v1/aclImdb_v1/aclImdb/train/pos/10000_8.txt', 1)]
Total reviews: 25000


### 建立Dataset與DataLoader讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量BoW的函式
(generate_bow)

In [4]:
def load_review(review_path):
    """Read one review file and return its cleaned list of tokens.

    Only the first line of the file is used. Markup tags are removed and the
    text is lower-cased and tokenized; tokens of five characters or fewer are
    dropped, the rest are lemmatized, and lemmas that are stopwords or that
    contain a digit (for example "3rd") are filtered out.
    """
    with open(review_path, encoding='utf8') as file_input:
        first_line = file_input.readlines()[0]

    words = nltk.tokenize.word_tokenize(rm_tags(first_line).lower())
    lemmas = [wordnet_lemmatizer.lemmatize(w) for w in words if len(w) > 5]
    return [
        lemma
        for lemma in lemmas
        if lemma not in stoplist and not any(ch.isdigit() for ch in lemma)
    ]

In [5]:
def generate_bow(review, vocab_dic):
    """Build a bag-of-words count vector for one tokenized review.

    Parameters
    ----------
    review: iterable of str
        Tokens of the review.
    vocab_dic: dict
        Mapping from word to its position in the vector.

    Returns
    -------
    numpy.ndarray
        Vector of length ``len(vocab_dic)`` whose entry at a word's position
        is the number of times that word occurs in ``review``. Words missing
        from ``vocab_dic`` are ignored.
    """
    bag_vector = np.zeros(len(vocab_dic))
    for word in review:
        index = vocab_dic.get(word)
        # Compare against None explicitly: position 0 is a valid index but is
        # falsy, so a plain truthiness test would never count that word.
        if index is not None:
            bag_vector[index] += 1

    return bag_vector

In [6]:
class dataset(Dataset):
    '''Custom dataset serving (feature vector, label) samples as tensors.

    Parameters
    ----------
    data_dirs: list
        Sequence of (x, y) pairs, where y is the label. When ``vocab`` is
        omitted, x is a ready-made feature vector. When ``vocab`` is given,
        x is the path of a review file that is read and vectorized each time
        the sample is accessed.
    vocab: dict, optional
        Word-to-position mapping passed to ``generate_bow``. Leave it as
        ``None`` when the pairs already contain feature vectors.
    '''
    def __init__(self, data_dirs, vocab=None):
        # Split the pairs into parallel lists so that len() and indexing
        # address individual samples.
        self.x = [pair[0] for pair in data_dirs]
        self.y = [pair[1] for pair in data_dirs]
        self.vocab = vocab

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        features = self.x[idx]
        if self.vocab is not None:
            # Lazy mode: the stored item is a file path, so load and
            # vectorize the review only now.
            features = generate_bow(load_review(features), self.vocab)

        x = torch.tensor(features)
        y = torch.tensor(self.y[idx])

        return x, y

In [None]:
# Build the custom dataset: vectorize every review up front and pair each
# vector with its label.
# NOTE(review): this keeps one vector of len(vocab_dic) entries per review in
# memory at the same time.
review_list = [
    (generate_bow(load_review(path), vocab_dic), label)
    for path, label in review_pairs
]

custom_dst = dataset(review_list)
custom_dst[10]

In [None]:
# Wrap the dataset in a DataLoader that serves shuffled batches of five samples.
custom_dataloader = DataLoader(dataset=custom_dst, shuffle=True, batch_size=5)
next(iter(custom_dataloader))  # pull a single batch to inspect it