In [1]:
"""
从完整数据集中选择一部分作为子集，分割train，val，test
"""
import numpy as np
import collections
import pandas as pd
import re

from argparse import Namespace

args = Namespace(
    raw_train_dataset_csv="data/yelp/raw_train.csv",
    raw_test_dataset_csv="data/yelp/raw_test.csv",
    proportion_subset_of_train=0.1,# 子集所占比例
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="data/yelp/reviews_with_splits_lite.csv",
    seed=1337
)

#读取train数据集
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])
#rating 和 review分别为读取的两列内容创建的名称

by_rating = collections.defaultdict(list) #dict类,value类型是list
for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())
    # row.to_dict()将其转化为字典｛rating：1， review：xxx｝

review_subset = [] # TODO 子集的作用？
for _, item_list in sorted(by_rating.items()): #两次for循环，第一次读取#rating为1的评论个数
    n_total = len(item_list)
    n_subset = int(args.proportion_subset_of_train * n_total)
    review_subset.extend(item_list[:n_subset])
review_subset = pd.DataFrame(review_subset)


# 划分train，val，test。
# 随机打乱，前0:n_train个训练，[n_train:n_train+n_val]做val集,[n_train+n_val:n_train+n_val+n_test]做test
by_rating = collections.defaultdict(list)
for _,row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())

final_list =[]
np.random.seed(args.seed)
for _,item_list in sorted(by_rating.items()):
    np.random.shuffle(item_list)
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)

    for item in item_list[:n_train]:#给每个数据后加入train/val/test
        item['split'] = 'train' #'rating': 1, 'review': 'xxx.', 'split': 'train'
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'
    final_list.extend(item_list)
final_reviews = pd.DataFrame(final_list)
print(final_reviews.split.value_counts()) #显示train，val，test数目

# Preprocess the reviews
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text) #在标点符号两边加空格
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)#将不是字母和标点的字符用空格替换
    return text

final_reviews.review = final_reviews.review.apply(preprocess_text)
final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)
#将rating的1 2替换为neg和pos

final_reviews.to_csv(args.output_munged_csv, index=False)#保存文件


train    39200
val       8400
test      8400
Name: split, dtype: int64


In [2]:
final_reviews.head()#展示前几条数据

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train


In [4]:
"""分类模型的构建和数据集处理"""
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

# Data Vectorization classes

## The Vocabulary 

In [5]:
class Vocabulary(object):
    """word<->indexs"""

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary unk指的是不存在于字典中的字符
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token

        self.unk_index = -1
        # -1表示UNK未添加到Vocabulary
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        # 增加token，更新token_to_idx,idx_to_token
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        # 返回token的index,若token不存在，返回UNK index
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)  # dict.get()
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def to_serializable(self): # 用于保存Vocabulary
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    def __str__(self): # TODO:
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)


    @classmethod
    def from_serializable(cls, contents):
        """
        instantiates the Vocabulary from a serialized dictionary
        :param contents:
        :return:
        """
        return cls(**contents)
        #**表示传入的参数为dict形式


## The Vectorizer

In [6]:

class ReviewVectorizer(object):
    """ 将word转为向量"""
    def __init__(self, review_vocab, rating_vocab):
        """
        Args:
            review_vocab (Vocabulary): maps words to integers
            rating_vocab (Vocabulary): maps class labels to integers
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review):
        """return a collapsed one-hot vector representation for this review
        Args:
            review (str): the review
        Returns:
            one_hot (np.ndarray): the collapsed one-hot encoding
        """
        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)
        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, review_df, cutoff= 25):
        #将出现次数ctoff以上的word，添加到V中
        """
        :param review_df: (pandas.DataFrame格式) the review dataset
        :param cutoff: word出现次数的限制，比如选出现20次以上的单词
        :return:ReivewVectorizer的对象
        """
        review_vocab = Vocabulary(add_unk=True)#创建Vocabulary对象
        rating_vocab = Vocabulary(add_unk=False)

        #加入rating的token，即postive和negtive
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        #统计review的每个word出现的次数,将>cutoff的加入token
        word_counts = Counter()# ｛word：count｝
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count> cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab,rating_vocab)

    @classmethod
    def from_serializable(cls,contents):
        """从一个序列化字典中实例化ReviewVectorizer
        Instantiate a ReviewVectorizer from a serializable dictionary
        :param contents(dict):
        :return:  an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])
        # contents是sict，用于实例化Vocabulary
        return cls(review_vocab=review_vocab,rating_vocab=rating_vocab)

    def to_serializable(self):
        """
        Create the serializable dictionary for caching
        :return: contents (dict): the serializable dictionary
        """
        return {'review_vocab': self.review_vocab.to_serializable(),
                'rating_vocab': self.rating_vocab.to_serializable()}


## The Dataset

In [7]:

"""
继承自Dataset类，需重写__len__ 使得len(dataset)返回数据集的大小；
和 __getitem__ 使得dataset[i]能够返回第i个数据样本这样的下标操作。
"""

class ReviewDataset(Dataset):  # 继承自Dataset类
    def __init__(self, review_df, vectorizer):
        """
        :param review_df: (pandas.DataFrame), the dataset
        :param vectorizer: (ReviewVectorizer), vectorizer instantiated from dataset
        """
        self.review_df = review_df  # review 数据集
        self._vectorizer = vectorizer  # ReviewVectorizer类的对象

        self.train_df = self.review_df[self.review_df.split == 'train']
        self.train_size = len(self.train_df)
        self.val_df = self.review_df[self.review_df.split == 'val']
        self.validation_size = len(self.val_df)
        self.test_df = self.review_df[self.review_df.split == 'test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train') #选择train，val还是test

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """
        the primary entry point method for PyTorch datasets
        :param index:  the index to the data point
        :return: a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self.target_df.iloc[index]
        review_vector = self._vectorizer.vectorize(row.review)
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.rating)

        return {'x_data': review_vector, 'y_target':rating_index}

    def get_num_batches(self,batch_size):
        return len(self) // batch_size #//向下取整

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        """
        Load train dataset and make a new vectorizer from scratch
        :param review_csv: location of dataset
        :return:an instance of ReviewDataset
        """
        review_df = pd.read_csv(review_csv)
        train_review_df = review_df[review_df.split=='train']
        return cls(review_df=review_df, vectorizer=ReviewVectorizer.from_dataframe(train_review_df,args.frequency_cutoff) )

    @classmethod
    def load_dataset_and_load_vectorizer(cls,review_csv,vectorizer_filepath):
        """
        Load dataset and the corresponding vectorizer.
        Used in the case in the vectorizer has been cached for re-use
        :param review_csv(str): location of the dataset
        :param vectorizer_filepath(str):location of the saved vectorizer
        :return: an instance of ReviewDataset
        """
        review_df = pd.read_csv(review_csv)
        vectorizer = cls.load_vecotrizer_only(vectorizer_filepath)
        return cls(review_df, vectorizer)

    @staticmethod
    def load_vecotrizer_only(vectorizer_filepath):
        """
        a static method for loading the vectorizer from file
        :param vectorizer_filepath(str):
        :return: an instance of ReviewVectorizer
        """
        with open(vectorizer_filepath) as fp:
            return ReviewVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)
    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    

## The Model: ReviewClassifier

In [8]:

"""
应用sigmoid再使用BceLoss(torch.nn.BCELoss())可能存在数值稳定性问题。
针对此问题，Pytorch提供BCEWithLogitsLoss()，要使用此函数，输出不能使用sigmoid函数
"""
class ReviewClassifier(nn.Module):
    def __init__(self, num_features):
        """
        :param num_features: the size of the input feature vector
        """
        super(ReviewClassifier, self).__init__()
        self.fcl = nn.Linear(in_features=num_features, out_features=1)
        #使用感知器线性模型Wx+b
        # TODO: in_feature是dim？即V的大小，onehot的长度

    def forward(self, x_in, apply_sigmoid = False):
        """The forward pass of the classifier
        Args:
            x_in (torch.Tensor): an input data tensor.
                x_in.shape should be (batch, num_features)
            apply_sigmoid (bool): a flag for the sigmoid activation
                should be false if used with the Cross Entropy losses
        Returns: the resulting tensor. tensor.shape should be (batch,)
        """
        y_out = self.fcl(x_in).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out


# Training Routine

## helpfer funcitons

In [9]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
        """
        """
        dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                                shuffle=shuffle, drop_last=drop_last)
        for data_dict in dataloader:
            #data_dict为｛x_data:batch_size个样本的collapsed one hot表示，y_target：batch_size个样本的label｝
            out_data_dict = {}
            for name, tensor in data_dict.items():
                #name是x_data,tensor是batch_size个样本的collapsed one hot表示
                #name是y_target，tensor是batch_size个样本的label(即0和1)
                out_data_dict[name] = data_dict[name].to(device)
            yield out_data_dict
        #TODO: for循环的作用？

def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,  #效果变差已经持续的epoch数
            'early_stopping_best_val': 1e8, #目前最好的loss
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.
    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better
    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns: a new train_state
    """
    # Save one model at least 保存初始模型
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False
    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]
        #选择最近两次的epoch结果
        # If loss worsened 效果变差
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        else: # Loss decreased 效果变好
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
            # Reset early stopping step
            train_state['early_stopping_step'] = 0
        # stop early?
        train_state['stop_early']=train_state['early_stopping_step']>= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    #sigmoid（ypred）>0.5的为1类，<0.5为0类，再统计与target有相同label的
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    #item()指的是将tensor（[10]）转化为标量10
    return n_correct/len(y_pred_indices)*100


def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)



## Settings and some prep work

In [10]:

args = Namespace(
    # Data and Path information
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv='data/yelp/reviews_with_splits_lite.csv',
    save_dir='model_storage/ch3/yelp/',  # 不需要提前创建，handle_dirs()会创建
    vectorizer_file='vectorizer.json',
    # No Model hyper parameters
    # Training hyper parameters
    batch_size=128,
    early_stopping_criteria=5,#可以忍受的效果变差持续的epoch数
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    catch_keyboard_interrupt=True,  # TODO: ?
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,  # 是否从文件中加载Vectorizer
)
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
# check cuda
if not torch.cuda.is_available():
    args.cuda = False
print("Using CUDA: {}".format(args.cuda))
args.device = torch.device("cuda" if args.cuda else "cpu")

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)


Expanded filepaths: 
	model_storage/ch3/yelp/vectorizer.json
	model_storage/ch3/yelp/model.pth
Using CUDA: True


## Intialization

In [11]:
# dataset and  vectorizer
if args.reload_from_files:
    print("Loading dataset and vectorizer")
    dataset = ReviewDataset.load_dataset_and_load_vectorizer(args.review_csv, args.vectorizer_file)
else:
    print("Loading dataset and creating vectorizer")
    dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# model
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))


Loading dataset and creating vectorizer


## Training loop

In [12]:
classifier = classifier.to(args.device)

# loss and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)
# 调整学习率

train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='trainging routine', total=args.num_epochs, position=0)
dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train', total=dataset.get_num_batches(args.batch_size),
                          position=1, leave=True)
dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val', total=dataset.get_num_batches(args.batch_size),
                        position=1, leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index
        print("epoch = ",epoch_index)

        # 在train集迭代
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.0 #每个batch的平均loss
        running_acc = 0.0#每个batch的平均acc
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. zero the gradients
            optimizer.zero_grad()
            # step 2. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'].float())
            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'].float())
            # TODO: xdata和ytarget是什么数据？
            #x_data不应该是collapsed one-hot吗，为什么没用到vectorizer?
            #答：ReviewDataset的__getitem__函数，将DataFrame格式的vectorize了
            #ytarget是每个样本的label(0,1),pytorch会自动转化为one-hot表示
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)
            # step 4. use loss to produce gradients
            loss.backward()
            # step 5. use optimizer to take gradient step
            optimizer.step()

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t -running_acc) / (batch_index + 1)

             # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # 在val集迭代
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 2. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'].float())
            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'].float())
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier, train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

except KeyboardInterrupt:
    print("Exiting loop")


HBox(children=(IntProgress(value=0, description='trainging routine', style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='split=train', max=306, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='split=val', max=65, style=ProgressStyle(description_width='in…

epoch =  0
epoch =  1
epoch =  2
epoch =  3
epoch =  4
epoch =  5
epoch =  6
epoch =  7
epoch =  8
epoch =  9
epoch =  10
epoch =  11
epoch =  12
epoch =  13
epoch =  14
epoch =  15
epoch =  16
epoch =  17
epoch =  18
epoch =  19
epoch =  20
epoch =  21
epoch =  22
epoch =  23
epoch =  24
epoch =  25
epoch =  26
epoch =  27
epoch =  28
epoch =  29
epoch =  30
epoch =  31
epoch =  32
epoch =  33
epoch =  34
epoch =  35
epoch =  36
epoch =  37
epoch =  38
epoch =  39
epoch =  40
epoch =  41
epoch =  42
epoch =  43
epoch =  44
epoch =  45
epoch =  46
epoch =  47
epoch =  48
epoch =  49
epoch =  50
epoch =  51
epoch =  52
epoch =  53
epoch =  54
epoch =  55
epoch =  56
epoch =  57
epoch =  58
epoch =  59
epoch =  60
epoch =  61
epoch =  62
epoch =  63
epoch =  64
epoch =  65
epoch =  66
epoch =  67
epoch =  68
epoch =  69
epoch =  70
epoch =  71
epoch =  72
epoch =  73
epoch =  74
epoch =  75
epoch =  76
epoch =  77
epoch =  78
epoch =  79
epoch =  80
epoch =  81
epoch =  82
epoch =  83
ep

# Test

In [13]:
classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)

dataset.set_split('test')
batch_generator = generate_batches(dataset,
                                   batch_size=args.batch_size,
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # step 2. compute the output
    y_pred = classifier(x_in=batch_dict['x_data'].float())
    # step 3. compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))


Test loss: 0.137
Test Accuracy: 95.11


## Interface rating of a review

In [16]:

"""Ingerence rating"""
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

def predict_rating(review, classifier, vectorizer, decision_threshold=0.5):
    """
    Predict the rating of a review
    :param review(str): the text of the review
    :param classifier(ReviewClassifier): the trained model
    :param vectorizer(ReviewVectorizer): the corresponding vectorizer
    :param decision_threshold(float): The numerical boundary which separates the rating classes
    :return:
    """
    review = preprocess_text(review)
    vectorized_review = torch.tensor(vectorizer.vectorize(review))
    result = classifier(vectorized_review.view(1, -1))#reshape成1行n列
    probability_value = torch.sigmoid(result).item() #计算sigmoid
    index = 1
    if probability_value < decision_threshold:
        index = 0
    return vectorizer.rating_vocab.lookup_index(index)

In [17]:
test_review = "this is a pretty awesome book"

classifier = classifier.cpu()
prediction = predict_rating(test_review, classifier, vectorizer, decision_threshold=0.5)
print("{} -> {}".format(test_review, prediction))

this is a pretty awesome book -> positive


In [20]:
classifier.fcl.weight.shape #7326是V的大小

torch.Size([1, 7326])

In [26]:
# Sort weights
fcl_weights = classifier.fcl.weight.detach()[0]
i, indices = torch.sort(fcl_weights, dim=0, descending=True)
# torch.sort() 返回排序后元素和其初始index
print(i)
print(indices)

indices = indices.numpy().tolist()

# Top 10 words
print("Influential words in Positive Reviews: Top 10")
print("--------------------------------------")
for i in range(10):
    print(vectorizer.review_vocab.lookup_index(indices[i]))
    
print("====\n\n\n")

# Top 10 negative words
print("Influential words in Negative Reviews: Top 10")
print("--------------------------------------")
indices.reverse()
for i in range(10):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

tensor([ 1.6036,  1.4505,  1.4117,  ..., -1.7685, -1.8328, -2.0215])
tensor([2406, 1098, 7237,  ...,  890,  186,  408])
Influential words in Positive Reviews: Top 10
--------------------------------------
delicious
fantastic
pleasantly
amazing
great
vegas
yum
excellent
perfect
awesome
====



Influential words in Negative Reviews: Top 10
--------------------------------------
worst
mediocre
bland
horrible
meh
awful
rude
terrible
tasteless
overpriced
