In [15]:
import pandas as pd
import tiktoken

In [21]:
# Load reviews.csv into a DataFrame and preview its first five rows.
df = pd.read_csv('reviews.csv')
preview = df.head(5)
print(preview)

                review  sentiment
0        期待期待，上映一定要去看。          1
1              真的很萌啊哈哈          1
2       衣服好华丽啊!王子能再帅点嘛          1
3  为啥结局要那样!!!为啥为啥为啥!!!          1
4               表示看不懂~          0


In [3]:
# Character-length statistics of the raw reviews: mean, longest and shortest.
reviews = df['review']
lengths = list(map(len, reviews))
for label, value in (
    ('Average length:', sum(lengths) / len(lengths)),
    ('Max length:', max(lengths)),
    ('Min length:', min(lengths)),
):
    print(label, value)

Average length: 21.9052
Max length: 171
Min length: 1


In [6]:
# Look at the distribution of the sentiment labels.
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

1    4305
0     695
Name: sentiment, dtype: int64


In [None]:
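# Before padding there are roughly 109,526 tokens in total.
# NOTE(review): 109,526 also equals 5,000 reviews x the 21.9052 average character
# length printed earlier, so this figure may be a character count — confirm.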

In [5]:
# Write the reviews to reviews.txt, one per line, each prefixed with a prompt:
#   sentiment == 1 -> "好评：" is written before the review
#   anything else  -> "差评：" is written before the review
# The encoding is pinned to UTF-8 because the text is Chinese; without it,
# open() falls back to the platform default codec, which may fail to encode
# these characters or produce a file other tools cannot read.
# zip() pairs each review with its label by position, so the loop no longer
# relies on df having a default 0..n-1 index for the label lookup.
with open('reviews.txt', 'w', encoding='utf-8') as f:
    for review, sentiment in zip(reviews, df['sentiment']):
        prefix = '好评：' if sentiment == 1 else '差评：'
        f.write(prefix + review + '\n')

In [22]:
# Try tokenizing the reviews with the transformers library.
from transformers import AutoTokenizer
# Tokenizer checkpoint location, given relative to the working directory.
token_ckpt = "./tokenizer"
# Load the tokenizer from that checkpoint; the functions below use this global.
tokenizer = AutoTokenizer.from_pretrained(token_ckpt)

def tokenize_text(sequence):
    """Encode ``sequence`` with the module-level ``tokenizer``.

    The tokenizer is called with ``padding=True``, ``truncation=True`` and
    ``max_length=256``; its return value is passed back unchanged.
    """
    options = {"padding": True, "truncation": True, "max_length": 256}
    return tokenizer(sequence, **options)

# Add a prompt before each review based on its sentiment.
def add_prompt(row):
    """Return ``row["review"]`` with a sentiment prompt prepended.

    ``row`` must support lookup by the keys "sentiment" and "review".
    A sentiment equal to 1 selects the "好评：" prefix; any other value
    selects "差评：".
    """
    prompt = "好评：" if row["sentiment"] == 1 else "差评："
    return prompt + row["review"]

# Overwrite the review column with the prompt-prefixed text.
# NOTE(review): this rewrites df in place, so running the cell a second time
# without reloading df prepends the prompt again.
df["review"] = df.apply(add_prompt, axis="columns")

# Tokenize each review on its own and gather the encodings, one row per review.
tok = df["review"].map(tokenize_text)
tok_df = pd.DataFrame(tok.tolist())

def padding(list, target_len=20, pad_value=0):
    """Right-pad a list in place to a minimum length and return it.

    Args:
        list: The list to pad; it is mutated in place. The name shadows the
            builtin ``list`` inside this function and is kept only so that
            existing keyword callers keep working.
        target_len: Minimum length of the result. Lists already at least
            this long are left untouched. Defaults to 20.
        pad_value: Value appended to reach ``target_len``. Defaults to 0.

    Returns:
        The same list object that was passed in.
    """
    missing = target_len - len(list)
    if missing > 0:
        list.extend([pad_value] * missing)
    return list

# Apply `padding` element-wise so every list stored in tok_df is padded.
# NOTE(review): `padding` appends 0s in every column, including input_ids —
# presumably 0 is the tokenizer's pad token id; confirm.
tok_df = tok_df.applymap(padding)



In [23]:
from statistics import mean, stdev

# Preview the tokenized frame.
print(tok_df.head())

# Length statistics over the lists stored in the input_ids column.
_len = list(map(len, tok_df['input_ids']))
avg_len, std_len = mean(_len), stdev(_len)
min_len = min(_len)
max_len = max(_len)

rule = '-' * 10
print(rule + ' Corpus statistics ' + rule)
print(f'\nAvg. length: {avg_len:.1f} (std. {std_len:.1f})')
print('Min. length:', min_len)
print('Max. length:', max_len)

                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1               [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
4                  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]   

                                           input_ids  \
0  [101, 1962, 6397, 8038, 3309, 2521, 3309, 2521...   
1  [101, 1962, 6397, 8038, 4696, 4638, 2523, 5846...   
2  [101, 1962, 6397, 8038, 6132, 3302, 1962, 1290...   
3  [101, 1962, 6397, 8038, 711, 1567, 5310, 2229,...   
4  [101, 2345, 6397, 8038, 6134, 4850, 4692, 679,...   

                                      token_type_ids  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [24]:
# Decode the first review's token ids back into text as a sanity check.
first_ids = tok_df['input_ids'][0]
print(tokenizer.decode(first_ids))

[CLS] 好 评 ： 期 待 期 待 ， 上 映 一 定 要 去 看 。 [SEP]
