In [None]:
# !wget -O en-zh.tsv "https://tatoeba.org/zh-cn/exports/download/48735/%E6%9C%89%E8%8B%B1%E8%AF%AD-%E4%B8%AD%E6%96%87%E6%99%AE%E9%80%9A%E8%AF%9D%E5%AF%B9%E5%BA%94%E5%8F%A5%20-%202024-06-11.tsv"

In [None]:
import torch
import numpy as np
import pandas as pd

#将繁体转成简体
!pip install zhconv
from zhconv import convert

def Q2B(uchar):
  """Convert a single full-width digit to its half-width (ASCII) digit.

  Only the full-width digits U+FF10..U+FF19 are converted. Every other
  input (full-width letters and punctuation, the ideographic space U+3000,
  ordinary ASCII, the empty string) is returned unchanged.

  Args:
    uchar: a one-character string.

  Returns:
    The matching ASCII digit '0'..'9' when ``uchar`` is a full-width digit,
    otherwise ``uchar`` itself.
  """
  if u'\uff10' <= uchar <= u'\uff19':
    # Full-width forms sit exactly 0xFEE0 above their ASCII counterparts, so
    # for this digit range the result is always within '0'..'9'.
    return chr(ord(uchar) - 0xfee0)
  return uchar

def stringpartQ2B(ustring):
  """Apply Q2B to every character of ``ustring`` and return the rejoined string."""
  return "".join(map(Q2B, ustring))


def convertSimple(x):
  """Normalise the first value of a row.

  ``x`` is a row whose first element is the Chinese sentence. The text is
  passed through zhconv's ``convert`` with the 'zh-cn' target (Traditional to
  Simplified) and then through ``stringpartQ2B``.
  """
  raw_text = x.values[0]
  simplified_text = convert(raw_text, 'zh-cn')
  return stringpartQ2B(simplified_text)


# Load the sentence pairs. The file has no header row, so the four columns are
# named here; malformed lines are skipped instead of aborting the load.
all_data = pd.read_csv(
  'en-zh.tsv',
  sep='\t',
  on_bad_lines='skip',
  names=['NO.1', 'en', 'NO.2', 'zh'],
)

# Normalise the Chinese column row by row. convertSimple reads the first value
# of the row it is given, hence the single-column frame combined with axis=1.
# rename() is called WITHOUT inplace=True: with inplace the documented return
# value is None, so a chained assignment must not depend on it.
zh_data = all_data.iloc[:, [3]].apply(convertSimple, axis=1).rename('zhs')


def _to_lower(text):
  """Return ``text`` lower-cased."""
  return text.lower()


# Keep only the English sentences and the normalised Chinese sentences, both
# lower-cased (Latin text embedded in the Chinese sentences is lower-cased too).
# Series.map is used per column because DataFrame.applymap is deprecated.
all_data = pd.concat(
  [all_data.iloc[:, 1].map(_to_lower), zh_data.map(_to_lower)],
  axis=1,
)

print(type(all_data))
print(type(all_data.values))
all_data

In [None]:
!pip install nltk #英文分词
!pip install jieba #中文分词

import nltk
# Download the 'punkt' resource at run time (presumably the tokenizer data
# that word_tokenize relies on -- confirm against the NLTK version in use).
nltk.download('punkt')
# Sample English tokenization. The result is not assigned and this is not the
# last expression of the cell, so nothing is displayed for it.
nltk.word_tokenize("today is june 18th and it is muiriel's birthday!")

import jieba
# Sample Chinese segmentation; likewise evaluated and discarded.
jieba.lcut("今天是6月18号，也是muiriel的生日！")

# Tokenize every sentence: NLTK for the English column, jieba for the Chinese
# column. Series.map is used per column because DataFrame.applymap is
# deprecated in recent pandas releases.
en_list = all_data.iloc[:, 0].map(nltk.word_tokenize)
zhs_list = all_data.iloc[:, 1].map(jieba.lcut)


def _add_sentence_markers(tokens):
  """Return ``tokens`` wrapped in the start/end markers <BOF> and <EOF>."""
  return ['<BOF>'] + tokens + ['<EOF>']


# Add the two special symbols <BOF> / <EOF> marking the beginning and the end
# of each sentence. en_list / zhs_list themselves stay unwrapped.
# NOTE: this cell rebinds all_data from strings to token lists, so it must not
# be re-run without first re-running the loading cell.
all_data = pd.concat(
  [en_list.map(_add_sentence_markers), zhs_list.map(_add_sentence_markers)],
  axis=1,
)

all_data

统计单词的频率，降序排列，用单词的下标作为单词的id

In [6]:
from collections import Counter

# Upper bound on the number of distinct tokens kept per language (most
# frequent first); ids 0 and 1 are reserved on top of that.
MAX_VOCAB_SIZE = 50000


def _build_vocab(freq_table, max_size=MAX_VOCAB_SIZE):
  """Build token->id and id->token dictionaries from a frequency Counter.

  Id 0 is reserved for <PAD> (padding placeholder) and id 1 for <UNK>
  (unknown token); the ``max_size`` most frequent tokens receive ids 2, 3, ...
  in descending frequency order.

  Args:
    freq_table: a collections.Counter mapping token -> occurrence count.
    max_size: maximum number of tokens taken from ``freq_table``.

  Returns:
    A ``(token_to_id, id_to_token)`` tuple of dicts.
  """
  token_to_id = {"<PAD>": 0, "<UNK>": 1}
  id_to_token = {0: '<PAD>', 1: '<UNK>'}
  for idx, (token, _count) in enumerate(freq_table.most_common(max_size), start=2):
    token_to_id[token] = idx
    id_to_token[idx] = token
  return token_to_id, id_to_token


# Count token frequencies; column 0 holds the English token lists and
# column 1 the Chinese token lists.
en_table = Counter()
zhs_table = Counter()

for row in all_data.values:
  en_table.update(row[0])
  zhs_table.update(row[1])

# The position in the frequency ranking is used as the token id; ids start at
# 2 because 0 is <PAD> and 1 is <UNK>.
en_to_id, id_to_en = _build_vocab(en_table)
zhs_to_id, id_to_zhs = _build_vocab(zhs_table)

print(list(en_to_id.items())[:20])
print(list(id_to_en.items())[:20])
print("=========")

print(list(zhs_to_id.items())[:20])
print(list(id_to_zhs.items())[:20])

en_id_len = len(en_to_id)
zhs_id_len = len(zhs_to_id)
print("=========")
print('英文词典长度', en_id_len)
print('中文词典长度', zhs_id_len)

[('<PAD>', 0), ('<UNK>', 1), ('<BOF>', 2), ('<EOF>', 3), ('.', 4), ('the', 5), ('i', 6), ('to', 7), ('you', 8), ('is', 9), ('a', 10), ('?', 11), (',', 12), ('he', 13), ("n't", 14), ('in', 15), ('do', 16), ('of', 17), ('it', 18), ("'s", 19)]
[(0, '<PAD>'), (1, '<UNK>'), (2, '<BOF>'), (3, '<EOF>'), (4, '.'), (5, 'the'), (6, 'i'), (7, 'to'), (8, 'you'), (9, 'is'), (10, 'a'), (11, '?'), (12, ','), (13, 'he'), (14, "n't"), (15, 'in'), (16, 'do'), (17, 'of'), (18, 'it'), (19, "'s")]
[('<PAD>', 0), ('<UNK>', 1), ('<BOF>', 2), ('<EOF>', 3), ('。', 4), ('我', 5), ('的', 6), ('了', 7), ('你', 8), ('他', 9), ('，', 10), ('？', 11), ('是', 12), ('在', 13), ('她', 14), ('汤姆', 15), ('吗', 16), ('我们', 17), ('不', 18), ('很', 19)]
[(0, '<PAD>'), (1, '<UNK>'), (2, '<BOF>'), (3, '<EOF>'), (4, '。'), (5, '我'), (6, '的'), (7, '了'), (8, '你'), (9, '他'), (10, '，'), (11, '？'), (12, '是'), (13, '在'), (14, '她'), (15, '汤姆'), (16, '吗'), (17, '我们'), (18, '不'), (19, '很')]
英文词典长度 14738
中文词典长度 25706


In [7]:
# Convert every token list of the translation data into its id representation.
# Tokens missing from the vocabulary map to the <UNK> id (not to <PAD>).
# Series.map is used per column because DataFrame.applymap is deprecated.
# NOTE: this cell rebinds all_data from token lists to id lists; re-running it
# on its own output would turn every id into <UNK>, so re-run the tokenization
# cell first.
en_unk_id = en_to_id['<UNK>']
zhs_unk_id = zhs_to_id['<UNK>']

all_en_id = all_data.iloc[:, 0].map(lambda tokens: [en_to_id.get(word, en_unk_id) for word in tokens])
all_zhs_id = all_data.iloc[:, 1].map(lambda tokens: [zhs_to_id.get(word, zhs_unk_id) for word in tokens])
all_data = pd.concat([all_en_id, all_zhs_id], axis=1)
all_data

  all_en_id = all_data.iloc[:,[0]].applymap(lambda x: [en_to_id.get(word, 0) for word in x]) # PAD=0
  all_zhs_id = all_data.iloc[:,[1]].applymap(lambda x: [zhs_to_id.get(word, 0) for word in x]) # PAD=0


Unnamed: 0,en,zhs
0,"[2, 6, 24, 7, 50, 7, 337, 4, 3]","[2, 5, 160, 22, 244, 7, 4, 3]"
1,"[2, 119, 9, 1703, 5554, 31, 18, 9, 2983, 19, 5...","[2, 76, 12, 1487, 377, 2541, 1027, 10, 48, 12,..."
2,"[2, 2983, 9, 1287, 95, 4, 3]","[2, 3548, 66, 1537, 580, 7, 4, 3]"
3,"[2, 5, 2172, 9, 135, 2983, 137, 4, 3]","[2, 2176, 12, 307, 3548, 307, 4, 3]"
4,"[2, 6, 48, 38, 140, 211, 4, 3]","[2, 5, 253, 39, 28, 166, 4, 3]"
...,...,...
68673,"[2, 6, 42, 147, 29, 1150, 21, 23, 167, 19, 932...","[2, 276, 445, 123, 25, 5, 1591, 6, 2871, 8913,..."
68674,"[2, 8, 42, 202, 22, 111, 4, 3]","[2, 8, 51, 73, 9208, 4, 3]"
68675,"[2, 8, 160, 202, 22, 111, 4, 3]","[2, 8, 51, 73, 9208, 4, 3]"
68676,"[2, 26, 363, 64, 5, 799, 12, 75, 26, 66, 14, 5...","[2, 14, 118, 1508, 10, 565, 132, 70, 225, 4, 3]"
