In [66]:
import nltk
import pandas as pd
import re

import torch
import torch.utils
import torch.utils.data

from wordExtractor import wordExtractor
from dictConfig import replaceDict

# 小工具

In [None]:
import time
def log(*k, **kw):
    """Print a message prefixed with a local ``[yy-mm-dd HH:MM:SS] `` timestamp.

    Takes the same positional and keyword arguments as the built-in
    ``print``. The timestamp is written to the same stream as the message
    (the ``file`` keyword), so a redirected log line keeps its prefix instead
    of leaking it to stdout.
    """
    timeStamp = time.strftime('[%y-%m-%d %H:%M:%S] ', time.localtime())
    # file=None makes print fall back to sys.stdout, matching the default call.
    print(timeStamp, end='', file=kw.get('file'))
    print(*k, **kw)
def debug(*k, stop=False, **kw):
    """Print a ``[DEBUG] ``-prefixed message and optionally abort.

    Args:
        *k, **kw: forwarded to the built-in ``print``.
        stop: when true, raise ``SystemExit`` after printing.

    The prefix is written to the same stream as the message (the ``file``
    keyword) so redirected output stays on one line in one place.
    """
    # file=None makes print fall back to sys.stdout, matching the default call.
    print('[DEBUG] ', end='', file=kw.get('file'))
    print(*k, **kw)
    if stop:
        raise SystemExit

In [None]:
def getPara(module, useString=True):
    """Count the elements of every tensor returned by ``module.parameters()``.

    Args:
        module: object exposing ``parameters()``.
        useString: when false the raw integer count is returned; otherwise a
            short string using binary units (``K`` = 2**10, ``M`` = 2**20).

    Returns:
        ``int`` or ``str`` depending on ``useString``.
    """
    count = 0
    for tensor in module.parameters():
        count += tensor.nelement()

    if not useString:
        return count

    # Largest unit first so that e.g. 2**20 is reported in M, not in K.
    for unit, suffix in ((2**20, 'M'), (2**10, 'K')):
        if count >= unit:
            return '{:.2f}{}'.format(count / unit, suffix)
    return str(count)
def savemodel(name, model):
    """Write ``model.state_dict()`` to the file ``<name>.pt`` via ``torch.save``."""
    weights = model.state_dict()
    checkpointPath = name + '.pt'
    torch.save(weights, checkpointPath)
def loadmodel(name, model):
    """Load the state dict stored in ``<name>.pt`` into ``model`` (in place).

    The checkpoint is first mapped to CPU so that a file saved from a GPU
    model can be restored on a machine without that device;
    ``load_state_dict`` then copies the values into the model's own
    parameters.

    NOTE: ``torch.load`` unpickles the file — only load checkpoints from a
    trusted source.
    """
    state = torch.load(name + '.pt', map_location='cpu')
    model.load_state_dict(state)

使用<>包夹表示特殊词，不作处理

# 文本处理

In [161]:
# Text emoticon -> special <...> placeholder token.
# processSentence lower-cases each key before matching it against the
# (already lower-cased) words, so ':D' is matched as ':d' and 'O.o' as 'o.o'.
# NOTE(review): 'O.o' and ':??' both map to '<emoji4>' — confirm whether ':??'
# was meant to get a token of its own.
emojiDict = {
    ':)': '<smile>',
    ':D': '<smile>',
    ':.(': '<cry>',
    '-.-': '<emoji1>',
    '-__-': '<emoji2>',
    '>.>': '<emoji3>',
    'O.o': '<emoji4>',
    ':??': '<emoji4>'
}
# Character replacements applied by processSentence right after lower-casing.
chineseDict = {
    '\u2019': "'",  # right single quotation mark -> ASCII apostrophe
    # NOTE(review): U+002C already is the ASCII comma, so this entry replaces
    # ',' with itself; if the full-width comma was intended the key would be
    # '\uff0c' — confirm against the data.
    '\u002c': ',',
}

In [4]:
# Project-local helper; processSentence calls extractor.expand_contractions()
# on the lower-cased text. replaceDict is registered as its replacement table.
extractor = wordExtractor()
extractor.addDict(replaceDict)

# NLTK stemmer and WordNet lemmatizer. `lemmatizer` is used by lemmatize();
# `stemmer` is only referenced from commented-out code in processSentence.
stemmer = nltk.stem.LancasterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# English stop words as a set; only referenced from commented-out code in
# processSentence. Presumably needs the NLTK 'stopwords' corpus to be
# downloaded beforehand — verify on a fresh environment.
stopSet = set(nltk.corpus.stopwords.words('english'))

In [5]:
def seperateSpecialElement(wordsTargetList):
    """Split every ``<...>`` placeholder out of the words that contain it.

    A word such as ``'emm<smile><smile>hmm'`` becomes the four entries
    ``'emm'``, ``'<smile>'``, ``'<smile>'``, ``'hmm'``; each piece inherits the
    target flag of the word it came from. Words without a placeholder are
    kept unchanged.

    Args:
        wordsTargetList: list of ``(word, isTarget)`` pairs.

    Returns:
        The same list object, updated in place (callers that relied on the
        in-place behaviour keep working).
    """
    separated = []
    for word, isTarget in wordsTargetList:
        # The capturing group makes re.split keep the placeholders themselves;
        # empty strings appear between adjacent placeholders and at the ends.
        pieces = [piece for piece in re.split(r'(<\w*>)', word) if piece]
        # `or [word]` keeps a word that produced no pieces (an empty string).
        separated.extend((piece, isTarget) for piece in pieces or [word])
    wordsTargetList[:] = separated
    return wordsTargetList

In [62]:
def removeFullStop(wordsTargetList):
    """Cut every word at ``. , ? !`` and drop the punctuation.

    Args:
        wordsTargetList: iterable of ``(word, isTarget)`` pairs.

    Returns:
        A new list of ``(fragment, isTarget)`` pairs; fragments that are empty
        after the cut are discarded, and each fragment keeps the flag of the
        word it came from.
    """
    cleaned = []
    for word, isTarget in wordsTargetList:
        for fragment in re.split('[.,?!]', word):
            if fragment:
                cleaned.append((fragment, isTarget))
    return cleaned

In [7]:
def lemmatize(word, pos):
    """Lemmatise ``word`` with the WordNet lemmatizer according to its POS tag.

    The tag is matched by prefix so that the whole Penn Treebank family is
    covered (``NN``/``NNS``/``NNP``..., ``VB``/``VBD``/``VBG``...,
    ``JJ``/``JJR``/``JJS``, ``RB``/``RBR``/``RBS``). The bare tag ``'R'`` is
    still accepted for adverbs for backward compatibility.

    Args:
        word: token to lemmatise.
        pos: part-of-speech tag of ``word``.

    Returns:
        The lemma, or ``word`` unchanged when the tag is not a noun, verb,
        adjective or adverb tag.
    """
    prefixToWordnet = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))
    for prefix, wordnetPos in prefixToWordnet:
        if pos.startswith(prefix):
            return lemmatizer.lemmatize(word, pos=wordnetPos)
    if pos == 'R':
        return lemmatizer.lemmatize(word, pos='r')
    return word

In [303]:
# Pre-processing pipeline ("?" = undecided, "x" = currently disabled):
#   lower-case
#   -> expand contractions
#   -> split on whitespace
#   -> tag @user names -> tag #topic
#   -> tag URLs
#   -> tag emoticons  :) :D :.(
#   -> mark the target words
#   -> drop ".," and break sentences
#   ? -> mark useless words
#   ? -> delete them
#   x -> stemming
#   x -> POS tagging -> lemmatisation
#   x -> stop-word removal

# Returns the processed tokens and the re-located target span.
def processSentence(text, target):
    """Normalise one sentence into tokens and re-locate its target span.

    Args:
        text: raw sentence.
        target: ``(first, last)`` inclusive word indices of the target. They
            are compared against the positions of the whitespace-separated
            words obtained *after* lower-casing and contraction expansion.
            NOTE(review): expansion can change the number of words (e.g.
            "don't" -> "do not"), so indices measured on the raw text may end
            up shifted — confirm against the data set.

    Returns:
        ``(tokens, (beg, end))`` where ``tokens`` is a tuple of strings ending
        with ``'<EOS>'`` and ``(beg, end)`` are the inclusive positions of the
        first run of target tokens in ``tokens`` (``(-1, -1)`` when no token
        is flagged).
    """
    # Lower-case, then normalise the characters listed in chineseDict.
    normalized = text.lower()
    for source, replacement in chineseDict.items():
        normalized = normalized.replace(source, replacement)
    # Expand contractions.
    expanded = extractor.expand_contractions(normalized)

    # Whitespace tokenisation followed by the ordered placeholder rules.
    tokens = expanded.split()
    taggingRules = (
        (r'@[\w_]*', '<user>'),          # @user names
        (r'#\w*', '<topic>'),            # #topics
        (r'http(s)?://.*', '<web>'),     # URLs
        (r'\.{3,}', '<ellipsis>'),       # ellipses
        (r'\d+\.?\d*', '<number>'),      # numbers
    )
    for pattern, placeholder in taggingRules:
        tokens = [re.sub(pattern, placeholder, token) for token in tokens]
    # Emoticons (keys are lower-cased because the text already is).
    for emoji, placeholder in emojiDict.items():
        tokens = [token.replace(emoji.lower(), placeholder) for token in tokens]

    # Flag the words that fall inside the target span.
    flagged = [(token, target[0] <= position <= target[1])
               for position, token in enumerate(tokens)]
    # Detach <...> placeholders glued to neighbouring text.
    flagged = seperateSpecialElement(flagged)
    # Drop ".,?!" and split the words at those characters.
    flagged = removeFullStop(flagged)

    # Disabled: POS tagging -> lemmatisation -> stemming
    # pureText, isTargets = zip(*flagged)
    # _, pos_tags = zip(*nltk.pos_tag(pureText))
    # oriText = [lemmatize(word, pos=pos) for word, pos in zip(pureText, pos_tags)]
    # oriText = [stemmer.stem(word) for word in oriText]
    # flagged = list(zip(oriText, isTargets))

    # Disabled: stop-word removal
    # flagged = [(word, isTarget) for word, isTarget in flagged if word not in stopSet]

    # Append the end-of-sentence marker (never part of the target).
    flagged.append(('<EOS>', False))
    finText, isTargets = zip(*flagged)

    # Recover the span: first flagged position up to the last one of that run.
    beg = end = -1
    for position, flag in enumerate(isTargets):
        if flag:
            if beg == -1:
                beg = position
        elif beg != -1:
            end = position - 1
            break

    return finText, (beg, end)

## 测试

In [226]:
# Smoke test: run processSentence on a few hand-picked sentences, always with
# the fixed target span (3, 5), and print the raw text followed by the result.
texts = [
    "@Mrf Don't hesitate  to ask questions. Be positive,Keep positive. Emm:D:DHMm.... they'll be selected",
    'Yes\u002c I intend to go everyweek\u002c but of course that\u2019s not confirmed.Hope you get well soon.May you be free from physical sufferin.',
    'Oh... I c i c... Haha\u002c xin ask u to take away food 4 her\u002c pau or smethg... Gee\u002c can help me buy packet milo? Thanx sweety...',
    '#Vote4UrDAYUMSelf Comedy Show w/ @Ali_Speaks TOMORROW @ the Houston Improv, 7:30 PM. FREE SHOW! Get... http://t.co/by9COiwf',
    "Gas by my house hit $3.39!!!! OK.I'm going to Chapel Hill on Sat. :)",
    "`` Thud '' means it is just a boring wordcount post .",
]
for text in texts:
    print(text)
    print(processSentence(text, (3, 5)))

@Mrf Don't hesitate  to ask questions. Be positive,Keep positive. Emm:D:DHMm.... they'll be selected
(('<user>', 'do', 'not', 'hesitate', 'to', 'ask', 'questions', 'be', 'positive', 'keep', 'positive', 'emm', '<smile>', '<smile>', 'hmm', '<ellipsis>', 'they', 'will', 'be', 'selected'), (3, 5))

Yes, I intend to go everyweek, but of course that’s not confirmed.Hope you get well soon.May you be free from physical sufferin.
(('yes', 'i', 'intend', 'to', 'go', 'everyweek', 'but', 'of', 'course', 'that', 'is', 'not', 'confirmed', 'hope', 'you', 'get', 'well', 'soon', 'may', 'you', 'be', 'free', 'from', 'physical', 'sufferin'), (3, 5))

Oh... I c i c... Haha, xin ask u to take away food 4 her, pau or smethg... Gee, can help me buy packet milo? Thanx sweety...
(('oh', '<ellipsis>', 'i', 'see', 'i', 'see', '<ellipsis>', 'hah', 'xin', 'ask', 'uou', 'to', 'take', 'away', 'food', '<number>', 'her', 'pau', 'or', 'smethg', '<ellipsis>', 'gee', 'can', 'help', 'me', 'buy', 'packet', 'milo', 'thanx', 

词性还原
https://blog.csdn.net/qq_16234613/article/details/79430381

# 定义数据集

In [387]:
class corpusData():
    """Map-style dataset of ``(token-index tensor, target span, polarity)``.

    The rows of ``table`` are read into ``self.corpus``, every sentence is run
    through ``processSentence`` (``self.processedCorpus``) and a vocabulary is
    built from the processed tokens — unless an existing vocabulary is passed
    in, which is what a validation/test set needs in order to share word
    indices with the training set.

    Args:
        table: DataFrame with the columns ``index1``, ``index2``,
            ``polarity`` and ``text``.
        closeThreshold: a word enters the vocabulary only when its frequency
            is strictly greater than this value.
        vocab: optional ``(word2index, index2word)`` pair to reuse instead of
            building a new vocabulary; it must contain ``'<unknown>'``.
    """
    def __init__(self, table, closeThreshold=10, vocab=None):
        self.threshold = closeThreshold
        self.corpus = self.loadCorpus(table)
        self.processedCorpus = self.processe(self.corpus)
        if vocab is None:
            self.word2index, self.index2word = self.getDict(self.processedCorpus)
        else:
            word2index, index2word = vocab
            if '<unknown>' not in word2index:
                raise ValueError("vocab must contain the '<unknown>' token")
            self.word2index, self.index2word = word2index, index2word
    def loadCorpus(self, table):
        """Return a list of ``(text, (index1, index2), polarity id)`` tuples.

        Raises ``KeyError`` for a polarity label missing from the table below.
        """
        corpus = []

        # NOTE(review): 'unknwn' is kept exactly as written because it is
        # matched against the data — confirm it is the spelling used there.
        polarityClass = {
            'positive': 0,
            'neutral': 1,
            'negative': 2,
            'unknwn': 3
        }

        for _, line in table.iterrows():
            target = (line['index1'], line['index2'])
            polarity = polarityClass[line['polarity']]
            text = line['text']
            corpus.append((text, target, polarity))
        return corpus
    def getDict(self, corpus):
        """Build ``(word2index, index2word)`` from processed sentences.

        Words are ordered by decreasing frequency, words whose frequency is
        not above ``self.threshold`` are dropped, and ``'<unknown>'`` is
        appended as the last entry.
        """
        fdist = nltk.probability.FreqDist()
        for words, _, _ in corpus:
            fdist.update(words)
        index2word = [word for word, freq in fdist.most_common() if freq > self.threshold]
        index2word.append('<unknown>')
        word2index = { word: i for i, word in enumerate(index2word) }
        return word2index, index2word
    def processe(self, corpus):
        """Run ``processSentence`` on every ``(text, target, polarity)`` entry."""
        processedCorpus = []
        for text, target, polarity in corpus:
            newText, newTarget = processSentence(text, target)
            processedCorpus.append((newText, newTarget, polarity))
        return processedCorpus
    def sentence2Tensor(self, sentence):
        """Convert a token sequence to a 1-D ``long`` tensor of word indices.

        Tokens missing from the vocabulary map to the ``'<unknown>'`` index.
        """
        unknown = self.word2index['<unknown>']
        indices = [self.word2index.get(word, unknown) for word in sentence]
        # Explicit dtype so that an empty sentence does not become a float tensor.
        return torch.tensor(indices, dtype=torch.long)
    def tensor2Sentence(self, tensor, join=True):
        """Convert indices (tensor or iterable of ints) back to words.

        Returns a space-joined string when ``join`` is true, else a list.
        """
        # int() accepts both plain ints and 0-d tensors produced by iteration.
        words = [self.index2word[int(i)] for i in tensor]
        return ' '.join(words) if join else words
    def getRawText(self, index):
        """Return the unprocessed text of sample ``index``."""
        return self.corpus[index][0]
    def getProcessedText(self, index, join=True):
        """Return the processed tokens of sample ``index`` (joined or as a tuple)."""
        tokens = self.processedCorpus[index][0]
        return ' '.join(tokens) if join else tokens
    def __getitem__(self, index):
        """Return ``(index tensor, target span, polarity)`` for sample ``index``."""
        tokens, target, polarity = self.processedCorpus[index]
        return self.sentence2Tensor(tokens), target, polarity
    def __len__(self):
        """Number of processed samples."""
        return len(self.processedCorpus)

# 迭代样例

In [376]:
def collate_fn_corpus(batch):
    """Collate ``(index tensor, target span, polarity)`` samples into a batch.

    Every index tensor is right-padded with zeros to the length of the
    longest one in the batch and the results are stacked.

    Returns:
        ``(stacked texts, tensor of target spans, tensor of polarities)``.
    """
    with torch.no_grad():
        sequences, spans, labels = zip(*batch)
        longest = max(len(sequence) for sequence in sequences)
        rows = []
        for sequence in sequences:
            missing = longest - len(sequence)
            rows.append(torch.nn.functional.pad(sequence, pad=(0, missing)))
        stacked = torch.stack(rows)
    return stacked, torch.tensor(spans), torch.tensor(labels)

# 训练集、验证集的加载

In [388]:
# Read the tab-separated file; the column names are supplied explicitly.
table = pd.read_csv('target-sentiment-analysis/train.tsv', sep='\t',
            names=['ID1','ID2','index1','index2', 'polarity','text'])

# DataFrame.sample returns a NEW frame, so the shuffled result has to be
# assigned back (the bare call left the row order untouched). A fixed seed
# keeps the train/validation split reproducible across kernel restarts.
SPLIT_SEED = 0
table = table.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
TRAIN_SIZE = int(len(table)*0.9)
tableTrain = table[:TRAIN_SIZE]
tableValdate = table[TRAIN_SIZE:]

trainSet = corpusData(table=tableTrain, closeThreshold=0)
valSet = corpusData(table=tableValdate, closeThreshold=0)
# The validation set must index words with the TRAINING vocabulary; with its
# own vocabulary the same index would denote different words in each set.
# Words unseen during training fall back to '<unknown>' in sentence2Tensor.
valSet.word2index, valSet.index2word = trainSet.word2index, trainSet.index2word

In [392]:
# Batches of 5 samples, reshuffled on every pass and padded by collate_fn_corpus.
trainloader = torch.utils.data.DataLoader(
    trainSet,
    batch_size=5,
    shuffle=True,
    collate_fn=collate_fn_corpus,
)
valloader = torch.utils.data.DataLoader(
    valSet,
    batch_size=5,
    shuffle=True,
    collate_fn=collate_fn_corpus,
)

In [390]:
# Number of samples in the validation set.
len(valSet)

1040

In [393]:
# Sanity check of the loader: collate_fn_corpus pads each batch to its own
# longest sentence, so the second dimension differs from batch to batch.
for texts, targets, polarity in valloader:
    print(texts.shape)

torch.Size([5, 29])
torch.Size([5, 22])
torch.Size([5, 40])
torch.Size([5, 28])
torch.Size([5, 29])
torch.Size([5, 28])
torch.Size([5, 28])
torch.Size([5, 19])
torch.Size([5, 30])
torch.Size([5, 19])
torch.Size([5, 31])
torch.Size([5, 23])
torch.Size([5, 44])
torch.Size([5, 28])
torch.Size([5, 32])
torch.Size([5, 30])
torch.Size([5, 28])
torch.Size([5, 29])
torch.Size([5, 29])
torch.Size([5, 30])
torch.Size([5, 30])
torch.Size([5, 20])
torch.Size([5, 31])
torch.Size([5, 24])
torch.Size([5, 26])
torch.Size([5, 24])
torch.Size([5, 31])
torch.Size([5, 24])
torch.Size([5, 29])
torch.Size([5, 24])
torch.Size([5, 32])
torch.Size([5, 22])
torch.Size([5, 25])
torch.Size([5, 26])
torch.Size([5, 32])
torch.Size([5, 30])
torch.Size([5, 33])
torch.Size([5, 21])
torch.Size([5, 27])
torch.Size([5, 30])
torch.Size([5, 31])
torch.Size([5, 28])
torch.Size([5, 24])
torch.Size([5, 75])
torch.Size([5, 27])
torch.Size([5, 30])
torch.Size([5, 31])
torch.Size([5, 33])
torch.Size([5, 29])
torch.Size([5, 32])


# 训练函数

In [None]:
def train(model, loss_func, optimizer, trainloader, device, weight_decay=0.0):
    """Train ``model`` for one epoch.

    Args:
        model: network to optimise.
        loss_func: callable ``loss_func(outputs, labels)`` returning a scalar
            tensor. The reported loss assumes it is the batch MEAN (the
            default reduction of ``torch.nn.CrossEntropyLoss``).
        optimizer: optimizer bound to ``model``'s parameters.
        trainloader: iterable of batches. The LAST element of every batch is
            the label tensor; all preceding elements are passed to the model
            as positional inputs, so both ``(inputs, labels)`` and
            ``(texts, spans, labels)`` batches are supported.
        device: device every tensor of the batch is moved to.
        weight_decay: coefficient of the decoupled (AdamW-style) weight decay
            applied after each optimizer step; ``0.0`` disables it. This is
            independent of any ``weight_decay`` configured in the optimizer.

    Returns:
        ``(mean loss per sample, accuracy)`` over the epoch.
    """
    model.train()

    correct = 0
    lossSum = 0.0
    total = 0

    for batch in trainloader:
        # Last element = labels, everything before it = model inputs.
        *inputs, targets = [item.to(device) for item in batch]

        # forward
        outputs = model(*inputs)
        loss = loss_func(outputs, targets)

        # backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Decoupled weight decay (AdamW - https://zhuanlan.zhihu.com/p/38945390):
        # p <- p - weight_decay * lr * p
        if weight_decay:
            with torch.no_grad():
                for group in optimizer.param_groups:
                    for param in group['params']:
                        param.mul_(1 - weight_decay * group['lr'])

        batchSize = targets.size(0)
        # loss is a batch mean: weight it by the batch size so that dividing
        # by the sample count below yields a true per-sample average.
        lossSum += loss.item() * batchSize
        # argmax over the class dimension gives the predicted label.
        predicted = outputs.detach().argmax(dim=1)
        correct += (predicted == targets).sum().item()
        total += batchSize

    trainLoss = lossSum / total
    trainAccuracy = correct / total
    return trainLoss, trainAccuracy

In [None]:
def validate(model, lossFunction, validateloader, device):
    """Evaluate ``model`` on ``validateloader`` without updating it.

    Args:
        model: network to evaluate (switched to eval mode).
        lossFunction: callable ``lossFunction(outputs, labels)`` returning a
            scalar tensor. The reported loss assumes it is the batch MEAN
            (the default reduction of ``torch.nn.CrossEntropyLoss``).
        validateloader: iterable of batches. The LAST element of every batch
            is the label tensor; all preceding elements are passed to the
            model as positional inputs.
        device: device every tensor of the batch is moved to.

    Returns:
        ``(mean loss per sample, accuracy)``.
    """
    model.eval()

    correct = 0
    lossSum = 0.0
    total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in validateloader:
            # Last element = labels, everything before it = model inputs.
            *inputs, targets = [item.to(device) for item in batch]

            outputs = model(*inputs)
            loss = lossFunction(outputs, targets)

            batchSize = targets.size(0)
            # loss is a batch mean: weight it by the batch size so that
            # dividing by the sample count yields a per-sample average.
            lossSum += loss.item() * batchSize
            # argmax over the class dimension gives the predicted label.
            predicted = outputs.argmax(dim=1)
            correct += (predicted == targets).sum().item()
            total += batchSize

    valLoss = lossSum / total
    valAccuracy = correct / total
    return valLoss, valAccuracy

In [None]:
def showCurve(list_trainLoss, list_trainAccuracy, list_valLoss, list_valAccuracy):
    """Plot loss and accuracy curves side by side, one point per epoch.

    Each argument is a per-epoch sequence; the x axis is
    ``0 .. len(list_trainLoss) - 1``. The left panel shows the two loss
    curves, the right panel the two accuracy curves.

    NOTE(review): relies on a module-level ``plt`` that is not imported in
    the visible cells — presumably ``matplotlib.pyplot``; confirm.
    """
    epochs = list(range(len(list_trainLoss)))
    fig, axs = plt.subplots(1, 2)

    panels = (
        ('Loss', list_trainLoss, list_valLoss),
        ('Accuracy', list_trainAccuracy, list_valAccuracy),
    )
    for ax, (title, trainCurve, valCurve) in zip(axs, panels):
        ax.plot(epochs, trainCurve, label='train')
        ax.plot(epochs, valCurve, label='validation')
        ax.set_title(title)

    # Shared decoration: the y label repeats the panel title.
    for ax in axs:
        ax.axis()
        ax.set_xlabel('epoch')
        ax.set_ylabel('{}'.format(ax.get_title()))
        ax.legend()

    fig.set_size_inches((8, 4))
    plt.subplots_adjust(wspace=0.3)
    plt.show()

# 训练

In [None]:
# Use the GPU when one is available; train()/validate() move every batch to
# this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TODO: replace the `...` placeholder with the actual network before running.
model = ...
model = model.to(device)

lossFunction = torch.nn.CrossEntropyLoss()
lr = 0.01
num_epoches = 500
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

# Halve the learning rate every 40 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.5)

# One (loss, accuracy) pair per epoch.
results_train = []
results_val = []

log('开始训练')
for epoch in range(num_epoches):
    res_train = train(model, lossFunction, optimizer, trainloader, device)
    res_val = validate(model, lossFunction, valloader, device)

    (trainLoss, trainAccuracy) = res_train
    (valLoss, valAccuracy) = res_val
    results_train.append(res_train)
    results_val.append(res_val)
    log('[{:2d}/{}] Loss (Train: {:.6f}, Validation: {:.6f})     Accuracy (Train: {:.4f}, Validation: {:.4f})'
              .format(epoch+1, num_epoches, trainLoss, valLoss, trainAccuracy, valAccuracy))

    scheduler.step()

# zip(*pairs) turns the list of (loss, accuracy) pairs into (losses, accuracies).
showCurve(*zip(*results_train), *zip(*results_val))