In [1]:
import io
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import torch
import torchvision
import torch.nn as nn
import itertools
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import pkuseg
import re
import matplotlib.pyplot as plt

In [1]:
!pwd

/opt/jupyter/mengxuan


In [2]:
def show_info(label, title):
    if not isinstance(label, np.ndarray):
        label = np.array(label)
    print('==='+title+'===')
    try:
        print('===data info===\nnumber of record: %s\npositive: %s\nnegative: %s\nneutral: %s\n' %(len(label), sum(label=='positive'), sum(label=='negative'), sum(label=='neutral')))
    except: 
        try:
            print('===data info===\nnumber of record: %s\npositive: %s\nnegative: %s\nneutral: %s\n' %(len(label), sum(label==1), sum(label==2), sum(label==0)))
        except: print('data info not available')

In [3]:
def read_data(directory, label_col='Polarity', demo = False, exclude_long = False, max_length = 500):
    data = pd.read_excel(open(directory, 'rb'))
    data = data.dropna(subset=[label_col])
    if exclude_long:
        data = data[data['Content'].map(len) < max_length]
    content = np.array(data['Content'])
    label = np.array(data[label_col])
    assert content.shape == label.shape
    show_info(label, 'original data')
    if demo:
        print(data[['Content', label_col]].head())
    return content, label

In [4]:
def tokenize(content):
    seg = pkuseg.pkuseg()                                  
    tokens = [seg.cut(record) for record in content]
    print('finish tokenizing all data\n')
    return tokens

In [5]:
def tokenize_with_model(content, model='/home/mengxuan/honda_data/ctb8', exclude_non_chinese=True):
    print('using ctb8 tokenizer')
    seg = pkuseg.pkuseg(model_name=model, user_dict=None)
    if exclude_non_chinese:
        tokens = [[t for t in seg.cut(record) if re.findall(u'[\u4e00-\u9fff]+', t)] for record in content]
    else:
        tokens = [seg.cut(record) for record in content]
    print('finish tokenizing all data\n')
    return tokens

In [6]:
def len_control(tokens, label, threshold=300):
    return zip(*[(tokens[i], label[i]) for i in range(len(tokens)) if len(tokens[i])<threshold])

In [7]:
#c1,_ = read_data(dataDir[0])
#c2,_ = read_data(dataDir[1])
#c = np.concatenate((c1, c2))
#c.shape

In [8]:
#t = tokenize_with_model(c)

In [9]:
#t = len_control(t)

In [10]:
#len(t)

In [11]:
#a = np.array([len(t[i]) for i in range(len(t))])

#%matplotlib inline
#x = np.random.normal(size = 1000)
#plt.hist(a, normed=True, bins=60)
#plt.xlim(0, 1000)  
#plt.ylabel('Probability');
#plt.show()

In [12]:
#sum(a>800)

In [13]:
def trained_word2vec(directory):
    with open(directory, encoding='utf8') as infile:
        content = infile.readlines()
    
    # metadata: number of words in the file / the dimension size
    meta = content[0].split(' ')
    print('===pre-trained word vec info===\n','number of words:', meta[0], 'dimension:', meta[1])
    
    word2vec = [vec.split(' ')[:-1] for vec in content[1:]]
    assert len(word2vec[0][1:]) == int(meta[1])
    
    word2vec_dic = {word2vec[i][0]:np.array([float(n) for n in word2vec[i][1:]], dtype = 'float32') for i in range(len(word2vec))}
    print('word vec model is ready\n')
    return word2vec_dic

In [14]:
def max_len(tokens):
    return max([len(sent) for sent in tokens])

In [15]:
def representation(contentTokens, word2vecDict, recordLen, vecDim=300):
    print('preparing for sentence matrix representation\n')
    doc = np.zeros((len(contentTokens), recordLen, vecDim), dtype = 'float32')
    for i in range(len(contentTokens)):
        for j in range(len(contentTokens[i])):
            try:
                doc[i,j,:] += word2vecDict[contentTokens[i][j]]
            except: pass
    print('document representation is ready')
    return doc

In [16]:
def representation_not_sparse(contentTokens, word2vecDict, recordLen, vecDim=300):
    print('preparing for sentence matrix representation\n')
    doc = np.zeros((len(contentTokens), recordLen, vecDim), dtype = 'float32')
    for i in range(len(contentTokens)):
        k = 0
        for j in range(len(contentTokens[i])):
            try:
                doc[i][k] += word2vecDict[contentTokens[i][j]]
            except: pass
            else: k += 1
        #print(doc.shape)
    print('document representation is ready')
    return doc

In [17]:
def label_adapt(label, labelDict):
    print('preparing for labels')
    return np.array([labelDict[k] for k in label])

In [18]:
def train_dev_split(X, y, ratio=[0.9, 0.1], Shuffle=True):
    if sum(ratio) != 1:
        raise ValueError('Invalid train/dev split ratio')
    if Shuffle:
        X, y = shuffle(X, y, random_state=0)
    split = int(len(X) * ratio[0])
    X_train, X_dev = X[:split], X[split:] 
    y_train, y_dev = y[:split], y[split:]

    return X_train, X_dev, y_train, y_dev

In [19]:
def main(dataDir, word2vecDir, labelDict):
    content1, label1 = read_data(dataDir[0])
    #content2, label2 = read_data(dataDir[1])
    tokens1, label1 = len_control(tokenize_with_model(content1), label1)
    #tokens2, label2 = len_control(tokenize_with_model(content2), label2)
    
    weibo_model = trained_word2vec(word2vecDir)
    
    #width = max(max_len(tokens1), max_len(tokens2))
    width = max_len(tokens1)
    doc2vec1 = representation_not_sparse(tokens1, weibo_model, width)
    #doc2vec2 = representation(tokens2, weibo_model, width)
    
    labelDigit1 = label_adapt(label1, labelDict)
    #labelDigit2 = label_adapt(label2, labelDict)
    #assert doc2vec.shape[0] == labelDigit.shape[0]
    
    X_train, X_dev, y_train, y_dev = train_dev_split(doc2vec1, labelDigit1)
    #X_train = np.concatenate((X_train1, doc2vec2))
    #y_train = np.concatenate((y_train1, labelDigit2))
    
    print('data is ready for classifier')
    print('training set shape:', X_train.shape, 'training labels:', y_train.shape)
    show_info(y_train, 'training')
    print('dev set shape:', X_dev.shape, 'dev labels:', y_dev.shape)
    show_info(y_dev, 'dev')
    return X_train, X_dev, y_train, y_dev

In [20]:
# 108 holdout
dataDir = ['/home/mengxuan/honda_data/2019-02-04_CH_Honda_MasterTraining8811.xlsx',
           '/home/mengxuan/honda_data/Chinese_Legacy_Training_data.xlsx',
          '/home/mengxuan/honda_data/CH_Honda_HoldOut_108.xlsx']

# 515 holdout
#dataDir = ['/home/mengxuan/honda_data/2019-02-04_CH_Honda_MasterTraining8811.xlsx',
#           '/home/mengxuan/honda_data/NLPCC Chinese Sentiment.xlsx',
#          '/home/mengxuan/honda_data/2019-01-08_MasterHoldouts515.xlsx']

# 821 holdout
#dataDir = ['/home/mengxuan/honda_data/2019-02-04_CH_Honda_MasterTraining8811.xlsx',
#           '/home/mengxuan/honda_data/NLPCC Chinese Sentiment.xlsx',
#          '/home/mengxuan/honda_data/2019-01-08_MasterHoldout821.xlsx']
word2vecDir = '/home/mengxuan/honda_data/cgns.weibo.word'

# local drive
#dataDir = ['C:\\Users\\Mengxuan Zhao\\Google Drive\\Clients & Partners\\Honda\\POC Data\\Chinese\\TrainingSets\\2019-02-04_CH_Honda_MasterTraining8811.xlsx',
#           'C:\\Users\\Mengxuan Zhao\\Google Drive\\Clients & Partners\\Honda\\POC Data\\Chinese\\HoldOuts\\CH_Honda_HoldOut_108.xlsx']
#word2vecDir = 'sgns.weibo.word'
labelDict = {'positive':1, 'negative':2, 'neutral':0}

In [21]:
if __name__ == "__main__":
    X_train, X_dev,y_train, y_dev = main(dataDir, word2vecDir, labelDict)

===original data===
===data info===
number of record: 8809
positive: 3411
negative: 1137
neutral: 4261

using ctb8 tokenizer
finish tokenizing all data

===pre-trained word vec info===
 number of words: 195202 dimension: 300

word vec model is ready

preparing for sentence matrix representation

document representation is ready
preparing for labels
data is ready for classifier
training set shape: (7928, 234, 300) training labels: (7928,)
===training===
===data info===
number of record: 7928
positive: 3078
negative: 1022
neutral: 3828

dev set shape: (881, 234, 300) dev labels: (881,)
===dev===
===data info===
number of record: 881
positive: 333
negative: 115
neutral: 433



  


In [22]:
#a = np.zeros((1,300))
#b = np.random.rand(500, 300)

#for i in range(b.shape[0]):
#    a = np.concatenate((a, b[i].reshape(1,300)))
#a[1:].shape

In [23]:
X_train_tensor = torch.from_numpy(X_train).float().view(X_train.shape[0], 1, X_train.shape[1], X_train.shape[2])
X_dev_tensor = torch.from_numpy(X_dev).float().view(X_dev.shape[0], 1, X_dev.shape[1], X_dev.shape[2])
#X_test_tensor = torch.from_numpy(X_test).float().view(X_test.shape[0], 1, X_test.shape[1], X_test.shape[2])

y_train_tensor = torch.from_numpy(y_train).long()
y_dev_tensor = torch.from_numpy(y_dev).long()
#y_test_tensor = torch.from_numpy(y_test).long()

In [24]:
def make_batch(data, batch_size):
    #assert len(data)%batch_size == 0
    for i in range(0, len(data), batch_size):
        if len(data) - i > batch_size:
            yield data[i:i+batch_size]
        else: yield data[i:]

In [25]:
batch_size = 50

X_train_b = tuple(make_batch(X_train_tensor, batch_size))
X_dev_b = tuple(make_batch(X_dev_tensor, batch_size))
#X_test_b = tuple(make_batch(X_test_tensor, batch_size))
y_train_b = tuple(make_batch(y_train_tensor, batch_size))
y_dev_b = tuple(make_batch(y_dev_tensor, batch_size))
#y_test_b = tuple(make_batch(y_test_tensor, batch_size))

In [26]:
def kmax_pooling(x, dim, k):
    index = x.topk(k, dim = dim)[1].sort(dim = dim)[0]
    return x.gather(dim, index)

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.convfilter2 = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=(2,300), stride=1), 
            nn.Tanh())
            #nn.MaxPool2d(kernel_size=(256,1)),
            #nn.Dropout(p=0.5))
        self.convfilter3 = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=(3,300), stride=1), 
            nn.Tanh())
            #nn.MaxPool2d(kernel_size=(255,1)),
            #nn.Dropout(p=0.5))
        self.convfilter4 = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=(4,300), stride=1), 
            nn.Tanh())
            #nn.MaxPool2d(kernel_size=(254,1)),
            #nn.Dropout(p=0.5))
        self.convfilter5 = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=(5,300), stride=1), 
            nn.Tanh())
            #nn.MaxPool2d(kernel_size=(253,1)),
            #nn.Dropout(p=0.5))
        self.linearlayer1 = nn.Sequential(
            nn.Linear(400, 100),
            nn.Tanh())
        self.linearlayer2 = nn.Sequential(
            nn.Linear(100, 3))
        self.drop_out_conv = nn.Dropout(p=0.4)
        self.drop_out = nn.Dropout(p=0.5)
        self.sigmoid = nn.Sigmoid()
        #self.softmax = nn.Softmax(dim=1)
        self.softmax = nn.LogSoftmax(dim=1)
        self.tanh = nn.Tanh()

    def forward(self, x):
        map2 = self.convfilter2(x)
        #print(map2.shape)
        map2 = kmax_pooling(map2, 2, 10)
        #print(map2.shape)
        map2 = map2.reshape(map2.size(0), -1)
        #print(map2.shape)
        map2 = self.drop_out_conv(map2)
        
        map3 = self.convfilter3(x)
        #print(map3.shape)
        map3 = kmax_pooling(map3, 2, 10)
        #print(map3.shape)
        map3 = map3.reshape(map3.size(0), -1)
        #print(map3.shape)
        map3 = self.drop_out_conv(map3)
        
        map4 = self.convfilter4(x)
        map4 = kmax_pooling(map4, 2, 10)
        map4 = map4.reshape(map4.size(0), -1)
        map4 = self.drop_out_conv(map4)
        
        map5 = self.convfilter5(x)
        map5 = kmax_pooling(map5, 2, 10)
        map5 = map5.reshape(map5.size(0), -1)
        map5 = self.drop_out_conv(map5)
        
        out = torch.cat((map2, map3),1)
        out = torch.cat((out, map4),1)
        
        out = torch.cat((out, map5),1)
        
        out = out.reshape(out.size(0), -1)
        out = self.drop_out(out)
        
        out = self.linearlayer1(out)
        out = self.drop_out(out)
        out = self.linearlayer2(out)
        out = self.softmax(out)
        return out

In [27]:
gpu = torch.cuda.is_available()
if gpu:
    print('working on GPU')

model = CNN()
if gpu:
    model.cuda()

num_epochs = 100
learning_rate = 0.004

#criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [28]:
for i in range(num_epochs):
    correct = 0
    loss_sum = 0
    for j, (x, y) in enumerate(zip(X_train_b, y_train_b)):
        if gpu:
            x, y = x.cuda(), y.cuda()
        outputs = model(x)
        loss = criterion(outputs, y)
        
        # Backprop and perform Adam optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Track the accuracy
        if gpu:
        else:
            _, pred = outputs.data.max(1)
        
        #print('output:', outputs.data, 'prediction:', pred)
        correct += (pred == y).sum().item()
        loss_sum += loss.item()
        
        if (j+1) == (len(X_train_b)) and (i+1)%10 == 0:
            print('Cost:',loss_sum,'Accuracy for epoch %s:' %(i+1), correct / X_train.shape[0])
            print('Epoch %s done' %(i+1))
            precision = precision_score(y, pred, average='macro')
            recall = recall_score(y, pred, average='macro')
            f1 = f1_score(y, pred, average='macro')
            print('training set precision:', precision)
            print('training set recall:', recall)
            print('training set F1 score:', f1)
            print('\n')

            model.eval()
            with torch.no_grad():
                correct_dev = 0
                pred_dev = []
                for x, y in zip(X_dev_b, y_dev_b):
                    if gpu:
                        x, y = x.cuda(), y.cuda()
                    outputs = model(x)
                    if gpu:
                        pred = outputs.data.max(1)[1].cuda()
                    else:
                        _, pred = outputs.data.max(1)
                    correct_dev += (pred == y).sum().item()
                    pred_dev.append(pred)
                pred_dev = list(itertools.chain.from_iterable(pred_dev))
                pred_dev = np.array([i.cpu().item() for i in pred_dev])

                precision_dev = precision_score(y_dev, pred_dev, average='macro')
                recall_dev = recall_score(y_dev, pred_dev, average='macro')
                f1_dev = f1_score(y_dev, pred_dev, average='macro')
                print('Dev Accuracy:' , correct_dev / X_dev.shape[0])
                print('development set precision:', precision_dev)
                print('development set recall:', recall_dev)
                print('development set F1 score:', f1_dev)
                print('========')

Cost: 155.52579540014267 Accuracy for epoch 10: 0.4989909182643794
Epoch 10 done
training set precision: 0.3055555555555555
training set recall: 0.3283208020050125
training set F1 score: 0.3086680761099366




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Dev Accuracy: 0.52894438138479
development set precision: 0.3943292993925905
development set recall: 0.37283935205413266
development set F1 score: 0.31668055148438984
Cost: 150.52234250307083 Accuracy for epoch 20: 0.5504540867810293
Epoch 20 done
training set precision: 0.38095238095238093
training set recall: 0.406015037593985
training set F1 score: 0.39285714285714285


Dev Accuracy: 0.5402951191827469
development set precision: 0.3634638658367912
development set recall: 0.3974135798616168
development set F1 score: 0.3681213353206625
Cost: 146.7323083281517 Accuracy for epoch 30: 0.5678607467204844
Epoch 30 done
training set precision: 0.4090909090909091
training set recall: 0.42355889724310775
training set F1 score: 0.4140087554721701


Dev Accuracy: 0.5584562996594779
development set precision: 0.37977119965936296
development set recall: 0.4115801713954139
development set F1 score: 0.382555429067057
Cost: 142.6153183579445 Accuracy for epoch 40: 0.5848890010090817
Epoch 40 done
tr

window size: [2, 3, 4, 5]
stride: 1
channel: 10
dropout: [conv: 0.4, linear: 0.5]
learning rate: 0.004
epoch: 100
k for k-max pooling: 10
linear layer: 2

