In [81]:
import jieba
import jieba.analyse
from os import listdir
from os.path import isfile, join
import re
import pickle
from textblob import TextBlob as tb
import math
import numpy as np
import operator
import gensim
from hanziconv import HanziConv
from keras.datasets import imdb
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.merge import Concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D 
from keras.layers.pooling import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.optimizers import Adam

def setExternelDicts():
    """Point jieba at the project's external dictionary, stop-word and IDF files.

    Each resource is registered through the matching jieba setter, in the
    order: main dictionary, keyword-extraction stop words, IDF table.
    """
    resources = (
        (jieba.set_dictionary, './external/dict.txt.big'),
        (jieba.analyse.set_stop_words, './external/stop_words.txt'),
        (jieba.analyse.set_idf_path, "./external/idf.txt.big"),
    )
    for configure, resource_path in resources:
        configure(resource_path)
    
def cutSentence(sentences):
    """Tokenise a space-separated string, one token list per chunk.

    Args:
        sentences: a single string whose chunks are separated by ' '.

    Returns:
        A list with one entry per chunk; each entry is the list of tokens
        produced by ``jieba.cut(chunk, cut_all=False)``.
    """
    segmented = []
    for chunk in sentences.split(' '):
        tokens = list(jieba.cut(chunk, cut_all=False))
        segmented.append(tokens)
    return segmented
    
def rmStopWords(sentences):
    """Strip stop words from tokenised sentences and drop sentences left empty.

    Args:
        sentences: list of sentences, each a list of term strings. Both the
            outer list and the inner lists are modified in place.

    Returns:
        The same ``sentences`` list object, with every term contained in the
        module-level ``stopwords`` collection removed and every empty
        sentence dropped.
    """
    # Rebuild the lists through slice assignment instead of calling .remove()
    # while iterating: removing during iteration makes the loop skip the
    # element right after each removal, so consecutive stop words and
    # consecutive empty sentences used to survive.
    for sentence in sentences:
        sentence[:] = [term for term in sentence if term not in stopwords]
    sentences[:] = [sentence for sentence in sentences if sentence != []]
    return sentences

def processString(string):
    """Keep only the CJK runs of ``string``, tokenise them and drop stop words.

    Every maximal run of characters in U+4E00..U+9FFF is extracted, the runs
    are joined with single spaces, passed to ``cutSentence`` and the result
    is filtered by ``rmStopWords``.

    Returns:
        Whatever ``rmStopWords`` returns for the tokenised runs.
    """
    chinese_runs = re.findall(r'[\u4e00-\u9fff]+', string)
    segmented = cutSentence(' '.join(chinese_runs))
    return rmStopWords(segmented)

def processTitle(title):
    """Tokenise a post title; delegates unchanged to ``processString``."""
    return processString(title)

def processContent(content):
    """Tokenise a post body; delegates unchanged to ``processString``."""
    return processString(content)

def processComment(comments):
    """Tokenise every comment, overwriting the entries of ``comments`` in place.

    Args:
        comments: mutable sequence of raw comment strings.

    Returns:
        The same ``comments`` object, each entry replaced by
        ``processString(entry)``.
    """
    for idx in range(len(comments)):
        comments[idx] = processString(comments[idx])
    return comments

def processTag(tag):
    """Extract the CJK runs from a comma-separated tag string.

    Args:
        tag: string of tags separated by ','.

    Returns:
        A flat list of every maximal U+4E00..U+9FFF run found in the tags,
        in order of appearance.
    """
    tag_list = []
    for piece in tag.split(','):
        tag_list.extend(re.findall(r'[\u4e00-\u9fff]+', piece))
    return tag_list

def processUrl(url):
    """Return ``url`` with every newline character removed."""
    return url.replace('\n', '')

def processPush(push_list):
    """Tokenise each push (reply) string.

    Returns:
        A new list holding ``processString(push)`` for every entry of
        ``push_list``, in order; ``push_list`` itself is left untouched.
    """
    processed = []
    for push in push_list:
        processed.append(processString(push))
    return processed

def processBlogs():
    """Load every pickled blog post and tokenise its text fields.

    For each file in the blog pickle directory the 'title', 'comment' and
    'content' fields are tokenised and the 'url' field is stripped of
    newlines. When a post has a non-empty 'tag' field, the content is cut
    just before the last sentence that matches the first tag.

    Returns:
        list of the processed post dicts, in ``listdir`` order.
    """
    path='/tmp2/ten/new_post_pickles/'
    blogs = []
    for filename in listdir(path):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # trusted files. The `with` block closes the handle, which the
        # previous bare open() never did.
        with open(path+filename, "rb") as pickle_file:
            doc = pickle.load(pickle_file)
        doc['title'] = processTitle(doc['title'])
        doc['comment'] = processComment(doc['comment'])
        doc['content'] = processContent(doc['content'])
        if 'tag' in doc:
            doc['tag'] = processTag(doc['tag'])
            if doc['tag'] != []:
                firstTag = ' '.join(jieba.cut(doc['tag'][0], cut_all=False))

                # Each content sentence is a list of tokens while firstTag is
                # a space-joined string, so the sentence is joined the same
                # way before comparing. Comparing the raw list with the
                # string was always False and the cut never happened.
                # NOTE(review): content sentences have stop words removed and
                # the tag does not, so tags containing stop words may still
                # not match — confirm whether that matters.
                cutIndex = len(doc['content'])
                for i in range(len(doc['content'])-1, -1, -1):
                    if ' '.join(doc['content'][i]) == firstTag:
                        cutIndex = i
                        break
                doc['content'] = doc['content'][:cutIndex]

        doc['url'] = processUrl(doc['url'])
        blogs.append(doc)
    return blogs

def _loadPtt(filepath):
    """Load one pickled PTT post and tokenise its text fields.

    The relative 'href' is turned into an absolute ptt.cc URL, 'title',
    'content' and 'push_contents' are tokenised, and the content is cut at
    the first sentence that contains the token '轉錄'.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(filepath, "rb") as pickle_file:
        doc = pickle.load(pickle_file)
    doc['href'] = 'https://www.ptt.cc' + doc['href']
    doc['title'] = processTitle(doc['title'])
    doc['content'] = processContent(doc['content'])
    # Forward scan with an early exit; gives the same index as the previous
    # full reverse scan without a break (the smallest matching index).
    cutIndex = len(doc['content'])
    for i, sentence in enumerate(doc['content']):
        if '轉錄' in sentence:
            cutIndex = i
            break
    doc['content'] = doc['content'][:cutIndex]
    doc['push_contents'] = processPush(doc['push_contents'])
    return doc

def processPtts():
    """Load and tokenise all pickled PTT posts, sponsored ones first.

    File names are expected to look like '<index>.p'; the numeric part
    selects the slot in the result. Sponsored posts occupy the slots below
    ``len(listdir(sponsored_dir))`` and non-sponsored posts follow at that
    offset.

    Returns:
        list with one slot per directory entry. Slots not filled by any file
        stay ``None``.
    """
    path_S='/tmp2/GorsachiusMelanolophus/ptt_posts_new/sponsored/'
    path_notS='/tmp2/GorsachiusMelanolophus/ptt_posts_new/no_sponsored/'

    filenames_S = listdir(path_S)
    filenames_notS = listdir(path_notS)
    ptts = [None]*(len(filenames_S)+len(filenames_notS))
    for filename in filenames_S:
        # NOTE(review): a stray 'test.py' is skipped but still counted in
        # len(filenames_S), so it leaves one None slot and shifts the offset
        # of the non-sponsored posts. The layout is kept as-is because other
        # cells slice `ptts` by len(listdir(path_S)).
        if filename == 'test.py':
            continue
        ptts[int(filename[:-2])] = _loadPtt(path_S+filename)
    offset = len(filenames_S)
    for filename in filenames_notS:
        ptts[offset+int(filename[:-2])] = _loadPtt(path_notS+filename)
    return ptts

def terms2Vec(terms):
    """Average the word embeddings of ``terms`` into one sentence vector.

    Each term is converted to Simplified Chinese before the lookup in the
    module-level ``word2id``; a term without an id contributes
    ``embeddings[0]`` instead (presumably the unknown-word vector — confirm).

    Args:
        terms: sequence of term strings.

    Returns:
        1-D numpy array of length ``len(embeddings[0])``. An empty ``terms``
        yields the all-zero vector.
    """
    vec = np.zeros(len(embeddings[0]))
    # Without this guard the division below is 0/0 and the result is a NaN
    # vector, which then ends up in the feature tensors.
    if len(terms) == 0:
        return vec
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term)) #Problem: Some terms are not pretrained, like '食记','咖哩','捷运'
        if ID is None:
            vec += embeddings[0]
        else:
            vec += embeddings[ID]
    vec /= len(terms)
    return vec

def getTrainingData(ptts):
    """Build the feature tensor and one-hot labels for a list of PTT posts.

    Row ``j`` of post ``i`` is
    ``embeddings[startS] + terms2Vec(sentence_j) + embeddings[endS]``;
    rows beyond the post's last sentence stay zero.

    Args:
        ptts: list of post dicts with a 'content' list of tokenised
            sentences and an 'isSponsoredPost' flag.

    Returns:
        (X, y): X has shape (len(ptts), max_sentences_num, len(embeddings[0]));
        y is an array with one row per post, [1, 0] for sponsored and
        [0, 1] otherwise.
    """
    X = np.zeros((len(ptts), max_sentences_num, len(embeddings[0])))
    y = []
    for i, post in enumerate(ptts):
        # Only the first max_sentences_num sentences fit into X; slicing
        # avoids the IndexError longer posts used to raise.
        for j, terms in enumerate(post['content'][:max_sentences_num]):
            X[i][j] = embeddings[startS]+terms2Vec(terms)+embeddings[endS]
        # `== True` is kept on purpose so labels stay exactly as before for
        # truthy non-boolean flag values.
        if post['isSponsoredPost'] == True:
            y.append([1,0])
        else:
            y.append([0,1])
    y = np.asarray(y)
    return X, y
            
def getTestingData(blogs):
    """Build the feature tensor for a list of blog posts.

    Row ``j`` of post ``i`` is
    ``embeddings[startS] + terms2Vec(sentence_j) + embeddings[endS]``;
    rows beyond the post's last sentence stay zero.

    Args:
        blogs: list of post dicts with a 'content' list of tokenised
            sentences.

    Returns:
        (X, y): X has shape (len(blogs), max_sentences_num, len(embeddings[0]));
        y is always an empty list, since no labels are read from the posts.
    """
    X = np.zeros((len(blogs), max_sentences_num, len(embeddings[0])))
    y = []
    for i, post in enumerate(blogs):
        # Only the first max_sentences_num sentences fit into X.
        for j, terms in enumerate(post['content'][:max_sentences_num]):
            X[i][j] = embeddings[startS]+terms2Vec(terms)+embeddings[endS]
    # The former `X.append(np.asarray(padding(vecIDs)))` line was removed:
    # ndarrays have no append(), and `padding` / `vecIDs` are not defined in
    # the visible notebook, so the function could never return.
    return X, y


# Number of sentence rows reserved per post in the feature tensors.
max_sentences_num = 1000
setExternelDicts()
# Read the stop-word list inside a `with` block so the handle is closed.
with open('./external/stopwords-zh.txt') as stopwords_file:
    stopwords = [line.rstrip('\n') for line in stopwords_file]
blogs = processBlogs()

ptts = processPtts()

# --------- Load word embedding --------- #
# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
with open('/tmp2/eee/polyglot-zh.pkl', 'rb') as embedding_file:
    words, embeddings = pickle.load(embedding_file, encoding='latin1')
print ('%d Zh word embeddings are loaded.' % len(words))
word2id = { w:i for (i,w) in enumerate(words) }
# Ids of the special tokens in the embedding vocabulary.
startS = word2id['<S>']
endS = word2id['</S>']
pad = word2id['<PAD>']
# NOTE(review): maxL is not used anywhere in the visible notebook — confirm
# whether it is still needed.
maxL = 4776

Building prefix dict from /nfs/undergrad/03/b03902024/2017IRFPJ/external/dict.txt.big ...
Loading model from cache /tmp/jieba.u01d2a77556a057401286ff132b8bfed4.cache
Loading model cost 2.448 seconds.
Prefix dict has been built succesfully.


100004 Zh word embeddings are loaded.


In [97]:
# Count the directory entries per class. sN counts every entry of the
# sponsored directory, including a non-pickle file such as 'test.py' that
# processPtts skips; it is also the offset at which processPtts places the
# non-sponsored posts.
path_S='/tmp2/GorsachiusMelanolophus/ptt_posts_new/sponsored/'
path_notS='/tmp2/GorsachiusMelanolophus/ptt_posts_new/no_sponsored/'
filenames_S, filenames_notS = listdir(path_S), listdir(path_notS)
sN, notsN = len(filenames_S), len(filenames_notS)

In [71]:
# Sanity check: how many rows of the old split fall before / after index sN.
# NOTE(review): X_train and X_valid are only assigned in a commented-out cell
# of the visible notebook, so this cell raises NameError on a fresh kernel.
print(len(X_train[:sN]))
print(len(X_train[sN:]), len(X_valid))

5603
5319
4682


In [None]:
# Disabled: earlier 70/30 split taken over the whole ptts list, kept for reference.
# NOTE(review): this is the only place in the visible notebook where
# X_train / y_train / X_valid / y_valid are assigned, yet other cells still print them.
# X_train, y_train = getTrainingData(ptts[:int(0.7*len(ptts))])
# X_valid, y_valid = getTrainingData(ptts[int(0.7*len(ptts)):])
# S_X, S_y = X_train[:sN], y_train[:sN]
# notS_X, notS_y = np.concatenate((X_train[sN:], X_valid)), np.concatenate((y_train[sN:], y_valid))


In [82]:
# Build features and labels per class: the first sN slots of ptts are the
# sponsored posts, everything after them the non-sponsored ones.
# NOTE(review): each X is a dense float64 array of shape
# (n_posts, max_sentences_num, embedding_dim), i.e. several GB per call.
S_X, S_y = getTrainingData(ptts[:sN])
notS_X, notS_y = getTrainingData(ptts[sN:])

In [86]:
# Compare the shapes of the old split with the per-class arrays.
# NOTE(review): X_train / y_train / X_valid / y_valid are only assigned in a
# commented-out cell of the visible notebook; the first two prints fail on a
# fresh kernel.
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(S_X.shape, S_y.shape)
print(notS_X.shape, notS_y.shape)

(10922, 1000, 64) (10922, 2)
(4681, 1000, 64) (4681, 2)
(5603, 1000, 64) (5603, 2)
(10000, 1000, 64) (10000, 2)


In [87]:
def _split70(arr):
    """Return (head, tail) of ``arr``, cut at ``int(0.7*len(arr))``."""
    cut = int(0.7*len(arr))
    return arr[:cut], arr[cut:]

# Split each class 70/30 on its own, then stack sponsored before
# non-sponsored so both sets keep the class ratio. No shuffling is done.
S_X_head, S_X_tail = _split70(S_X)
S_y_head, S_y_tail = _split70(S_y)
notS_X_head, notS_X_tail = _split70(notS_X)
notS_y_head, notS_y_tail = _split70(notS_y)

X_train_new = np.concatenate((S_X_head, notS_X_head))
y_train_new = np.concatenate((S_y_head, notS_y_head))
X_valid_new = np.concatenate((S_X_tail, notS_X_tail))
y_valid_new = np.concatenate((S_y_tail, notS_y_tail))

In [88]:
# Shapes of the per-class 70/30 split; features and labels must agree on the first axis.
print(X_train_new.shape, y_train_new.shape)
print(X_valid_new.shape, y_valid_new.shape)

(10922, 1000, 64) (10922, 2)
(4681, 1000, 64) (4681, 2)


In [None]:
# Persist the tokenised blog and PTT posts; the `with` block flushes and
# closes the file, which the previous bare open() left to the garbage collector.
with open("/tmp2/GorsachiusMelanolophus/afterProcessing/newBlogs_newPTTs.p", "wb") as out_file:
    pickle.dump([blogs, ptts], out_file)

In [92]:
# Store the training features as two pickles, split at the midpoint
# (presumably to keep each file below pickle's size limits — confirm).
# `with` guarantees both files are flushed and closed.
train_half = len(X_train_new) // 2
with open("/tmp2/GorsachiusMelanolophus/afterProcessing/newBlogs_newPTTs_senAvg_train1.p", "wb") as out_file:
    pickle.dump(X_train_new[:train_half], out_file)
with open("/tmp2/GorsachiusMelanolophus/afterProcessing/newBlogs_newPTTs_senAvg_train2.p", "wb") as out_file:
    pickle.dump(X_train_new[train_half:], out_file)

In [93]:
# Store everything except the training features (saved separately):
# training labels, validation features/labels and the embedding matrix.
# `with` guarantees the file is flushed and closed.
with open("/tmp2/GorsachiusMelanolophus/afterProcessing/newBlogs_newPTTs_senAvg_noXtrain.p", "wb") as out_file:
    pickle.dump([y_train_new, X_valid_new, y_valid_new, embeddings], out_file)

In [54]:
# Lengths of the processed PTT list and the processed blog list.
len(ptts), len(blogs)

(15604, 8671)

In [None]:
#[blogs, ptts, X_train, y_train, X_valid, y_valid, embeddings] = pickle.load(open("/tmp2/GorsachiusMelanolophus/afterProcessing/blogs_ptts_sen_avg_newblogs.p", "rb"))

In [64]:
# Scratch check: concatenating two (2, 2, 3) arrays stacks them along the
# first axis, giving shape (4, 2, 3).
a = np.asarray([1,2,3,4,5,6] * 2).reshape(2, 2, 3)
b = np.asarray([1,2,3,4,5,6] * 2).reshape(2, 2, 3)
print('a', a)
print('b', b)
stacked_shape = np.concatenate((a, b)).shape
print(stacked_shape)

a [[[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]]
b [[[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]]
(4, 2, 3)


In [None]:
# Share of 0 entries in isSponsered_list, followed by, for each keyword, the
# share of posts where the label equals the keyword indicator.
# NOTE(review): isSponsered_list and hasWord_list are not defined in the
# visible notebook.
post_count = len(ptts)
print('baseline:', isSponsered_list.count(0)/post_count)
for word in ['試吃', '廠商', '合作', '邀約', '邀稿', '哇']:
    agreements = sum(
        1
        for label, has_word in zip(isSponsered_list, hasWord_list(word))
        if label == has_word
    )
    print(word+':', agreements / post_count)

In [60]:
# Toy gensim Word2Vec run on a two-sentence corpus; min_count=1 so that words
# occurring only once are not discarded.
sentences = [['first', 'sentence'], ['second', 'sentence']]
model = gensim.models.Word2Vec(sentences, min_count=1)

In [23]:
# Show the vector the toy model holds for the token 'first'.
model.wv['first']

array([ -4.60046995e-03,  -7.77644527e-05,  -3.48091591e-03,
        -1.45446544e-03,   2.42910930e-03,   6.14786404e-05,
         2.99958792e-03,  -1.65786792e-03,  -4.49421583e-03,
         2.69324542e-03,  -6.30077775e-05,   3.51313362e-03,
        -3.15759680e-03,   1.17169262e-03,  -2.98711169e-03,
        -3.37286503e-03,   4.55837278e-03,  -1.77195738e-03,
        -1.81711488e-03,   3.88350803e-03,  -1.64524664e-03,
         4.25767526e-03,  -1.23760244e-03,  -1.12451136e-03,
         4.56460984e-03,   2.12160405e-03,   1.87750976e-03,
         1.54259242e-03,   2.26992904e-03,  -4.71394602e-03,
         4.97746328e-03,   4.26378427e-03,   4.73562768e-03,
        -2.01302324e-03,   6.12006290e-04,   3.53679038e-03,
        -3.13258497e-03,   1.69070170e-03,  -1.35397946e-03,
        -3.52162635e-03,  -2.52298499e-03,  -5.68555726e-04,
         4.50787926e-03,   3.01287253e-03,   2.08228198e-03,
        -1.83358591e-03,   4.72083036e-03,   2.74113007e-03,
         3.50522995e-03,

In [94]:
# Presumably the share of sponsored posts: 5603 sponsored out of
# 5603 + 10000 = 15603 in total (numbers match the printed shapes) — confirm.
5603/15603

0.3590976094340832