In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import nltk
import pandas as pd
from ast import literal_eval
from collections import Counter
from scipy.optimize import fmin_l_bfgs_b
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import multiprocessing
import random
from tqdm import tqdm
import optimizeTopicVectors as ot
import time



In [3]:
# hoseong's class
from STMD import *
from ASUM import *
from ASUM_Embedding import *
from preprocess import *

In [4]:
# load data
# work_path is the directory prefix that later cells also use when writing and
# re-reading "apple_pos_neg.csv".
# NOTE(review): hard-coded absolute local path; the commented-out line below
# looks like the same directory on another machine — confirm before sharing.
# work_path = "/media/hs-ubuntu/data/dataset/MasterThesis/STMD_data/"
work_path = "E:/dataset/MasterThesis/STMD_data/"

# Full review table; the cells below filter it by the `brand` and `overall` columns.
data = pd.read_csv(work_path + "preprocess_complete_Electronics.csv")

In [5]:
def extract_sample(data, brand_name, count, ratio = 1, random_state = 42):
    """Draw a seeded sample of one brand's reviews.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``brand``, ``overall`` and ``preprocessed``;
        every ``preprocessed`` value is parsed with ``ast.literal_eval``.
    brand_name
        Value compared against the ``brand`` column.
    count : int
        Number of rows drawn per polarity when ``ratio == 1``; otherwise
        ``count * 2`` rows are drawn in total.
    ratio
        Only tested for equality with 1. Any other value switches off the
        positive/negative balancing.
    random_state
        Forwarded to every ``DataFrame.sample`` call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with ``preprocessed`` replaced by its parsed value.
        In the balanced case the ``overall >= 4`` rows come first, followed by
        the ``overall <= 2`` rows; original index labels are kept.

    Notes
    -----
    ``sample`` is called with pandas' defaults, so asking for more rows than
    are available makes pandas raise.
    """
    brand_reviews = data[data['brand'] == brand_name]

    if ratio != 1:
        # Unbalanced draw: every review of the brand is eligible, whatever its rating.
        sampled = brand_reviews.sample(count * 2, random_state=random_state)
    else:
        positives = brand_reviews[brand_reviews.overall >= 4]
        negatives = brand_reviews[brand_reviews.overall <= 2]
        sampled = pd.concat(
            [
                positives.sample(count, random_state=random_state),
                negatives.sample(count, random_state=random_state),
            ],
            axis=0,
        )

    # The column holds Python literals serialised as text; turn them back into objects.
    sampled['preprocessed'] = sampled.preprocessed.apply(literal_eval)
    return sampled

In [6]:
# From the Apple reviews, keep only the positive (overall >= 4) and
# negative (overall <= 2) documents; positives come first in the result.
apple = data.loc[data.brand == "Apple"]
pos_reviews = apple.loc[apple.overall >= 4]
neg_reviews = apple.loc[apple.overall <= 2]
df = pd.concat([pos_reviews, neg_reviews], axis=0).reset_index(drop=True)
# `preprocessed` is stored as text; parse each value back into a Python object.
df['preprocessed'] = df.preprocessed.apply(literal_eval)

In [7]:
# Persist the filtered Apple reviews, then print the shape of the
# positive subset followed by the shape of the negative subset.
df.to_csv(work_path + "apple_pos_neg.csv", index=False)
for rating_mask in (df.overall >= 4, df.overall <= 2):
    print(df[rating_mask].shape)

(3307, 12)
(1324, 12)


In [8]:
# Reload the saved Apple subset. After the CSV round trip the `preprocessed`
# column holds text again, so each value is parsed back with literal_eval.
df = pd.read_csv(work_path + "apple_pos_neg.csv")
df['preprocessed'] = df['preprocessed'].apply(literal_eval)

In [9]:
# prepare
# Hand the review frame to the project helper `prepare` (brought in by one of
# the star imports; not defined in this notebook). The three returned objects
# are passed on to bigram_and_sentence in the next cell, and review_label is
# later given to ASUM_Embedding.
# NOTE(review): max_sentence=50 presumably caps the sentences kept per review —
# confirm against the helper's definition.
sentence_list, review_label, numSentence = prepare(df, max_sentence = 50)

In [10]:
# Run the project helper `bigram_and_sentence` on the prepared sentences.
# Of its four results, `documents` is what Doc2Vec is trained on below and
# `sentence_list_again` is what ASUM_Embedding._initialize_ receives.
# NOTE(review): threshold / min_count look like bigram-phrase detection settings
# and max_vocab like a vocabulary cap — confirm against the helper's definition.
documents, sentence_list_again, bigram, documents_label\
= bigram_and_sentence(sentence_list, review_label, numSentence, max_vocab=5000, threshold = 10, min_count = 10)



In [114]:
# Build the ASUM_Embedding model whose vocabulary, sentences and labels the
# following cells export to text files.
# NOTE(review): wordVectors, sentimentVector and topic are first assigned in
# cells that appear further down this notebook (the execution counter here is
# 114, ahead of cells 11-19). On a fresh top-to-bottom run this cell would
# likely raise NameError unless a star import happens to provide those names —
# confirm, and move this cell below the cells that define them.
asum_embedding = ASUM_Embedding(review_label, wordVectors, sentimentVector, numTopics=topic, alpha=0.01, beta=0.001, gamma=1, binary=0.8, numSentiments=2)
asum_embedding._initialize_(sentence_list_again)

In [115]:
# Save WordList: one word per line, ordered by the value each word maps to in
# asum_embedding.word2idx (presumably its integer index — confirm).
save_path = "E:/dataset/MasterThesis/STMD_data/apple_pos_neg_data_txt/"
from collections import OrderedDict
dic = OrderedDict(sorted(asum_embedding.word2idx.items(), key=lambda pair: pair[1]))
# Iterating an OrderedDict yields its keys in insertion (here: sorted) order.
word_list = list(dic)
with open(save_path + "WordList.txt", 'w') as f:
    for word in word_list:
        f.write(word + "\n")

In [117]:
# Save BagOfSentences. For every document: a line with its number of
# sentences, then one line per sentence where each word entry is followed by a
# single space.
with open(save_path + "BagOfSentences.txt", 'w') as f:
    for doc in range(asum_embedding.numDocs):
        doc_sentences = asum_embedding.doc_sent_word_dict[doc]
        f.write(str(len(doc_sentences)) + "\n")
        for sent in doc_sentences:
            f.write("".join(str(word) + ' ' for word in sent) + "\n")

In [118]:
# Save the per-document positive/negative label, one label per line, in the
# order they are stored on asum_embedding.review_label.
with open(save_path + "ReviewLabel.txt", 'w') as f:
    for label in asum_embedding.review_label:
        f.write(str(label) + "\n")

In [11]:
# word / sentiment embedding
# Trains a Doc2Vec model on `documents` for every (window, size) combination.
# Both lists currently hold a single value, so exactly one model is trained;
# with more values, `model` would be rebound on each pass and only the last
# configuration would survive the loop.

window = [5]
size = [300]
passes = 10
for w in window:
    for s in size:
        # alpha and min_alpha start out equal; the decay is applied by hand
        # after every pass in the loop below instead of inside train().
        # NOTE(review): `size=` and a bare `model.train(documents)` match older
        # gensim releases — confirm against the installed version.
        model = Doc2Vec(dm=1, dm_mean=1, min_count=0, sample=1e-5,
                        window=w, size=s, 
                        workers=8, batch_words = 10000,
                        alpha=0.025, min_alpha=0.025)
        model.build_vocab(documents)

        for epoch in tqdm(range(passes)):
            # Shuffles the shared `documents` list in place, so its order is
            # different after this cell. No seed is set in the visible cells,
            # so the order (and the trained vectors) can differ between runs.
            random.shuffle(documents)
            model.train(documents)
            model.alpha -= 0.002  # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay
#             if (epoch + 1) % 5 ==0:
#                 model.save(model_path + 'model_' + str(w) + '_' + str(s) + '_' + str(epoch+1))

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:00<00:00,  6.10s/it]


In [12]:
# Show the 30 vocabulary entries most similar to the document vector tagged
# "positive", as (word, similarity) pairs.
model.most_similar([model.docvecs["positive"]], topn=30)

[('mini', 0.9176653027534485),
 ('love', 0.8881936073303223),
 ('carri', 0.8786433935165405),
 ('portabl', 0.8713654279708862),
 ('perform', 0.8606942892074585),
 ('perfect_size', 0.859289824962616),
 ('lighter', 0.8579401969909668),
 ('hand', 0.8556053638458252),
 ('improv', 0.8472954034805298),
 ('game', 0.8441768884658813),
 ('great', 0.843303382396698),
 ('speed', 0.8419391512870789),
 ('mac', 0.8309455513954163),
 ('learn_curv', 0.8285374641418457),
 ('overal', 0.8266509175300598),
 ('batteri_life', 0.8249247074127197),
 ('size', 0.8249083757400513),
 ('perfect', 0.8246691226959229),
 ('make', 0.8231087327003479),
 ('inch', 0.8175567984580994),
 ('storag', 0.8105429410934448),
 ('far', 0.8104313611984253),
 ('netflix', 0.8091338276863098),
 ('book', 0.8081627488136292),
 ('much_better', 0.8078503012657166),
 ('full_size', 0.8052796721458435),
 ('beauti', 0.804874837398529),
 ('pictur', 0.8045446276664734),
 ('crisp', 0.8042056560516357),
 ('stereo', 0.8037182688713074)]

In [13]:
# Show the 30 vocabulary entries most similar to the document vector tagged
# "negative", as (word, similarity) pairs.
model.most_similar([model.docvecs["negative"]], topn=30)

[('return', 0.9267991781234741),
 ('amazon', 0.9159002900123596),
 ('defect', 0.9103671908378601),
 ('contact', 0.8952794075012207),
 ('stop_work', 0.8915157318115234),
 ('problem', 0.8774918913841248),
 ('item', 0.8749350309371948),
 ('credit_card', 0.874808132648468),
 ('seller', 0.8740531206130981),
 ('warranti', 0.8658891916275024),
 ('refurbish', 0.8651744723320007),
 ('told', 0.8651559352874756),
 ('sent', 0.8596659898757935),
 ('repli', 0.8532638549804688),
 ('seal', 0.8506012558937073),
 ('receiv', 0.845376193523407),
 ('water', 0.8452926874160767),
 ('said', 0.8445823192596436),
 ('state', 0.8386708498001099),
 ('origin', 0.8376731872558594),
 ('descript', 0.83652263879776),
 ('fail', 0.8317441940307617),
 ('not_regist', 0.8207634091377258),
 ('not_buy', 0.8176717162132263),
 ('repair', 0.8176360130310059),
 ('broke', 0.8165550827980042),
 ('ask', 0.8148546814918518),
 ('vendor', 0.8146303296089172),
 ('appl_store', 0.8134809136390686),
 ('charg', 0.8110567927360535)]

### test 1

In [14]:
# Persist the trained Doc2Vec model under the name "test1".
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# shared base directory.
model.save("E:/dataset/MasterThesis/gensim_models/test1")

In [15]:
# Copy the trained word vectors into a dense matrix: row i holds the vector of
# model.index2word[i].
wordVectors = np.zeros((len(model.index2word), model.vector_size))
for row, token in enumerate(model.index2word):
    wordVectors[row] = model[token]

# Two-row matrix of the sentiment document vectors: row 0 is the 'positive'
# tag, row 1 the 'negative' tag.
sentimentVector = np.zeros((2, model.vector_size))
for row, tag in enumerate(('positive', 'negative')):
    sentimentVector[row] = model.docvecs[tag]

In [19]:
# Fit one ASUM_Embedding model per value of `binary`, with the topic count held
# at 30. `asum_embedding` is rebound on every pass, so only the model of the
# last value stays in memory afterwards.
topic = 30
binaries = [0, 0.2, 0.4, 0.6, 0.8, 1]
# NOTE(review): the save path does not depend on `binary`, so every run is
# handed the same location — check whether run() overwrites earlier results.
asum_embedding_path = "E:/dataset/MasterThesis/Models/ASUM_embedding_topic_" + str(topic)
for binary in binaries:
    print("ratio is ", binary)
    asum_embedding = ASUM_Embedding(
        review_label,
        wordVectors,
        sentimentVector,
        numTopics=topic,
        alpha=0.1,
        beta=0.001,
        gamma=1,
        binary=binary,
        numSentiments=2,
    )
    asum_embedding._initialize_(sentence_list_again)
    asum_embedding.run(
        save_path=asum_embedding_path,
        print_iter=5,
        save_iter=50,
        maxIters=50,
    )

ratio is  0
iteration 1, time 28
iteration 2, time 28
iteration 3, time 28
iteration 4, time 29
Starting iteration 5 of 50:
0.75145756856
iteration 5, time 28
iteration 6, time 29
iteration 7, time 28
iteration 8, time 28
iteration 9, time 28
Starting iteration 10 of 50:
0.731591448931
iteration 10, time 28
iteration 11, time 28
iteration 12, time 28
iteration 13, time 28
iteration 14, time 28
Starting iteration 15 of 50:
0.735478298424
iteration 15, time 28
iteration 16, time 28
iteration 17, time 28
iteration 18, time 28
iteration 19, time 28
Starting iteration 20 of 50:
0.725977110775
iteration 20, time 28
iteration 21, time 28
iteration 22, time 28
iteration 23, time 28
iteration 24, time 28
Starting iteration 25 of 50:
0.724897430361
iteration 25, time 28
iteration 26, time 28
iteration 27, time 28
iteration 28, time 28
iteration 29, time 28
Starting iteration 30 of 50:
0.722738069531
iteration 30, time 28
iteration 31, time 28
iteration 32, time 28
iteration 33, time 28
iteration