In [1]:
import sys
 
class Printer():
    """
    Overwrite the current stdout line with the text form of a value.

    Instantiating the class is the whole API: ``Printer(x)`` writes a
    carriage return, an erase-line escape sequence and ``str(x)`` to
    stdout and flushes immediately, so repeated calls keep updating a
    single line (e.g. a loop progress counter).
    """
    def __init__(self,data):
        """
        Write ``data`` to stdout on the current line and flush.

        data: any object; it is converted to text with ``str()``.
        """
        # "\r" moves the cursor back to column 0 and "\x1b[K" is the ANSI
        # "erase to end of line" sequence, so the previous text is replaced.
        # str(data) is used instead of data.__str__() because the direct
        # dunder call fails with TypeError when `data` is a class object.
        sys.stdout.write("\r\x1b[K" + str(data))
        # No newline is written, so flush explicitly to make the text appear.
        sys.stdout.flush()

In [1]:
from raw_preprocess_util import *
import time


# Dataset locations. These are absolute, machine-specific paths; adjust them
# before running on another machine.
data_path = "/media/hs-ubuntu/data/dataset/Amazon/"
work_path = "/media/hs-ubuntu/data/dataset/MasterThesis/"
save_path = "/media/hs-ubuntu/data/dataset/MasterThesis/STMD_data/"


# Amazon review categories to process; each name is spliced into the
# "reviews_<category>.json.gz" / "meta_<category>.json.gz" file names below.
category_list = ["Electronics","Beauty","Clothing_Shoes_and_Jewelry"]

# Per-category pipeline: load reviews + metadata -> join -> split into
# sentences -> sample by top brands -> POS-tag -> preprocess.
# Each stage is timed and the result is checkpointed to CSV under save_path.
# NOTE(review): load_data, load_meta, join_meta_data, extract_sentence,
# top_brands, sample_data, sentence_postag, preprocessing and `pd` are all
# expected to come from `from raw_preprocess_util import *` - confirm.
for category in category_list:
    raw_data_path = data_path + "reviews_" + category + ".json.gz"
    meta_data_path = data_path + "meta_" + category + ".json.gz"

    print("Start loading %s raw data" % category)
    start = time.time()
    raw_data = load_data(raw_data_path, year=2013)
    end = time.time()
    print("Completed loading %s raw data, time : %2.f" % (category, end - start))


    # Fixed: the original passed `category` as a second print() argument
    # instead of %-formatting it, which printed a literal "%s".
    print("Start loading %s meta data" % category)
    start = time.time()
    meta_data = load_meta(meta_data_path)
    end = time.time()
    print("Completed loading %s meta data, time : %2.f" % (category, end - start))


    print("Start join raw and meta of %s" % category)
    start = time.time()
    join_data = join_meta_data(raw_data, meta_data)
    end = time.time()
    print("Completed join raw and meta data of %s, time : %2.f" % (category, end - start))

    # Write the joined data to CSV and read it straight back. Per the
    # original author's note this both checkpoints the intermediate result
    # and is meant to clean up NaN values via the CSV round trip.
    join_data.to_csv(save_path + "join_" + category + ".csv", index=False)
    join_data = pd.read_csv(save_path + "join_" + category + ".csv")

    print("Start extract sentences of %s" % category)
    start = time.time()
    join_data = extract_sentence(join_data)
    end = time.time()
    print("Completed extract sentences of %s, time : %2.f" % (category, end - start))

    print("Start extract samples from data")
    start = time.time()
    top_brands_df, top_brands_list = top_brands(join_data)
    data_final = sample_data(top_brands_df, top_brands_list)
    end = time.time()
    print("Completed extract samples of %s, time : %2.f" % (category, end - start))

    print("check shape ----------")
    print("%s shape after sampling : %s, %s"  % (category, data_final.shape[0], data_final.shape[1]))

    # Intermediate save, so the sampled data survives a failure in the
    # (slow) POS-tagging stage.
    print("Save before pos tagging start")
    data_final.to_csv(save_path + "raw_" + category + ".csv", index=False)
    print("Save before pos tagging completed")


    # POS tagging (morphological analysis) of every review sentence.
    print("Start pos tag of sentences in %s" % category)
    start = time.time()
    data_final['reviewSentence_tagged'] = data_final.reviewSentence.apply(sentence_postag)
    data_final.to_csv(save_path + "pos_tagged_" + category + ".csv", index=False)
    end = time.time()
    print("Completed pos-tagging and save of %s, time : %2.f" % (category, end - start))

    # Additional preprocessing applied on top of the POS-tagged sentences.
    print("Start preprocessing in %s" % category)
    start = time.time()
    data_final['preprocessed'] = data_final.reviewSentence_tagged.apply(preprocessing)
    data_final.to_csv(save_path + "preprocess_complete_" + category + ".csv", index=False)
    end = time.time()
    print("Completed preprocess and save of %s, time : %2.f" % (category, end - start))


Start loading Electronics raw data
Completed loading Electronics raw data, time : 540
Start loading %s meta data Electronics
Completed loading Electronics meta data, time : 49
Start join raw and meta of Electronics
Completed join raw and meta data of Electronics, time :  6
Start extract sentences of Electronics
Completed extract sentences of Electronics, time : 496
Start extract samples from data
Completed extract samples of Electronics, time :  3
check shape ----------
Electronics shape after sampling : 250000, 10
Save before pos tagging start
Save before pos tagging completed
Start pos tag of sentences in Electronics
Completed pos-tagging and save of Electronics, time : 1571
Start preprocessing in Electronics
Completed preprocess and save of Electronics, time : 282
Start loading Beauty raw data
Completed loading Beauty raw data, time : 144
Start loading %s meta data Beauty
Completed loading Beauty meta data, time : 33
Start join raw and meta of Beauty
Completed join raw and meta data

In [None]:
# brand2vec용 전처리
from raw_preprocess_util import *
import time
import pickle
from ast import literal_eval
import nltk

# Dataset locations. These are absolute, machine-specific paths; adjust them
# before running on another machine.
data_path = "/media/hs-ubuntu/data/dataset/Amazon/"
work_path = "/media/hs-ubuntu/data/dataset/MasterThesis/"
save_path = "/media/hs-ubuntu/data/dataset/MasterThesis/STMD_data/"

category_list = ["Electronics","Beauty","Clothing_Shoes_and_Jewelry"]

# Per category: read the POS-tagged checkpoint, run brand2vec_preprocess on
# every tagged sentence, pickle the overall and adjective frequency
# distributions, and write a trimmed CSV.
# NOTE(review): `pd` and brand2vec_preprocess are expected to come from
# `from raw_preprocess_util import *`; Printer must already be defined in the
# kernel (it lives in the first cell) - confirm on a fresh run.
for category in category_list:
    print("Start preprocessing in %s" % category)
    start = time.time()
    data_final = pd.read_csv(save_path + "pos_tagged_" + category + ".csv")
    # The CSV stores each tagged sentence as text; literal_eval parses it
    # back into a Python object.
    data_final['preprocessed'] = data_final.reviewSentence_tagged.apply(literal_eval)
    # NOTE(review): the loop below is fed the raw (unparsed) strings from
    # reviewSentence_tagged, not the parsed 'preprocessed' column - confirm
    # which form brand2vec_preprocess expects.
    reviewSentence_tagged = data_final.reviewSentence_tagged.values.tolist()

    # NOTE(review): new_sent_list is collected but never written back to
    # data_final or saved - presumably unfinished; confirm intent.
    new_sent_list = []
    adjectives = []
    total_tokens = []
    for i, tagged_sentence in enumerate(reviewSentence_tagged):
        # Refresh the single-line progress display every 10 sentences.
        if (i + 1) % 10 == 0:
            Printer(i)
        sent, adjective, tokens = brand2vec_preprocess(tagged_sentence)
        new_sent_list.append(sent)
        adjectives.extend(adjective)
        total_tokens.extend(tokens)

    # Save the overall word frequency distribution.
    # Fixed: total_text is now built before nltk.Text() uses it; the
    # original assigned it only after the pickle dump, raising NameError.
    total_text = [word for sent in total_tokens for word in sent]
    corpus = nltk.Text(total_text)
    freq = nltk.FreqDist(corpus)
    with open(work_path + 'brand2vec_dist/' + category + '_total_freq_dist.pkl', 'wb') as f:
        pickle.dump(freq, f)
    # Vocabulary size (number of distinct words).
    print(len(set(total_text)))

    # Save the adjective frequency distribution.
    corpus = nltk.Text(adjectives)
    freq = nltk.FreqDist(corpus)
    with open(work_path + 'brand2vec_dist/' + category + '_total_adjective_dist.pkl', 'wb') as f:
        pickle.dump(freq, f)

    # Fixed: drop() defaults to axis=0 (row labels), so the original call
    # raised KeyError; axis=1 drops the named columns as intended.
    data_final = data_final.drop(['reviewSentence','reviewSentence_tagged','preprocessed'], axis=1)
    data_final.to_csv(save_path + "brand2vec_final_" + category + ".csv", index=False)
    end = time.time()
    print("Completed preprocess and save of %s, time : %2.f" % (category, end - start))

Start preprocessing in Electronics


In [54]:
# Load IPython's autoreload extension, then invoke it with no mode argument.
# NOTE(review): per the IPython docs a bare `%autoreload` reloads modules once,
# immediately; `%autoreload 2` would reload before every cell - confirm which
# is wanted. The recorded output shows the extension was already loaded here.
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
