In [1]:
# CORPUS 만들기
from gensim import corpora, models, similarities
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

import tensorflow as tf
from pprint import pprint



In [2]:
# Load the raw table; later cells read its 'macaddr' and 'y' columns and
# treat every remaining column as a token.
df = pd.read_csv(filepath_or_buffer='total3.csv')

In [11]:
# Keep only the token columns (everything except the 'macaddr' id and the 'y'
# label) as a plain ndarray. `DataFrame.as_matrix()` was removed in pandas 1.0;
# `.values` returns the same array on both old and new pandas versions.
doc = df.drop(['macaddr', 'y'], axis=1).values

In [12]:
# gensim treats tokens as strings, so cast every cell value to str.
doc = doc.astype(str)

---
## 여기는 LSA model

In [5]:
# Reshape so that each date becomes its own document (24 tokens per row).
doc = np.reshape(doc, (-1, 24))

In [6]:
# Display the dimensions of `doc` after the reshape to 24-token rows.
doc.shape

(153790, 24)

In [55]:
# Generate the corpus: build the token <-> id dictionary from the rows of
# `doc`, then encode every row as a bag-of-words vector.
dictionary = corpora.Dictionary(doc)
cps = list(map(dictionary.doc2bow, doc))  # bag-of-words corpus
print(dictionary)

Dictionary(1054 unique tokens: ['442', '194', '244', '430', '839']...)


In [56]:
# tf-idf transformation: fit the weighting on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=cps)
print(tfidf)
# Sparse vectors re-weighted by tf-idf.
cps_tfidf = tfidf[cps]

TfidfModel(num_docs=153790, num_nnz=720092)


In [57]:
# LSA/LSI transformation: project every tf-idf document onto `dim` topics.
dim = 24
lsi = models.LsiModel(cps_tfidf, id2word=dictionary, num_topics=dim)
cps_lsi = lsi[cps_tfidf]  # corpus re-expressed in the num_topics-dimensional space

# The transformed corpus is only evaluated while it is iterated (to save
# memory), so materialise it into a dense matrix up front.
# Each document is a sparse list of (topic_id, weight) pairs and gensim may
# omit topics whose weight is ~0. Placing weights by topic id (missing -> 0.0)
# keeps every row exactly `dim` wide and column-aligned; taking only the
# weights in order would yield ragged / shifted rows and break later reshapes.
mtx_lsi = []
for el in cps_lsi:
    weights = dict(el)
    mtx_lsi.append([weights.get(topic_id, 0.0) for topic_id in range(dim)])

In [58]:
# Display how many document rows were collected in `mtx_lsi`.
len(mtx_lsi)

153790

In [62]:
# Regroup the per-document LSI vectors into rows of 26*24 values each.
mtx_lsi2 = np.asarray(mtx_lsi).reshape(-1, 26 * 24)

In [69]:
# Name the LSI feature columns lsi0..lsiN and put them between the 'macaddr'
# and 'y' columns taken from the original frame.
lsi_columns = ['lsi%d' % idx for idx in range(mtx_lsi2.shape[1])]
df_lsi = pd.concat(
    [df['macaddr'], pd.DataFrame(mtx_lsi2, columns=lsi_columns), df['y']],
    axis=1,
)

In [70]:
# Persist the LSI feature table without the index column.
df_lsi.to_csv(path_or_buf='lsi.csv', index=False)

---
## 여기는 W2V model

In [14]:
# Display the dimensions of `doc` before building the Doc2Vec inputs.
doc.shape

(5915, 624)

In [15]:
# Build one tagged document per 26*24-token row, tagged 'r<row index>'.
# The reshape is explicit because an earlier cell may have left `doc` in its
# (-1, 24) layout; the recorded run fed (5915, 624) rows into Doc2Vec, and the
# reshape is a no-op when `doc` already has that shape.
# `TaggedDocument` replaces `LabeledSentence`, a deprecated alias that was
# removed from later gensim releases.
sentences = []
for idx, el in enumerate(doc.reshape([-1, 26*24])):
    sentences.append(models.doc2vec.TaggedDocument(words=el, tags=['r%d'%idx]))

In [16]:
%%time
# doc2vec training session
# 너무 sparse하면 또 학습이 잘 안됨 50~100정도가 적당...
size=24
model = models.Doc2Vec(size=size, window=7, min_count=1, workers=8,alpha=0.025, min_alpha=0.025)

model.build_vocab(sentences) # 오로지 한번만 스트럭팅 가능
# 너무 오래 트레이닝하면 오히려 벡터가 망가짐...??
for epoch in range(15):
    if epoch%1 ==0:
        print('epoch:',epoch, 'train completed')
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.002 # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

epoch: 0 train completed
epoch: 1 train completed
epoch: 2 train completed
epoch: 3 train completed
epoch: 4 train completed
epoch: 5 train completed
epoch: 6 train completed
epoch: 7 train completed
epoch: 8 train completed
epoch: 9 train completed
epoch: 10 train completed
epoch: 11 train completed
epoch: 12 train completed
epoch: 13 train completed
epoch: 14 train completed
Wall time: 2min 35s


In [99]:
# Back to one 24-token row per document, then show the resulting dimensions.
doc = np.reshape(doc, (-1, 24))
doc.shape

(153790, 24)

In [100]:
# Average the word vectors over each whole document (one row of `doc`),
# producing one mean embedding per row.
mtx = [
    np.array([model.wv[token] for token in tokens]).mean(axis=0)
    for tokens in doc
]

In [101]:
# Stack the per-document mean vectors into a single 2-D array.
mtx = np.asarray(mtx)

In [102]:
# Display the dimensions of the stacked mean-embedding matrix.
mtx.shape

(153790, 24)

In [103]:
# Regroup the per-document embeddings into rows of 26*24 values each.
mtx = np.reshape(mtx, (-1, 26 * 24))

In [104]:
# Name the embedding columns w2v0..w2vN and put them between the 'macaddr'
# and 'y' columns taken from the original frame.
w2v_columns = ['w2v%d' % idx for idx in range(mtx.shape[1])]
df_w2v = pd.concat(
    [df['macaddr'], pd.DataFrame(mtx, columns=w2v_columns), df['y']],
    axis=1,
)

In [105]:
# Persist the averaged-embedding feature table without the index column.
df_w2v.to_csv(path_or_buf='w2v.csv', index=False)

---
## 여기는 3d CRNN용

In [22]:
# Flatten the token matrix into one long 1-D copy (row-major order) so every
# token can be looked up individually.
doc2 = doc.ravel().copy()

In [24]:
# Look up the trained word vector of every token, keeping the flattened order.
doc3 = []
for token in doc2:
    doc3.append(model.wv[token])

In [28]:
# Stack the per-token vectors into a (num_tokens, vector_size) array.
doc3 = np.asarray(doc3)

In [29]:
# Display the dimensions of the per-token embedding array.
doc3.shape

(3690960, 24)

In [30]:
# Display the length of the flattened token array (should match doc3's first axis).
doc2.shape

(3690960,)

In [42]:
# Fold the flat token embeddings into a 4-D tensor of shape (-1, 26, 24, 24).
# NOTE(review): the axes are presumably (row of the source table, day,
# slot within day, embedding dimension) - confirm against the input data.
doc4 = np.reshape(doc3, (-1, 26, 24, 24))

In [43]:
# Display the dimensions of the 4-D embedding tensor.
doc4.shape

(5915, 26, 24, 24)

In [45]:
# Move the size-26 axis (axis 1) to the last position: (n, 26, 24, 24) -> (n, 24, 24, 26).
doc4 = doc4.transpose(0, 2, 3, 1)

In [47]:
# Serialise the transposed tensor in row-major order, then regroup it into one
# flat feature row of 24*24*26 values per sample.
doc4 = doc4.flatten().reshape(-1, 24 * 24 * 26)

In [48]:
# Name the flattened tensor columns w2v0..w2vN, put them between the 'macaddr'
# and 'y' columns taken from the original frame, and write the result to disk.
crnn_columns = ['w2v%d' % idx for idx in range(doc4.shape[1])]
df_w2v2 = pd.concat(
    [df['macaddr'], pd.DataFrame(doc4, columns=crnn_columns), df['y']],
    axis=1,
)
df_w2v2.to_csv('w2v_crnn.csv', index=False)