In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from matplotlib import font_manager, rc
import platform
import re

if platform.system() == 'Windows':
# 윈도우인 경우
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
    
else:    
# Mac 인 경우
    rc('font', family='AppleGothic')
    
plt.style.use('ggplot')
import datetime
import os
np.random.seed(0)
os.environ["PYTHONHASHSEED"] = "123"


In [2]:
## 제공 데이터
train = pd.read_csv('train_after_processing_weatheropt3.csv',header = 0,index_col=0, encoding='cp949') 
test = pd.read_csv('test_after_processing_weatheropt3.csv',header = 0,index_col=0, encoding='cp949')


In [3]:
train = train.query('상품군 != "무형" & 취급액 != 0 & 취급액 != 50000' ).reset_index()
test = test.query('상품군 != "무형"').reset_index()
target = train.취급액
orgin_tr_index = train.loc[:,'index']
orgin_te_index = test.loc[:,'index'] 

train.drop(columns = ['index','취급액','브랜드','시청률'], axis = 1, inplace = True)
test.drop(columns = ['index','취급액','브랜드'], axis = 1, inplace = True)

In [4]:
def get_merge(df, n, columns) :
    merge_list = []
    for i in range(len(df)) :
        value = list(df.loc[i,columns].values)
        #for j in range(len(value)) :
        #    if '/' in value[j] :
        #        v = re.split('[/]', value[j])
        #        for k in range(len(v)) :
        #            value.insert(j+k+1, v[k])
        #        value.pop(j)
        #    else :
        #        continue
        for h in range(n) :
            value = list(np.append(value, np.random.choice(value, len(value), replace = False)))
        merge_list.append(value)
    
    
    return merge_list

In [5]:
columns_name = ['상품군','대','중','소']

w2v_train = get_merge(train,3, columns_name)
w2v_test = get_merge(test, 3, columns_name)

In [6]:
np.append(columns_name, np.random.choice(columns_name, 3))

array(['상품군', '대', '중', '소', '중', '중', '중'], dtype='<U3')

In [7]:
num_features = 200 # 단어 벡터 차원 수
min_word_count = 3 # 최소 단어 수
context = 5 # 학습 윈도우(인접한 단어 리스트) 크기

# 초기화 및 모델 학습
from gensim.models import word2vec

# 모델 학습
w2v = word2vec.Word2Vec(w2v_train, 
                        size=num_features, 
                        min_count=min_word_count,
                        window=context,
                        seed=5, workers=1)
# 필요없는 메모리 unload
w2v.init_sims(replace=True)

### Make features
# 구매상품에 해당하는 벡터의 평균/최소/최대 벡터를 feature로 만드는 전처리기(pipeline에서 사용 가능)
class EmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = num_features
    def fit(self, X):
        return self
    def transform(self, X):
        return np.array([
            np.hstack([
                np.max([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.min([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0),
                np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0)                
            ]) 
            for words in X
        ])  
    

In [8]:
w2v_tr_feature = EmbeddingVectorizer(w2v).transform(w2v_train)
w2v_tr_feature = pd.DataFrame(w2v_tr_feature, columns = [f'w2v_{i}' for i in range(w2v_tr_feature.shape[1])])
w2v_te_feature = EmbeddingVectorizer(w2v).transform(w2v_test)
w2v_te_feature = pd.DataFrame(w2v_te_feature, columns = [f'w2v_{i}' for i in range(w2v_te_feature.shape[1])])

del w2v_train, w2v_test

In [9]:
w2v_te_feature

Unnamed: 0,w2v_0,w2v_1,w2v_2,w2v_3,w2v_4,w2v_5,w2v_6,w2v_7,w2v_8,w2v_9,...,w2v_590,w2v_591,w2v_592,w2v_593,w2v_594,w2v_595,w2v_596,w2v_597,w2v_598,w2v_599
0,0.030390,0.092093,0.009445,0.017163,0.023503,-0.007776,0.040043,0.058097,0.065875,0.061698,...,-0.021553,-0.034423,-0.034922,0.006635,-0.003994,-0.045407,0.012381,0.053402,0.011623,0.021604
1,0.030390,0.092093,0.009445,0.017163,0.023503,-0.007776,0.040043,0.058097,0.065875,0.061698,...,-0.021553,-0.034423,-0.034922,0.006635,-0.003994,-0.045407,0.012381,0.053402,0.011623,0.021604
2,0.030390,0.092093,0.009445,0.017163,0.023503,-0.007776,0.040043,0.058097,0.065875,0.061698,...,-0.021553,-0.034423,-0.034922,0.006635,-0.003994,-0.045407,0.012381,0.053402,0.011623,0.021604
3,0.043983,0.072587,0.058418,-0.015854,0.023503,0.008291,0.040043,0.055475,0.023827,0.112152,...,0.003090,0.041353,-0.045826,0.052403,0.054178,-0.063173,0.066553,-0.056718,0.033996,-0.025931
4,0.043983,0.072587,0.058418,-0.015854,0.023503,0.008291,0.040043,0.055475,0.023827,0.112152,...,0.003090,0.041353,-0.045826,0.052403,0.054178,-0.063173,0.066553,-0.056718,0.033996,-0.025931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2711,0.057290,0.072587,0.080437,-0.022265,0.023503,0.008291,0.040043,0.055475,0.072032,0.112152,...,-0.007665,0.044492,-0.056699,0.063390,0.039945,-0.065772,0.065603,-0.054321,0.039923,-0.019448
2712,0.057290,0.072587,0.080437,-0.022265,0.023503,0.008291,0.040043,0.055475,0.072032,0.112152,...,-0.007665,0.044492,-0.056699,0.063390,0.039945,-0.065772,0.065603,-0.054321,0.039923,-0.019448
2713,0.057290,0.072587,0.080437,-0.022265,0.023503,0.008291,0.040043,0.055475,0.072032,0.112152,...,-0.007665,0.044492,-0.056699,0.063390,0.039945,-0.065772,0.065603,-0.054321,0.039923,-0.019448
2714,0.030390,0.006819,-0.001556,0.116686,-0.036830,0.018148,0.169778,0.070041,0.072283,-0.007472,...,-0.023808,-0.008050,0.018509,0.038589,0.029039,0.036532,-0.010742,0.016937,0.074555,0.023859


In [10]:
w2v_tr_feature.to_csv('w2v_tr_feature.csv', index = False, encoding = 'cp949')
w2v_te_feature.to_csv('w2v_te_feature.csv', index = False, encoding= 'cp949')