In [52]:
%load_ext autoreload
%autoreload
%matplotlib inline

import pandas as pd
import numpy as np
import _pickle as pickle
from IPython.core.debugger import set_trace
from tqdm import tqdm_notebook
import texcrapy
#from konlpy.corpus import word
from ckonlpy.tag import Twitter, Postprocessor
import json
from soynlp.word import WordExtractor
from soynlp.tokenizer import MaxScoreTokenizer, LTokenizer
from soynlp.noun import LRNounExtractor_v2
from soynlp.pos.tagset import tagset
from soynlp.postagger import Dictionary as Dict
from soynlp.postagger import LRTemplateMatcher
from soynlp.postagger import LREvaluator
from soynlp.postagger import SimpleTagger
from soynlp.postagger import UnknowLRPostprocessor

import nltk
from nltk import Text
from nltk.corpus import stopwords as STOPWORDS
from nltk.corpus import words as WORDS
from nltk.tag import untag
import math
import os
import re

# from threading import Thread
#from multiprocessing import Process
# import dask
# from dask import compute, delayed
# import dask.multiprocessing
# import dask.bag as db
# import jpype
from sklearn import preprocessing

import gensim
from gensim.models import Word2Vec, Doc2Vec, FastText
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full
from gensim.models.doc2vec import TaggedDocument

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to C:\Users\sekan.CA-
[nltk_data]     AM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to C:\Users\sekan.CA-
[nltk_data]     AM\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# Scraping

In [175]:
df = pd.read_excel('keywords and logos.xlsx', sheet_name='20190215')[['shortname','kw_supporter','kw_supported','keywords']]; df
_or = lambda kw: ' OR '.join(['#' + k.strip() for k in kw.split(',')])

qry_base = {row.shortname:_or(row.keywords) for row in df.itertuples()}
supporters = df.shortname[df.kw_supporter==True]
qry_sup = ' OR '.join([qry_base[sup] for sup in supporters]); qry_sup
qry = {row.shortname: '(' + qry_base[row.shortname] + ') AND (' + qry_sup + ')' if row.kw_supported==True else qry_base[row.shortname] for row in df.itertuples()}

In [None]:
%%time
what = ['id', 'timestamp', 'text']
texcrapy.scrap(qry, what=what, lang='ko', end='2019-01-31', download_to='scrapped/twitter')

# Making Corpus

In [54]:
def preproc(text, remove_url=True, remove_mention=True, remove_hashtag=False):
    LINEBREAK = r'\n' # str.replace에서는 r'\n'으로 검색이 안된다
    RT = '((?: rt)|(?:^rt))[^ @]?'
    EMOJI = r'[\U00010000-\U0010ffff]'
    DOTS = '…'
    LONG_BLANK = r'[ ]+'
    SPECIALS = r'([^ a-zA-Z0-9_\u3131-\u3163\uac00-\ud7a3]+)|([ㄱ-ㅣ]+)'
    
    # \u3131-\u3163\uac00-\ud7a3 는 한글을 의미함
    # URL = r'(?P<url>(https?://)?(www[.])?[^ \u3131-\u3163\uac00-\ud7a3]+[.][a-z]{2,6}\b([^ \u3131-\u3163\uac00-\ud7a3]*))'
    URL1 = r'(?:https?:\/\/)?(?:www[.])?[^ :\u3131-\u3163\uac00-\ud7a3]+[.][a-z]{2,6}\b(?:[^ \u3131-\u3163\uac00-\ud7a3]*)'
    URL2 = r'pic.twitter.com/[a-zA-Z0-9_]+'
    URL = '|'.join((URL1, URL2))
    
    HASHTAG = r'#(?P<inner_hashtag>[^ #@]+)'
    MENTION = r'@(?P<inner_mention>[^ #@]+)' 
    
    text = text.lower()
    
    if remove_url:
        text = re.sub(URL, ' ', text)

    if remove_mention:
        text = re.sub(MENTION, ' ', text)        
    else:
        text = re.sub(MENTION, ' \g<inner_mention>', text)
        
    if remove_hashtag:
        text = re.sub(HASHTAG, ' ', text)
    else:
        text = re.sub(HASHTAG, ' \g<inner_hashtag>', text)
        
    text = re.sub('|'.join((LINEBREAK, RT, EMOJI, DOTS, SPECIALS)), ' ', text)
    return re.sub(LONG_BLANK, ' ', text).strip()
    

class JsonCorpus:
    def __init__(self, *fnames, textkey='text'):
        self.fnames = fnames
        self.textkey = textkey
        self.corpus = self._corpus()
    
    def _corpus(self):
        corpus = {}
        nfiles = len(self.fnames)
        
        for i, fname in enumerate(self.fnames):
            with open(fname, encoding='UTF-8-sig') as f:                
                for item, docs in json.load(f).items():
                    corpus[item] = [preproc(doc[self.textkey]) for doc in docs]
                
                pct = '%.2f' % (100 * (i+1) / nfiles)
                print('\r {pct}% completed'.format(pct=pct), end='')
        
        print('\n')
        return corpus
    
                
    def __iter__(self):
        for sents in self.corpus.values():
            yield from sents

    def __len__(self):
        return sum([len(sents) for sents in self.corpus.values()])
    
    def tokenize(self, tagger):
        return DocTokens(tagger, **self.corpus)


    
class DocTokens:
    def __init__(self, tagger, **corpora):
        self.tagger = tagger
        self.tokensdict = self._get_tokens(**corpora)
    
        
    def _get_tokens(self, **corpora):
        tokens = {}
        for item, corpus in tqdm_notebook(list(corpora.items())[:]):
            tokens[item] = [[w[0] for w in self.tagger.tag(corp) if w[1] is not None] for corp in set(corpus)]
        return tokens
                    
    def __iter__(self):
        for item, toks in self.tokensdict.items():
            for _toks in toks:
                yield TaggedDocument(words=_toks, tags=[item])
            
    def __len__(self):
        return sum([len(toks) for toks in self.tokensdict.values()])

In [55]:
%%time
fnames = ['scrapped/twitter/' + fname for fname in os.listdir('scrapped/twitter')]
jcorpus = JsonCorpus(*fnames)

 100.00% completed

Wall time: 25.7 s


In [6]:
jcorpus.corpus['coteciel']

['140815 nayeon instagram update 집념의 한국인 구했다 잘메고다녀주마 꼬떼씨엘 coteetciel 백팩 coteetcielofficial',
 '140815 nayeon instagram update 집념의 한국인 구했다 잘메고다녀주마 꼬떼씨엘 coteetciel 백팩 coteetcielofficial',
 'nayeonstagram 집념의 한국인 구했다 잘메고다녀주마 꼬떼씨엘 coteetciel 백팩 coteetcielofficial',
 'nayeonstagram 집념의 한국인 구했다 잘메고다녀주마 꼬떼씨엘 coteetciel 백팩',
 '상위태그 3시간 1 도화선 2 친일파 3 패스트코 4 럭셔리 5 럭셔리그램 6 백스타그램 7 꼬떼씨엘 8 luxerystyle 9 애플 10 인용 11 어린이날',
 '상위태그 3시간 1 도화선 2 친일파 3 패스트코 4 럭셔리 5 quote 6 럭셔리그램 7 백스타그램 8 꼬떼씨엘 9 luxerystyle 10 지방재정개혁',
 '상위태그 3시간 1 도화선 2 친일파 3 luxerystyle 4 백스타그램 5 럭셔리 6 패스트코 7 애플 8 꼬떼씨엘 9 럭셔리그램 10 어린이날',
 'repost 패스트코 훼이스북에서 가져왔어요 꼬떼씨엘 애플 백스타그램 럭셔리 럭셔리_익스피리언스 럭셔리그램 luxerystyle',
 '패스트코 훼이스북에서 가져왔어요 꼬떼씨엘 애플 백스타그램 럭셔리 럭셔리_익스피리언스 럭셔리그램 luxerystyle']

# Making Dictionary

### 1. From Scraping keywords

In [58]:
df_keywords = pd.read_excel('keywords and logos.xlsx', sheet_name='20190304')['keywords']
keywords = {w.strip() for w in ', '.join(df_keywords).split(',')}

### 2. Soynlp nouns

In [59]:
noun_extractor = LRNounExtractor_v2(verbose=True)
_soynouns = noun_extractor.train_extract(jcorpus, min_noun_score=0.3, min_noun_frequency=5)

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=1260, neg=1173, common=12
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 493329 from 429969 sents. mem=0.485 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=6680411, mem=1.576 Gb
[Noun Extractor] batch prediction was completed for 112092 words
[Noun Extractor] checked compounds. discovered 106245 compounds
[Noun Extractor] postprocessing detaching_features : 23448 -> 23277
[Noun Extractor] postprocessing ignore_features : 23277 -> 23178
[Noun Extractor] postprocessing ignore_NJ : 23178 -> 22851
[Noun Extractor] 22851 nouns (106245 compounds) with min frequency=5
[Noun Extractor] flushing was done. mem=1.816 Gb                    
[Noun Extractor] 75.02 % eojeols are covered


In [60]:
soynouns = _soynouns.keys()
soyngrams = {v for k,v in noun_extractor._compounds_components.items() if k in soynouns}

In [61]:
word_extractor = WordExtractor()
word_extractor.train(jcorpus)
_soywords = word_extractor.extract()

training was done. used memory 2.627 Gbse memory 2.781 Gb
all cohesion probabilities was computed. # words = 264323
all branching entropies was computed # words = 446112
all accessor variety was computed # words = 446112


In [62]:
def word_score(score):
    return score.cohesion_forward * math.exp(score.right_branching_entropy)

soywords = {word for word, score in _soywords.items() if word_score(score)>0.1}

### 3. Korean words

In [63]:
with open('dic_system.txt', encoding='UTF-8-sig') as f:
    lines = f.readlines()
    
kowords = {tok.split('\t')[0] for tok in lines}

### 4. English words

In [64]:
enwords = set(WORDS.words())

### 5. Custom words

In [65]:
cwords = '''
'''

cwords = set(re.findall(r'[^ ,]+', re.sub(r'\n', '', cwords)))

In [66]:
pos_dict = {
    'Adverb': {}, 
    'Noun': keywords | soynouns | soywords | kowords | enwords | cwords, 
    'Josa': {},
    'Verb': {},
    'Adjective': {},
    'Exclamation': {},
}

dictionary = Dict(pos_dict)
generator = LRTemplateMatcher(dictionary)    
evaluator = LREvaluator()
postprocessor = UnknowLRPostprocessor()
tagger = SimpleTagger(generator, evaluator, postprocessor)

In [67]:
tagger.tag(jcorpus.corpus['coteciel'][1]);

In [68]:
%%time
tokens = jcorpus.tokenize(tagger)

HBox(children=(IntProgress(value=0, max=327), HTML(value='')))


Wall time: 3min 20s


In [19]:
tokens.tokensdict['crocs'];

In [71]:
tokens.tokensdict['engineeredgarment']

[]

# Tagger 객체 저장

In [20]:
with open('model/tagger.pickle', 'wb') as f:
    pickle.dump(tagger, f)

In [6]:
with open('model/tagger.pickle', 'rb') as f:
    tagger = pickle.load(f)

# Doc2vec

In [26]:
d2v = Doc2Vec(tokens, vector_size=100, window=5, min_count=10, workers=4)
d2v.save('model/doc2vec.model')

In [61]:
d2v.init_sims(replace=True)

In [3]:
d2v = Doc2Vec.load('model/doc2vec.model')

In [40]:
d2v.docvecs.most_similar(positive=['nike'], negative=['adidas'])

[('supreme', 0.2928835153579712),
 ('acronym', 0.23972950875759125),
 ('underarmour', 0.21875041723251343),
 ('woolrich', 0.21489018201828003),
 ('8seconds', 0.20763862133026123),
 ('gyakusou', 0.20322123169898987),
 ('beanpole', 0.19406844675540924),
 ('poloralphlauren', 0.18710462749004364),
 ('saintlaurent', 0.18576635420322418),
 ('wilson', 0.18348200619220734)]

In [7]:
d2v.docvecs.most_similar(positive=['nike','adidas'], negative=['adidas', 'nike'])

[('prospecs', 0.10010688006877899),
 ('bally', 0.08569294959306717),
 ('teva', 0.08115921914577484),
 ('samsonite', 0.08079318702220917),
 ('sketchers', 0.08073700964450836),
 ('fila', 0.07536489516496658),
 ('asics', 0.07459881901741028),
 ('mandarinaduck', 0.07058194279670715),
 ('newbalance', 0.06999332457780838),
 ('salomon', 0.062280572950839996)]

1

In [7]:
def _simwords(what, n):
    toks = [w[0] for w in tagger.tag(what)]
    _toks = [w for w in toks if w in d2v.wv.vocab]
    sw = [w[0] for w in d2v.wv.most_similar(positive=_toks, topn=n)] + toks
    return set(sw)

def _diffwords(what, n):
    toks = [w[0] for w in tagger.tag(what)]
    _toks = [w for w in toks if w in d2v.wv.vocab]
    sw = [w[0] for w in d2v.wv.most_similar(negative=_toks, topn=n)]
    return set(sw)

def _simwords_vec(what, n):
    return [d2v.wv[w] for w in _simwords(what, n) if w in d2v.wv.vocab]

def _diffwords_vec(what, n):
    return [d2v.wv[w] for w in _diffwords(what, n) if w in d2v.wv.vocab]

In [37]:
# _simwords(searchkey, 30);

In [38]:
# _simwords_vec(searchkey, 30);

In [11]:
# %%time
searchkey = '럭셔리 가방'
poskey = [d2v.infer_vector(_simwords(searchkey, 30), epochs=300)]#, alpha=0.25)]
d2v.docvecs.most_similar(positive=poskey, topn=30)

[('pleasuresclothing', 0.7459573149681091),
 ('coteciel', 0.7457472085952759),
 ('ferragamo', 0.741430401802063),
 ('isseymiyake', 0.7241898775100708),
 ('tods', 0.7209071516990662),
 ('rimowa', 0.7207834720611572),
 ('bottegaveneta', 0.7076842784881592),
 ('samsonite', 0.7069516777992249),
 ('yojiyamamoto', 0.7044141292572021),
 ('pearlygates', 0.6999064087867737),
 ('pleatsplease', 0.6995478868484497),
 ('marceloburlon', 0.6986268162727356),
 ('montblanc', 0.6965562105178833),
 ('tartoptical', 0.693993330001831),
 ('thursdayisland', 0.6939331293106079),
 ('toryburch', 0.692609965801239),
 ('sandro', 0.6910876035690308),
 ('tumi', 0.6908586025238037),
 ('urutokyo', 0.6858264207839966),
 ('maje', 0.6835768818855286),
 ('jlindeberg', 0.6832438111305237),
 ('n21', 0.683185875415802),
 ('girlsdontcry', 0.6830391883850098),
 ('system', 0.6829767227172852),
 ('moscot', 0.6819743514060974),
 ('rokit', 0.6817741990089417),
 ('gstarraw', 0.6799370050430298),
 ('noah', 0.6799349784851074),
 ('c

In [9]:
poskey

[array([-8.6139017e-01,  1.3809034e+00, -3.6173780e+00, -4.3824415e+00,
        -6.7872268e-01, -2.3520727e+00,  4.4996619e-02,  2.7027228e+00,
        -1.2748922e+00, -2.9831495e+00, -6.1643362e-01, -3.4834541e-02,
         2.6739433e+00,  2.8210809e+00, -2.7420094e+00,  1.4950844e+00,
         5.2327985e-01, -1.1583772e+00, -1.3943038e+00, -1.3331316e+00,
         4.2071748e+00,  1.6728294e+00, -1.4668418e+00,  2.1192832e-02,
         2.0961373e-01,  6.7499959e-01,  2.3460326e+00, -2.1874182e+00,
         5.0290227e-01, -4.2099315e-01, -6.6759300e-01, -3.4276562e+00,
        -8.2263821e-01, -6.5828300e+00, -3.9288304e+00,  3.7724199e+00,
        -1.4596027e+00,  3.4219873e-01, -2.3048851e+00, -3.8439784e+00,
        -1.1404902e+00,  3.9055903e+00, -2.5293424e+00,  8.8097715e-01,
        -3.6117952e+00, -2.0248580e+00,  3.3790348e+00, -2.5603082e+00,
        -4.9714756e+00,  2.2904272e+00, -2.1371818e+00, -1.3951428e+00,
        -3.2699823e+00,  1.8848761e+00, -1.9107701e+00, -4.15830

In [15]:
# 요게 제일 나은듯

# %%time
searchkey = '럭셔리 가방'
poskey = _simwords_vec(searchkey, 30)
d2v.docvecs.most_similar(positive=poskey, topn=20)

[('isseymiyake', 0.773637056350708),
 ('bottegaveneta', 0.765144944190979),
 ('coteciel', 0.7550020217895508),
 ('ferragamo', 0.7382534742355347),
 ('toryburch', 0.7357887029647827),
 ('vivianwestwood', 0.7201783657073975),
 ('paulsmith', 0.7198200225830078),
 ('charleskeith', 0.7170199155807495),
 ('plasticisland', 0.7149162292480469),
 ('pleasuresclothing', 0.7138164043426514),
 ('moscot', 0.7100235819816589),
 ('coach', 0.7088214159011841),
 ('rimowa', 0.7085838913917542),
 ('pleatsplease', 0.7080258727073669),
 ('tods', 0.7078876495361328),
 ('ermenegildozegna', 0.7052038311958313),
 ('time', 0.7050747275352478),
 ('cherryla', 0.7040436267852783),
 ('pearlygates', 0.7014678120613098),
 ('balenciaga', 0.6943455934524536)]

In [54]:
%%time
d2v.docvecs.most_similar(positive=[d2v.wv['럭셔리']], topn=20)

Wall time: 9 ms


[('montblanc', 0.6617202758789062),
 ('sacai', 0.6157494783401489),
 ('ermenegildozegna', 0.6094779968261719),
 ('hermes', 0.609326183795929),
 ('nationalgeography', 0.601405143737793),
 ('balmain', 0.599767804145813),
 ('vivianwestwood', 0.5972344875335693),
 ('clubmonaco', 0.5735753774642944),
 ('tods', 0.5730624198913574),
 ('coteciel', 0.5692532062530518),
 ('yojiyamamoto', 0.5672576427459717),
 ('bottegaveneta', 0.5626093149185181),
 ('hugoboss', 0.552722156047821),
 ('maxmara', 0.5466232299804688),
 ('prada', 0.5454100370407104),
 ('moscot', 0.5445988178253174),
 ('pearlygates', 0.5401676297187805),
 ('dolcegabbana', 0.5382061004638672),
 ('bally', 0.538009524345398),
 ('monclear', 0.5378274321556091)]

In [55]:
d2v.docvecs.most_similar(positive=_simwords_vec(searchkey,30), negative=_diffwords_vec(searchkey,30), topn=300)

[('isseymiyake', 0.7528330087661743),
 ('bottegaveneta', 0.7475889921188354),
 ('coteciel', 0.7206209301948547),
 ('toryburch', 0.7173346281051636),
 ('ferragamo', 0.7162785530090332),
 ('vivianwestwood', 0.7113078832626343),
 ('plasticisland', 0.705762505531311),
 ('pearlygates', 0.6973544359207153),
 ('moscot', 0.693939208984375),
 ('paulsmith', 0.6909399628639221),
 ('rimowa', 0.6867916584014893),
 ('tods', 0.685397744178772),
 ('time', 0.6836609840393066),
 ('ermenegildozegna', 0.6833614110946655),
 ('charleskeith', 0.6809303760528564),
 ('pleasuresclothing', 0.6787161827087402),
 ('pleatsplease', 0.6782567501068115),
 ('coach', 0.6744545698165894),
 ('balenciaga', 0.6695948839187622),
 ('tumi', 0.6627806425094604),
 ('cherryla', 0.6617143750190735),
 ('maisonmargiela', 0.659013032913208),
 ('celine', 0.6581742763519287),
 ('alexandermcqueen', 0.6580557823181152),
 ('montblanc', 0.6545995473861694),
 ('system', 0.653503954410553),
 ('yojiyamamoto', 0.6517916917800903),
 ('versace',

# 유사 브랜드 찾기

In [43]:
d2v.wv.most_similar(positive=[d2v.docvecs['nike']])

[('축구선수', 0.750069797039032),
 ('footballer', 0.7410778403282166),
 ('운동선수', 0.7308176755905151),
 ('조던', 0.7287521958351135),
 ('운동부', 0.7230668067932129),
 ('justdoit', 0.7171881794929504),
 ('축구부', 0.7166173458099365),
 ('루나글라이드', 0.71532142162323),
 ('갸쿠소우', 0.7120037078857422),
 ('moreuptemp', 0.711184024810791)]

In [38]:
id_desc = {
    '럭셔리': '럭셔리 고급 호화 과시 명품 luxury 비싼 고가 expensive pricy pricey', 
    '캐주얼': '캐주얼 캐쥬얼 casual 스타일리시 스타일리쉬 stylish', 
    
    '유니크': '유니크 독특 독창 unique 개성 only 참신 신선 특이 아이디어 철학',
    '대중성': '대중 popular 널리 흔한 massive mass 대중성', 
    
    '정통성': '정통 클래식 classic 품격 약속 신뢰 믿음 예측 견고 품질 안정', #traditional 트레디셔널
    '트렌디': '트렌디 트랜디 트렌드 트랜드 유행 trend trendy 변화 새로운 민감 예민 신상 최신',
    
    '포멀': '포멀 formal 노멀 normal 평범 일상 무난 기본 베이스 베이직 base basic',
    '액티브': '화제 인기 hot 튀는 액티브 active 앞서가는 실험 과감 선도 선구 대담', 
    
    #'가성비': ['가성비','저렴','효율','성능','실용'], 
    #'신뢰성': ['신뢰','믿음','trust','견고','품질','안정','클래식','classic'],
    #'활동성': ['활동','활발','운동','액티브','스포츠','active','sport','sports','sporty'],
    #'과감함': ['과감','선도','선구','대담','강렬','선명','예술','art'],
    # 철학, 환상, 신상, 새로운, 
}

In [45]:
id_pairs = [('럭셔리','캐주얼'), ('유니크','대중성'), ('정통성','트렌디'), ('포멀','액티브')]

In [75]:
d2v.wv.cosine_similarities(d2v.docvecs['nike'], [d2v.infer_vector(['나이키'], epochs=500)])[0]

0.5551831

In [46]:
bid = {}
for _id, desc in id_desc.items():
#     val_tup = d2v.docvecs.most_similar(_simwords_vec(desc, 30), topn=400)
#     val_tup = d2v.docvecs.most_similar([d2v.infer_vector(_simwords(desc, 30), epochs=500)], topn=400)
    val_tup = d2v.docvecs.most_similar([d2v.infer_vector(desc.split(' '), epochs=500)], topn=400)
    
    for bname, val in val_tup:
        if bname in ['ootd','fashion','category']:
            continue
            
        if bname in bid:
            bid[bname][_id] = val
        else:
            bid[bname] = {_id:val}
            
bid = pd.DataFrame(bid).T

In [47]:
identity = bid.copy()

for pair in id_pairs:
    s0 = identity[pair[0]]
    s1 = identity[pair[1]]
    identity[pair[0]] = s0/(s0+s1)
    identity[pair[1]] = s1/(s0+s1)

scaler = preprocessing.MinMaxScaler(feature_range=(0.1, 1))
X = scaler.fit_transform(identity.T)
identity[:] = X.T
identity = (identity*100).astype(int)

In [50]:
identity.loc[['uniqlo','nike','chanel','gucci','hermes','bape']]

Unnamed: 0,대중성,럭셔리,액티브,유니크,정통성,캐주얼,트렌디,포멀
uniqlo,83,61,73,77,99,57,9,58
nike,79,100,73,31,75,10,33,33
chanel,40,100,73,71,64,10,59,25
gucci,53,100,45,33,62,9,29,37
hermes,59,99,27,44,73,9,36,59
bape,78,20,9,38,71,68,21,99


In [22]:
identity.loc[['uniqlo','nike','chanel','gucci','hermes','bape']]

Unnamed: 0,대중성,럭셔리,액티브,유니크,정통성,캐주얼,트렌디,포멀
uniqlo,20,33,46,83,100,88,10,77
nike,66,100,62,24,80,10,17,24
chanel,29,100,28,57,54,9,44,67
gucci,39,100,33,35,66,9,19,53
hermes,51,100,27,50,66,9,33,62
bape,65,76,9,49,98,25,12,100


In [34]:
identity.to_pickle('model/identity.pkl')

In [33]:
identity

Unnamed: 0,대중성,럭셔리,액티브,유니크,정통성,캐주얼,트렌디,포멀
coteciel,99,64,65,9,44,19,55,11
yojiyamamoto,86,99,72,30,59,10,64,15
pleasuresclothing,100,48,55,9,58,44,48,25
ermenegildozegna,63,95,100,55,39,11,91,10
plasticisland,84,31,100,30,33,73,84,10
moscot,76,100,88,49,54,10,77,23
marceloburlon,100,73,79,20,74,32,60,9
isseymiyake,56,99,73,58,59,9,69,44
n21,99,96,55,31,75,9,48,43
system,90,10,89,28,46,99,83,9


In [24]:
d2v.docvecs.most_similar('chanel', topn=400)

[('louisvuitton', 0.7209425568580627),
 ('hermes', 0.6719557046890259),
 ('dior', 0.6496135592460632),
 ('versace', 0.6361387372016907),
 ('tomford', 0.6261129975318909),
 ('marcjacobs', 0.5925168991088867),
 ('bottegaveneta', 0.5862009525299072),
 ('balenciaga', 0.5817146301269531),
 ('saintlaurent', 0.5796976089477539),
 ('isseymiyake', 0.5787030458450317),
 ('vivianwestwood', 0.5774768590927124),
 ('dolcegabbana', 0.5720545053482056),
 ('lanvin', 0.5660214424133301),
 ('toryburch', 0.5644780397415161),
 ('tods', 0.5641356706619263),
 ('celine', 0.5606419444084167),
 ('balmain', 0.5577264428138733),
 ('michaelkors', 0.5570850968360901),
 ('mulberry', 0.5569880604743958),
 ('montblanc', 0.5558047890663147),
 ('ferragamo', 0.5545101761817932),
 ('marni', 0.551742672920227),
 ('n21', 0.5498756170272827),
 ('marceloburlon', 0.5478085875511169),
 ('juunj', 0.5477746725082397),
 ('moscot', 0.5473437309265137),
 ('moschino', 0.5472389459609985),
 ('rimowa', 0.54493647813797),
 ('maxmara', 0

In [23]:
sorted([5,3,2,9])[:2]

[2, 3]

In [73]:
'engineeredgarment' in d2v.docvecs

False