In [2]:
import os
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import multiprocessing
import gzip
import pandas as pd
import csv

In [3]:
# Read the tab-separated listing titles. Quote characters are treated as
# literal text (QUOTE_NONE) and rows pandas cannot parse are skipped.
listing_titles = pd.read_csv(
    './dataset/Listing_Titles.tsv',
    encoding='utf8',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
)

In [4]:
# Read the token-level tagged training titles (tab-separated, quotes kept as
# literal text, unparseable rows skipped).
#
# NA handling is restricted on purpose: the code below treats a NaN in the
# 'Tag' column as "this token continues the previous one", so only an empty
# 'Tag' cell is parsed as NaN. With pandas' default NA list, literal tokens
# such as "NA", "N/A", "null" or "None" would also become float NaN and break
# the string concatenation / .lower() calls applied to tokens afterwards.
tagged_titles = pd.read_csv(
    './dataset/Train_Tagged_Titles.tsv',
    sep='\t',
    on_bad_lines='skip',
    quoting=csv.QUOTE_NONE,
    encoding='utf8',
    keep_default_na=False,
    na_values={'Tag': ['']},
)

# Tag labels kept as a module-level list; not referenced in this cell.
# NOTE(review): presumably the full label set of the 'Tag' column - confirm.
valid_tags = [
    'Accents', 'Brand', 'Character', 'Character Family', 'Closure', 'Color',
    'Country/Region of Manufacture', 'Department', 'Fabric Type', 'Features',
    'Handle Drop', 'Handle Style', 'Handle/Strap Material', 'Hardware Material',
    'Lining Material', 'MPN', 'Material', 'Measurement, dimension', 'Model',
    'Occasion', 'Pattern', 'Pocket Type', 'Product Line', 'Season', 'Size',
    'Strap Drop', 'Style', 'Theme', 'Trim Material', 'Type',
]

# Gather each column into one list per record, keyed by 'Record Number'.
# groupby sorts its keys, so the dicts iterate in ascending record order.
_by_record = tagged_titles.groupby('Record Number')
tokens = _by_record['Token'].apply(list).to_dict()
tags = _by_record['Tag'].apply(list).to_dict()
texts = _by_record['Title'].apply(list).to_dict()

# One list of (token, tag) pairs per record. Iterating the grouped keys instead
# of range(1, N + 1) avoids a KeyError when record numbers are not exactly
# 1..N (for example when on_bad_lines='skip' dropped every row of a record).
raw_tokenized_data = [list(zip(tokens[record], tags[record])) for record in tags]

# Append NaN-tagged tokens to the previous token,
# e.g. append "Vuitton" to "Louis" in "Louis Vuitton".
# Walking right-to-left lets a run of NaN-tagged tokens fold into the first
# tagged token of the run in a single pass.
for pairs in raw_tokenized_data:
    for pos in reversed(range(1, len(pairs))):
        token, tag = pairs[pos]
        if pd.isna(tag):
            previous_token, previous_tag = pairs[pos - 1]
            pairs[pos - 1] = (previous_token + " " + token, previous_tag)

# Drop the NaN-tagged pairs now that their text has been merged leftwards.
# A NaN-tagged token in first position has nothing to merge into and is dropped.
trimmed_tokenized_data = [
    [pair for pair in pairs if not pd.isna(pair[1])]
    for pairs in raw_tokenized_data
]

# Lower-cased token sequences, one per record; tags are discarded here.
sentences = [
    [token.lower() for token, _tag in pairs]
    for pairs in trimmed_tokenized_data
]
sentences

[['louis vuitton',
  'm40096',
  'handbag',
  'priscilla',
  'multi-color',
  'canvas',
  'multi-color',
  'canvas'],
 ['louis vuitton',
  'petit noe',
  'drawstring',
  'shoulder',
  'bag',
  'monogram',
  'leather',
  'm42226',
  '39sd442'],
 ['louis vuitton',
  'damier',
  'azur',
  'pochette',
  'bosphore',
  'shoulder',
  'bag',
  'n51112',
  'lv',
  'auth',
  'yt523'],
 ['gucci',
  'bamboo',
  '2way',
  'shoulder',
  'bag',
  'leather',
  'brown',
  'auth',
  'fm1002'],
 ['rank',
  'ab',
  'vintage',
  'gucci',
  'sherry line',
  'pvc leather',
  'clutch',
  'bag',
  'brown',
  'from',
  'japan',
  'a128'],
 ['1970s',
  'nyc',
  'bonnie cashin',
  'coach',
  'brown',
  'gray',
  'leather',
  'saddle',
  'pouch',
  'crossbody',
  'bag'],
 ['louis vuitton',
  'epi',
  'serviette fermoir',
  'business',
  'bag',
  'brown',
  'm54358',
  'lv',
  'auth',
  'gt2071'],
 ['womens', 'handbag'],
 ['chanel', 'leo lion', 'flap', 'bag', 'chevron', 'lambskin', 'medium'],
 ['gucci',
  'white',


In [5]:
def create_wordvecs(corpus, model_name):
    """Train a Word2Vec model on ``corpus`` and save its vectors as text.

    Steps: detect frequent phrases in ``corpus``, rewrite the sentences with
    those phrases joined, train Word2Vec on the result, print the nearest
    neighbours of "chanel" as a quick sanity check, then write the vectors to
    ``trainset/<model_name>.txt`` in word2vec text format.

    Progress messages and counts are printed along the way.

    Args:
        corpus: Sized, re-iterable collection of sentences, each a list of
            string tokens (``len(corpus)`` is called and it is iterated more
            than once).
        model_name: File stem used for the saved vectors file.

    Returns:
        None.
    """
    from gensim.models.word2vec import Word2Vec
    from gensim.models.phrases import Phrases, Phraser

    print (len(corpus))

    phrases = Phrases(corpus, min_count=30, progress_per=10000)
    print ("Made Phrases")

    bigram = Phraser(phrases)
    print ("Made Bigrams")

    # Apply the frozen phraser (previously built but never used) and
    # materialise the result once: the transformed corpus is consumed by the
    # vocabulary count, build_vocab and every training epoch, and a lazy view
    # would redo phrase detection on each of those passes.
    phrased_sentences = list(bigram[corpus])
    print ("Found sentences")

    # Number of distinct tokens after phrase merging.
    vocabulary = {token for sentence in phrased_sentences for token in sentence}
    print (len(vocabulary))

    print ("Training model now...")
    w2v_model = Word2Vec(min_count=1,
                        window=2,
                        sample=6e-5,
                        alpha=0.03,
                        min_alpha=0.0007,
                        negative=20)
    w2v_model.build_vocab(phrased_sentences, progress_per=10000)
    w2v_model.train(phrased_sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=30,
                    report_delay=1)

    # Sanity probe only. Guarded so that a corpus without this word does not
    # raise KeyError and abort before the vectors are written below.
    probe_word = "chanel"
    if probe_word in w2v_model.wv:
        print(w2v_model.wv.most_similar(positive = [probe_word]))
    else:
        print(f"'{probe_word}' is not in the vocabulary; skipping similarity check")

    os.makedirs("./trainset", exist_ok=True)
    w2v_model.wv.save_word2vec_format(f"trainset/{model_name}.txt")
# Train on the lower-cased token sentences and save the vectors under the
# "word_vecs" file stem.
create_wordvecs(corpus=sentences, model_name="word_vecs")

10
Made Phrases
Made Bigrams
Found sentences
63
Training model now...
[('hobo', 0.2650150954723358), ('lv', 0.2151338905096054), ('auth', 0.19011564552783966), ('saddle', 0.17705972492694855), ('m40096', 0.1452275812625885), ('handbag', 0.1442415565252304), ('gray', 0.14372088015079498), ('petit noe', 0.09479113668203354), ('yt523', 0.08924536406993866), ('sherry line', 0.08247613161802292)]
