In [1]:
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np
from nltk.corpus import stopwords
import psycopg2
import pandas as pd
import os
import dill
import tensorflow as tf
from tensorflow.keras.applications import mobilenet_v2
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

In [2]:
def desc_standardize(desc):
    text = desc.strip()
    linkregex=r"((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)"
    text = re.sub(linkregex,'',text)
    text = re.sub(r"[^a-zA-Z\d\s]", '', text)
    text = text.lower()
    s=set(stopwords.words('english'))
    text = ' '.join([x for x in text.split() if not x in s])
    text = ' '.join([x for x in text.split() if not bool(re.search(r'\d', x))])
    return text

In [3]:
def getimgfeatures(imgname):
    imgmodel = mobilenet_v2.MobileNetV2(input_shape=(224,224,3),weights='imagenet',include_top=False,pooling='avg')
    image_string = tf.read_file(imgname)
    image_decoded = tf.image.decode_jpeg(image_string,channels=3)
    image_resized = tf.image.resize_images(image_decoded, [224, 224])
    img = tf.expand_dims(image_resized, axis=0)
    img = tf.Session().run(img)
    image = preprocess_input(img)
    img_feat = imgmodel.predict(image)
    return img_feat

In [6]:
userdesc = 'gold necklace on a chain'
img='/home/eli/code/insight/etsy/etsyitems/images/full/0aaeab5e8c5655813cc79f5b29ca29b8cabdb3e7.jpg'
TEMP_FOLDER = '/home/eli/code/insight/etsy/etsyitems/nlp3'
wd = '/home/eli/code/insight/etsy/etsyitems/'

In [7]:
d2vmodel = Doc2Vec.load(os.path.join(TEMP_FOLDER, 'necklaces.d2v'))
text = desc_standardize(userdesc)
desc_vec=d2vmodel.infer_vector(text, steps=40, alpha=0.025)
X_tr_d2v = np.load('d2v-features.npy')
d2v_cossim = cosine_similarity(X_tr_d2v,desc_vec.reshape(1, -1))
X_img = np.load('img_features.npy')
img_feat = getimgfeatures(img)
img_cossim = cosine_similarity(X_img,img_feat.reshape(1, -1))
eps = 1e-15

In [8]:
img_cossim.mean()

0.4166495769430252

In [9]:
df = pd.read_json('cleandf.json')
np.putmask(d2v_cossim, d2v_cossim>1-eps,-1.)
np.putmask(img_cossim, img_cossim>1-eps,-1.)
fullindd2v = np.argpartition(d2v_cossim, -5,axis=0)[-5:,0]
topd2vsim = d2v_cossim[fullindd2v].T[0]
prices_d2vsim=df['price'].values[fullindd2v]
fullindimg = np.argpartition(img_cossim, -5,axis=0)[-5:,0]
topimgsim = img_cossim[fullindimg].T[0]
prices_imgsim=df['price'].values[fullindimg]
X = np.concatenate([topd2vsim.T,prices_d2vsim.T,topimgsim.T,prices_imgsim.T]).reshape(1,-1)
rfmodel =  dill.load(open(wd + "rfimgd2vmodel.obj","rb"))



In [12]:
import sklearn
sklearn.__version__

'0.20.2'

In [27]:
rfmodel.predict(X)

array([120.79947118])

In [35]:
prices_d2vsim>rfmodel.predict(X)

array([False, False,  True, False, False])

In [28]:
(prices_d2vsim.max()-prices_d2vsim.min())/2

64.5

In [29]:
prices_imgsim

array([38.  , 17.97, 17.97, 17.98, 17.97])

In [30]:
np.min([(prices_imgsim.max()-prices_imgsim.min())/2,(prices_d2vsim.max()-prices_d2vsim.min())/2])

10.015

In [31]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [36]:
import nltk
fdist = nltk.FreqDist(flatten([txt.split() for txt in df['adesc'].values[fullindd2v[prices_d2vsim>rfmodel.predict(X)]]]))

In [104]:
res = 5.0
rng = 4.0
"%.2f" % (res-rng/2.0)

'3.00'

In [38]:
for word, frequency in fdist.most_common(5):
    if not word in userdesc:
        print(u'{};{}'.format(word, frequency))

wrapped;2
beautiful;2
watermelon;2
tourmaline;2


In [39]:
imgsim = np.load('img_cossim.npy')

In [61]:
modimgsim = imgsim[imgsim.any(axis=1)][:,imgsim.any(axis=0)]

In [97]:
modimgsim.std()

0.08556144698489522

In [98]:
imgsim.mean()-imgsim.std()

0.306601088262517

In [88]:
imgsim.std()

0.0954301662580221

In [50]:
imgsim.mean(axis=1).shape

(6975,)

In [47]:
np.concatenate([topd2vsim.T,prices_d2vsim.T,topimgsim.T,prices_imgsim.T]).reshape(1,-1).shape

(1, 20)

In [13]:
d2v_cossim.T

array([[-0.05196977, -0.03922364,  0.07207116, ..., -0.01276831,
         0.06640133,  0.00231631]], dtype=float32)

In [20]:
np.argpartition(d2v_cossim.T, -5,axis=1)[0,-5:]

array([3478, 2330, 4887,  616, 5319])

In [22]:
np.argpartition(d2v_cossim, -5,axis=0)[-5:,0]

array([3478, 2330, 4887,  616, 5319])

In [30]:
topd2vsim = d2v_cossim[fullindd2v].T[0]

In [31]:
topd2vsim

array([0.34305918, 0.36692557, 0.42390436, 0.43762705, 0.43762705],
      dtype=float32)

In [25]:
prices_d2vsim=df['price'].values[fullindd2v]

In [27]:
prices_d2vsim

array([ 96.  ,  45.99,  40.99,  55.  , 100.  ])