In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

In [2]:
df = pd.read_json('etsynecklaces10000.json')

In [3]:
len(df)

9967

In [4]:
usdf=df[df['cur']=='USD']
len(usdf)

8257

In [5]:
def text_cleanup(text):
    text = list(set([txt.strip() for txt in text]))
    text = ' '.join(text).strip()
    linkregex = r"((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)"
    text = re.sub(linkregex, '', text)
    text = re.sub(r"[^a-zA-Z\d\s]", '', text)
    text = text.lower()
    return text
    
def remove_stopwords(text):
    s=set(stopwords.words('english'))
    text = ' '.join([x for x in text.split() if not x in s])
    return text

def remove_numbers(text):
    text = ' '.join([x for x in text.split() if not bool(re.search(r'\d', x))])
    return text

In [6]:
usdf['price'] = pd.to_numeric(usdf['price'].apply(lambda x: x.replace(',','')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [143]:
len(usdf[usdf['price']>500])

269

In [8]:
usdf['description']=usdf['description'].apply(lambda x: text_cleanup(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
usdf['desc']=usdf['description'].apply(lambda x: remove_stopwords(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
from nltk.stem.snowball import SnowballStemmer

def stem_words(text):
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    text = ' '.join([stemmer.stem(x) for x in text.split()])
    return text

In [None]:
from collections import Counter

def viterbi_segment(text):
    probs, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        #(log(1/total)-max_word_len-1)*(j-i) 
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    words = []
    i = len(text)
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words, probs[-1]

def word_prob(word): return dictionary_US[word] / total
def words(text): return re.findall('[a-z]+', text.lower()) 
dictionary_US = Counter(words(open('/home/eli/Downloads/big.txt').read()))
max_word_length = max(map(len, dictionary_US))
total = float(sum(dictionary_US.values()))

with open('/usr/share/dict/american-english') as fdict:
    US_dict = fdict.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
US_dict = [x.strip() for x in US_dict] 

In [11]:
usdf['adesc'] = usdf['desc'].apply(lambda x: remove_numbers(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    usdf['adesc'].values, usdf['price'].values, test_size=0.25, random_state=20)

In [13]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import os
import tempfile
TEMP_FOLDER = "/home/eli/code/insight/etsy/etsyitems/nlp"
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))

Folder "/home/eli/code/insight/etsy/etsyitems/nlp" will be used to save temporary dictionary and corpus.


In [14]:
X_train.shape

(6192,)

In [15]:
X_test.shape

(2065,)

In [16]:
train_texts = [text.split() for text in X_train]

In [17]:
test_texts = [text.split() for text in X_test]

# VECTOR SPACE MODEL #

In [82]:
from sklearn.preprocessing import normalize
def normvec(vec):
    normv = normalize(vec[:,np.newaxis], axis=0).ravel()
    return normv

normvec(np.array([1,1]))

array([0.70710678, 0.70710678])

In [66]:
def vec_normalize(vec):
    with np.errstate(divide='raise'):
        try:
            res = vec/np.sqrt(np.sum(np.square(vec)))
        except FloatingPointError: 
            print(vec)
            res=None
    return res

assert(np.array_equal(vec_normalize([1,0]),[1,0]))

## TFIDF ##

In [22]:
from gensim import corpora
dictionary = corpora.Dictionary(train_texts)
dictionary.save(os.path.join(TEMP_FOLDER, 'necklaces.dict'))  # store the dictionary, for future reference
#dictionary = corpora.dictionary.Dictionary.load(os.path.join(TEMP_FOLDER, 'necklaces.dict'))
print(dictionary)

2019-01-29 14:17:45,052 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-01-29 14:17:46,018 : INFO : built Dictionary(21051 unique tokens: ['actual', 'appear', 'barn', 'beautifully', 'beige']...) from 6192 documents (total 800541 corpus positions)
2019-01-29 14:17:46,019 : INFO : saving Dictionary object under /home/eli/code/insight/etsy/etsyitems/nlp/necklaces.dict, separately None
2019-01-29 14:17:46,027 : INFO : saved /home/eli/code/insight/etsy/etsyitems/nlp/necklaces.dict


Dictionary(21051 unique tokens: ['actual', 'appear', 'barn', 'beautifully', 'beige']...)


In [24]:
corpus = [dictionary.doc2bow(text) for text in train_texts]
corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'necklaces.mm'), corpus)
#corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'necklaces.mm'))

2019-01-29 14:18:20,665 : INFO : storing corpus in Matrix Market format to /home/eli/code/insight/etsy/etsyitems/nlp/necklaces.mm
2019-01-29 14:18:20,666 : INFO : saving sparse matrix to /home/eli/code/insight/etsy/etsyitems/nlp/necklaces.mm
2019-01-29 14:18:20,667 : INFO : PROGRESS: saving document #0
2019-01-29 14:18:20,823 : INFO : PROGRESS: saving document #1000
2019-01-29 14:18:20,970 : INFO : PROGRESS: saving document #2000
2019-01-29 14:18:21,132 : INFO : PROGRESS: saving document #3000
2019-01-29 14:18:21,285 : INFO : PROGRESS: saving document #4000
2019-01-29 14:18:21,435 : INFO : PROGRESS: saving document #5000
2019-01-29 14:18:21,591 : INFO : PROGRESS: saving document #6000
2019-01-29 14:18:21,625 : INFO : saved 6192x21051 matrix, density=0.431% (562122/130347792)
2019-01-29 14:18:21,627 : INFO : saving MmCorpus index to /home/eli/code/insight/etsy/etsyitems/nlp/necklaces.mm.index


In [25]:
from gensim import models
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
tfidf.save(os.path.join(TEMP_FOLDER, 'necklacesmodel.tfidf'))

2019-01-29 14:18:38,066 : INFO : collecting document frequencies
2019-01-29 14:18:38,067 : INFO : PROGRESS: processing document #0
2019-01-29 14:18:38,225 : INFO : calculating IDF weights for 6192 documents and 21050 features (562122 matrix non-zeros)
2019-01-29 14:18:38,293 : INFO : saving TfidfModel object under /home/eli/code/insight/etsy/etsyitems/nlp/necklacesmodel.tfidf, separately None
2019-01-29 14:18:38,393 : INFO : saved /home/eli/code/insight/etsy/etsyitems/nlp/necklacesmodel.tfidf


In [26]:
from gensim import similarities
index = similarities.MatrixSimilarity(tfidf[corpus]) # transform corpus to Tfidf space and index it
index.save(os.path.join(TEMP_FOLDER, 'necklacestfidfsim.index'))

2019-01-29 14:18:57,498 : INFO : creating matrix with 6192 documents and 21051 features
  if np.issubdtype(vec.dtype, np.int):
2019-01-29 14:19:03,082 : INFO : saving MatrixSimilarity object under /home/eli/code/insight/etsy/etsyitems/nlp/necklacestfidfsim.index, separately None
2019-01-29 14:19:03,082 : INFO : storing np array 'index' to /home/eli/code/insight/etsy/etsyitems/nlp/necklacestfidfsim.index.index.npy
2019-01-29 14:19:03,522 : INFO : saved /home/eli/code/insight/etsy/etsyitems/nlp/necklacestfidfsim.index


In [28]:
corpus_test = [dictionary.doc2bow(text) for text in test_texts]
corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'necklaces-test.mm'), corpus_test)

2019-01-29 14:20:48,851 : INFO : storing corpus in Matrix Market format to /home/eli/code/insight/etsy/etsyitems/nlp/necklaces-test.mm
2019-01-29 14:20:48,852 : INFO : saving sparse matrix to /home/eli/code/insight/etsy/etsyitems/nlp/necklaces-test.mm
2019-01-29 14:20:48,852 : INFO : PROGRESS: saving document #0
2019-01-29 14:20:49,003 : INFO : PROGRESS: saving document #1000
2019-01-29 14:20:49,140 : INFO : PROGRESS: saving document #2000
2019-01-29 14:20:49,149 : INFO : saved 2065x21050 matrix, density=0.421% (183062/43468250)
2019-01-29 14:20:49,151 : INFO : saving MmCorpus index to /home/eli/code/insight/etsy/etsyitems/nlp/necklaces-test.mm.index


In [56]:
Xtraintfidf = tfidf[corpus]

In [57]:
Xtesttfidf = tfidf[corpus_test]

## DOC2VEC

In [None]:
import gensim
def read_corpus(texts, tokens_only=False):
    for i, text in enumerate(texts):
        if tokens_only:
            yield gensim.utils.simple_preprocess(' '.join(text))
        else:
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(' '.join(text)), [i])
        

train_corpus = list(read_corpus(train_texts))
test_corpus = list(read_corpus(test_texts, tokens_only=True))

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

model.build_vocab(train_corpus)

%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

model.save(os.path.join(TEMP_FOLDER, 'necklaces.d2v'))

%time X_tr = np.array([model.infer_vector(train_corpus[i].words, steps=40, alpha=0.025) for i in range(len(train_corpus))])

%time X_tst = np.array([model.infer_vector(test_corpus[i], steps=40, alpha=0.025) for i in range(len(test_corpus))])

# Model fitting and hyperparameter tuning #

In [65]:
def score_r2(y_pred, y_act):
    assert(len(y_pred)==len(y_act))
    resid = y_pred-y_act
    ssresid = np.sum(np.square(resid))
    sstot = np.sum(np.square(y_act-np.mean(y_act)))
    r2 = 1-ssresid/sstot
    return r2

def rmse(y_pred, y_act):
    assert(len(y_pred)==len(y_act))
    return np.sqrt(np.sum(np.square(y_pred-y_act))/len(y_act))

def mape(y_pred, y_act):
    assert(len(y_pred)==len(y_act))
    return 100*np.sum(np.abs((y_pred-y_act)/y_act))/len(y_act)

## kNN regression ##

In [88]:
def knn_predict(k, X, y_train, weighting=False):
    y_pred =  np.zeros(len(X))
    for i in range(len(X)):
        sims = sorted(enumerate(index[X[i]]), key=lambda item: -item[1])[:k]
        if weighting:
            weights = np.square(normvec(np.array([v for (k,v) in sims])))
            if np.array_equal(weights,np.zeros(len(weights))):
                weights=None
            y_pred[i] = np.average([y_train[k] for (k,v) in sims], weights=weights)
        else:
            y_pred[i] = np.average([y_train[k] for (k,v) in sims])
        
    return y_pred

In [84]:
def hyperparamcvknn(klist, X, y, y_train):
    results = []
    for k in klist:
        for weighting in [True]:
            y_pred = knn_predict(k, X, y_train, weighting)
            results.append((score_r2(y_pred, y),[k,weighting]))
            
    return results

In [89]:
%%time
klist = np.linspace(10,11,num=1,dtype=int)
res2  = hyperparamcvknn(klist, Xtraintfidf, y_train, y_train)

CPU times: user 7min 41s, sys: 164 ms, total: 7min 41s
Wall time: 3min 51s


In [90]:
res2

[(0.768083584597864, [10, True])]

In [74]:
res.sort(key=lambda x: x[0])

In [75]:
res

[(nan, [10, True]), (0.3118361202556046, [10, False])]

In [91]:
def score_testdata(X_test,y_test,y_train,params):
    y_pred = knn_predict(params['k'],X_test,y_train,params['weighting'])
    return score_r2(y_pred,y_test)
                   
params = {'k': 10, 'weighting': True}    
print(score_testdata(Xtesttfidf, y_test,y_train,params))

0.49881934366911485


In [130]:
def rmse_testdata(X_test,y_test,y_train,params):
    y_pred = knn_predict(params['k'],X_test,y_train,params['weighting'])
    return rmse(y_pred,y_test)
                   
params = {'k': 10, 'weighting': True}    
print(rmse_testdata(Xtesttfidf, y_test,y_train,params))

287.5711151737237


In [134]:
def mape_testdata(X_test,y_test,y_train,params):
    y_pred = knn_predict(params['k'],X_test,y_train,params['weighting'])
    return mape(y_pred,y_test)
                   
params = {'k': 10, 'weighting': True}    
print(mape_testdata(Xtesttfidf, y_test,y_train,params))

87.07408798434541


## Random Forest ##

In [96]:
from sklearn.decomposition import TruncatedSVD
from time import time
# Dimensionality reduction for tfidf
def reduce_dim_by_svd(X, ncomp):
    t0 = time()
    svd = TruncatedSVD(ncomp)
    X_res  = svd.fit_transform(X)
    print("done in %fs" % (time() - t0))
    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))
    print()
    return X_res 

In [109]:
from gensim import matutils
Xtrvec = matutils.corpus2csc(Xtraintfidf).T.toarray()
Xtestvec = matutils.corpus2csc(Xtesttfidf).T.toarray()

In [114]:
X_tr = reduce_dim_by_svd(Xtrvec, 250)

done in 15.486722s
Explained variance of the SVD step: 41%



In [115]:
X_tst = reduce_dim_by_svd(Xtestvec, 250)

done in 6.099878s
Explained variance of the SVD step: 49%



In [117]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5],
 'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200]}


In [120]:
from sklearn.ensemble import RandomForestRegressor
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 50 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_tr, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 81.7min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, None], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [132]:
pd.DataFrame(rf_random.cv_results_).sort_values(by='mean_test_score', ascending=False).iloc[21]['params']

{'n_estimators': 115,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 40,
 'bootstrap': True}

In [145]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 40,
 'bootstrap': False}

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Create a based model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [126]:
best_random = rf_random.best_estimator_
print("R2 for training data: %0.2f\n" % best_random.score(X_tr,y_train))
print("R2 for test data: %0.2f\n" % best_random.score(X_tst,y_test))

R2 for training data: 0.97

R2 for test data: -0.10



In [133]:
y_pred = best_random.predict(X_tst)

print(rmse(y_pred,y_test))

print(mape(y_pred,y_test))

425.625745039463
523.6804610576462


## XGBoost

In [None]:
import xgboost as xgb
def get_dmatrix(X, y):
    return xgb.DMatrix(data=X,label=y)

dtrain_matrix = get_dmatrix(X_tr,y_train)
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)


In [None]:
eval_set = [(X_train, y_train), (X_test, y_test)]
eval_metric = ["rmse"]
%time model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=True)

In [None]:
xgb_model = xgb.XGBRegressor(**params)

In [None]:
%%time
knn=10
y_pred = np.zeros(len(testvecs))
for i in range(len(testvecs[:1])):
    sims = sorted(enumerate(index[vec]), key=lambda item: -item[1])[:knn]
    y_pred[i] = np.average([y_train[k] for (k,v) in sims], weights=np.square(vec_normalize([v for (k,v) in sims])))

In [None]:
import gensim
trainvecs = tfidf[corpus]
trainvecs = gensim.matutils.corpus2csc(trainvecs)
trainvecs.T.toarray().shape

In [None]:
y_train

In [None]:
testvecs = [tfidf[dictionary.doc2bow(doc)] for doc in docs_test]
testvecs = gensim.matutils.corpus2csc(testvecs)
testvecs.T.toarray().shape

In [None]:
from sklearn import neighbors
n_neighbors = 10


knn = neighbors.KNeighborsRegressor(n_neighbors, weights='distance')
y_ = knn.fit(trainvecs, y_train).predict(testvecs)


In [None]:
sorted(enumerate(index[testvecs[0]]), key=lambda item: -item[1])[:5]

In [None]:
np.square(vec_normalize([v for (k,v) in sorted(enumerate(index[testvecs[0]]), key=lambda item: -item[1])[:5]]))

In [None]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

# Define a database name (we're using a dataset on births, so we'll call it birth_db)
# Set your postgres username/password, and connection specifics
username = 'eli'
password = 'elipgsql'     # change this
host     = 'localhost'
port     = '5432'            # default port that postgres listens on
db_name  = 'necklaces_train'

## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine( 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host, port, db_name) )
print(engine.url)

In [None]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

In [None]:
traindf = pd.DataFrame({'desc' : X_train, 'price' : y_train})

In [None]:
len(X_train)

In [None]:
len(traindf)

In [None]:
traindf.to_sql('necklaces_train',engine, if_exists='replace')

In [None]:
import psycopg2

In [None]:
db_name = 'necklaces_train'
username = 'eli'

In [None]:
sql_query = """
SELECT price FROM necklaces_train WHERE Index IN (%s) 
""" % items

In [None]:
sql_query

In [None]:
# Connect to make queries using psycopg2
con = None
con = psycopg2.connect(database = db_name, host='/var/run/postgresql', user = username)

# query:
items = ', '.join(str(k) for k in l)

sql_query = """
SELECT index, price FROM necklaces_train WHERE index IN (%s) 
""" % items
traindata_from_sql = pd.read_sql_query(sql_query,con)
len(traindata_from_sql)

In [None]:
traindata_from_sql

In [None]:
l

In [None]:
sims

In [None]:
weights = np.square(vec_normalize([v for (k,v) in sims]))

In [None]:
traindata_from_sql.head()

In [None]:
weightindexer for k in traindata_from_sql['index']

In [None]:
np.average([traindata_from_sql.loc[traindata_from_sql['index']==k, 'price'].values[0] for (k,v) in weightindexer],weights=[v for (k,v) in weightindexer])

In [None]:
np.average([traindata_from_sql[traindata_from_sql['index']==k, 'price'] for (k,v) in sims], weights=np.square(vec_normalize([v for (k,v) in sims])))

In [None]:
weightindexer = list(zip([k for (k,v) in sims],weights))

In [None]:
np.average(traindata_from_sql['price'].values,weights=np.square(vec_normalize([v for (k,v) in sims])))

In [None]:
%%time
knn=10
y_pred = np.zeros(len(testvecs))
for i in range(len(testvecs[:1])):
    sims = sorted(enumerate(index[vec]), key=lambda item: -item[1])[:knn]
    y_pred[i] = np.average([y_train[k] for (k,v) in sims], weights=np.square(vec_normalize([v for (k,v) in sims])))

In [None]:
y_pred

In [None]:
l = [k for (k,v) in sorted(enumerate(index[trainvecs[56]]), key=lambda item: -item[1])[1:knn+1]]

In [None]:
l

In [None]:

placeholder= '?' # For SQLite. See DBAPI paramstyle.
items = ', '.join(str(k) for k in l)
'SELECT name FROM students WHERE id IN (%s)' % items

In [None]:
text = 'gold necklace'
doc = text.split()
vec = tfidf[dictionary.doc2bow(doc)]

In [None]:
sims = sorted(enumerate(index[vec]), key=lambda item: -item[1])[:knn]

In [None]:
trainvecs = tfidf[corpus]

In [None]:
%%time
knn = 10
y_fit = np.zeros(len(y_train))
trainvecs = tfidf[corpus]
for i in range(len(trainvecs[:1])):
    sims = sorted(enumerate(index[]), key=lambda item: -item[1])[1:knn+1]
    print(list(sims))
    y_fit[i] = np.average([y_train[k] for (k,v) in sims], weights=np.square(vec_normalize([v for (k,v) in sims])))

In [None]:
[y_train[k] for (k,v) in sims]

In [None]:
np.square(vec_normalize([v for (k,v) in sims]))

In [None]:
traindata_from_sql['price'].values

In [None]:
y_fit

In [None]:
resid = y_fit[~np.isnan(y_fit)]-y_train[~np.isnan(y_fit)]

In [None]:
len(resid)

In [None]:
sstot = np.sum(np.square(y_train[~np.isnan(y_fit)]-np.mean(y_train[~np.isnan(y_fit)])))

In [None]:
ssresid = np.sum(np.square(resid))

In [None]:
r2 = 1-ssresid/sstot
r2

In [None]:
rmse = np.sqrt(np.mean(np.square(resid)))
rmse

In [None]:
(y_test[abs(resid)>1000], y_pred[abs(resid)>1000])

In [None]:
np.mean(test)

In [None]:
resid[abs(resid)>1000]

In [None]:
fig,ax = plt.subplots(figsize=(10,15))
sns.scatterplot(x=y_pred, y=resid, ax=ax)

In [None]:
np.sort(np.square(y_test-y_pred))

In [None]:
np.median(y_test)

In [None]:
np.mean(y_test)

In [None]:
np.sqrt(np.mean(np.sort(np.square(y_test-y_pred))[:-100]))

In [None]:
y_test[:10]

In [None]:
y_train[k] for (k,v) in sorted(enumerate(index[testvecs[0]]), key=lambda item: -item[1])[:5]:


In [None]:
np.mean([y_train[k] for (k,v) in sorted(enumerate(index[testvecs[0]]), key=lambda item: -item[1])[:10]])

In [None]:
y_test[0]

In [None]:
[sorted(enumerate(index[vec]), key=lambda item: -item[1])[:5] for vec in testvecs[:3]]

In [None]:
sims = index[testvecs[0]]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims[:10])

In [None]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in X_train:
    for token in text.split():
        frequency[token] += 1

texts = [[token for token in text.split() if frequency[token] > 1] for text in X_train]

from pprint import pprint  # pretty-printer
pprint(texts)

In [None]:
from nltk.probability import FreqDist

In [None]:
%%time
fdist = FreqDist(word for word in ' '.join(usdf['desc'].values).split() if not word.isnumeric())

In [None]:
fdist.pprint(maxlen=20)

In [None]:
import re
usdf['adesc'] = usdf['desc'].apply(lambda desc: ' '.join([x for x in desc.split() if not bool(re.search(r'\d', x))]))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
descs = usdf['adesc'].values

In [None]:
vectorizer = TfidfVectorizer()
tfidf =  vectorizer.fit_transform(descs)
feature_names = vectorizer.get_feature_names()
denselist = tfidf.todense().tolist()

In [None]:
vectorizer.inverse_transform(tfidf)

In [None]:
usdf[usdf.price>100]

In [None]:
tfidffeatures = pd.DataFrame(denselist, columns=feature_names)

In [None]:
s = tfidffeatures.iloc[0]
s[s>0].sort_values(ascending=False)[:10]

In [None]:
tfidf.shape

In [None]:
from sklearn.decomposition import TruncatedSVD

X_reduced = TruncatedSVD(n_components=50, random_state=0).fit_transform(tfidf)

In [None]:
from sklearn.manifold import TSNE

In [None]:
%%time
X_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_reduced)

In [None]:
fig, ax =plt.subplots(figsize=(10,5))
ax.set_xlim(0,10000)
usdf.price.hist(ax=ax, bins=50)

In [None]:
from matplotlib import cm
from matplotlib import colors
norm = cm.colors.Normalize(vmax=usdf.price.values.max(), vmin=usdf.price.values.min())
cmap = cm.jet

fig = plt.figure(figsize=(10, 10))
ax = plt.axes(frameon=False)
plt.setp(ax, xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
plt.scatter(X_embedded[:, 0], X_embedded[:, 1],
        c=usdf.price.values, marker="x", cmap=cmap, norm=norm)

In [None]:
from scipy.cluster import  hierarchy

In [None]:
#Clustering
X = tfidf.todense()
threshold = 0.1
Z = hierarchy.linkage(X,"average", metric="cosine")
#Clustering
maxclust = 20
C = hierarchy.fcluster(Z, maxclust, criterion="maxclust")
#C = hierarchy.fcluster(Z, threshold, criterion="distance")

In [None]:
len(C)

In [None]:
len(np.unique(C))

In [None]:
usdf['clusters'] = C
usdf.head()

In [None]:
from nltk import FreqDist

In [None]:
fdist_hiprice = FreqDist(word for word in ' '.join(usdf[usdf['price']>100]['adesc'].values).split() if not word.isnumeric())
fdist_hiprice.pprint(maxlen=20)

In [None]:
fig,ax = plt.subplots(figsize=(15,10)) 
plt.sca(ax)
fdist_hiprice.plot(20, cumulative=False)
fig.savefig('disthiprice.png')

In [None]:
fig,ax = plt.subplots(figsize=(15,10)) 
plt.sca(ax)
fdist_loprice.plot(20,cumulative=False)
fig.savefig('distloprice.png')

In [None]:
fdist_loprice = FreqDist(word for word in ' '.join(usdf[usdf['price']<25]['desc'].values).split() if not word.isnumeric())
fdist_loprice.pprint(maxlen=20)

In [None]:
tfidf.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
y = usdf['price'].values

In [None]:
y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
regr_rf = RandomForestRegressor(n_estimators=20, max_depth=None,min_samples_split=2, random_state=2, oob_score=True)
regr_rf.fit(X_train, y_train)

In [None]:
y_rf = regr_rf.predict(X_test)

In [None]:
X!=tfidf

In [None]:
regr_rf.score(X_test,y_test)

In [None]:
regr_rf.oob_score_

In [None]:
np.sqrt(np.mean(np.square(y_rf-y_test)))

In [None]:
closeby = np.zeros(len(usdf))
eps = 1.0
for index, row in usdf.iterrows():
    if(index<len(usdf)-2):
        i2 = index+1
        row2=usdf.iloc[i2]
        while(row['clusters']==row2['clusters']):
            #print(row[['price','title']], row2[['price','title']])
            if (np.abs(float(row['price'])-float(row2['price']))<eps):
                closeby[i2]=1
            if(i2<len(usdf)-1): 
                i2+=1
                row2=usdf.iloc[i2]
        

In [None]:
np.sum(closeby)

In [None]:
len(C)

In [None]:
len(np.unique(C))

In [None]:
len(C)-len(np.unique(C))

In [None]:
usdf.reset_index(drop=True)

In [None]:
usdf.groupby('clusters')['price'].std()

### Lets have 20 clusters

In [None]:
#Clustering
maxclust = 20
C20 = hierarchy.fcluster(Z, maxclust, criterion="maxclust")

In [None]:
np.unique(C20)

In [None]:
usdf['c20'] = C20
usdf.head()

In [None]:
usdf['overview'].apply(lambda x: print(x))

In [None]:
'vintage' in ' '.join(['\n    Vintage item\n', '\n    Favorited by: ', '\n', '\n        Gift wrapping and message available\n        ', '\n    ']).lower()

In [None]:
import re
re.findall(r'\d{4}',' '.join(['\n    Vintage item \n', '\n    Favorited by: ', '\n', '\n        Gift wrapping and message available\n        ', '\n    ']))

In [None]:
usdf['vintage'] = usdf['overview'].apply(lambda x: int('vintage' in ' '.join(x).lower()))

In [None]:
usdf['year'] = usdf['overview'].apply(lambda x: re.findall(r'\d{4}',' '.join(x).lower())[0] if(len(re.findall(r'\d{4}',' '.join(x).lower()))>0) else np.nan)

In [None]:
usdf['handmade'] = usdf['overview'].apply(lambda x: int('handmade' in ' '.join(x).lower() or 'hand-made' in ' '.join(x).lower()))

In [None]:
def extract_materials(ov):
    l = [re.sub(r"[^a-zA-Z\d\s]", '', x).lower().replace('materials','').replace('material','').strip() for x in ov if 'materials' in x.lower() or 'material' in x.lower()] 
    return l[0] if len(l)>0 else np.nan
    
usdf['materials'] = usdf['overview'].apply(extract_materials)

In [None]:
usdf.columns

In [None]:
usdf.head()