Tues 9/18/18 Working with Etsy Database

Tables:

listings
tags
materials
styles
categories
category_ids

In [1]:
%matplotlib inline
import getpass
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import r2_score
from tqdm import tqdm
pd.set_option('display.max_columns', 50)

Database definitions and psycopg2 connection

In [2]:
# Connection settings are read from the environment so that credentials are
# never stored in the notebook. Every non-secret setting falls back to the
# local-development value that was previously hardcoded here.
username = os.environ.get('ETSY_DB_USER', 'postgres')
host     = os.environ.get('ETSY_DB_HOST', 'localhost')
port     = os.environ.get('ETSY_DB_PORT', '5432')
db_name  = os.environ.get('ETSY_DB_NAME', 'etsy')

# The password deliberately has no default: take it from ETSY_DB_PASSWORD, or
# prompt for it interactively so it never appears in the saved notebook.
password = os.environ.get('ETSY_DB_PASSWORD')
if password is None:
    password = getpass.getpass('Password for %s@%s: ' % (username, host))

# Module-level connection shared by every query cell below.
con = psycopg2.connect(user=username, password=password, host=host, port=port, database=db_name)

Interesting Query Info

Items, Tags Per Category:

Quilts 74761, 89845

Clothing 424994, 486624

Jewelry 110937, 189084

Shoes 54537, 71316

Hat/Hats 36570, 58133 (reduced further to 10539 features with CountVectorizer)

Get all listing ids for categories 'Hat' or 'Hats'

In [11]:
# Count the distinct listings whose category is 'Hat' or 'Hats'.
sql_query = """
SELECT COUNT(DISTINCT listings.listing_id)
FROM listings
FULL JOIN categories ON listings.listing_id=categories.listing_id
WHERE categories.category = 'Hat' or categories.category = 'Hats'
"""

# Run the query on the shared connection and print the one-row result frame.
print(pd.read_sql_query(sql=sql_query, con=con))

   count
0  36570


In [None]:
# Collect the id of every listing whose category is 'Hat' or 'Hats'.
#  - INNER JOIN instead of FULL JOIN: a matching category row that has no
#    listing would otherwise contribute a NULL listing_id to the list, and the
#    per-listing queries that consume id_list cannot handle that.
#  - ORDER BY: without it the row order is unspecified and may change between
#    runs; the train/test split later in the notebook is positional, so a
#    stable order is needed for reproducible results.
sql_query = """
SELECT DISTINCT listings.listing_id
FROM listings
INNER JOIN categories ON listings.listing_id = categories.listing_id
WHERE categories.category IN ('Hat', 'Hats')
ORDER BY listings.listing_id
"""

# Plain Python list of ids, in ascending order.
id_list = pd.read_sql_query(sql_query, con)['listing_id'].tolist()

Query the database for all listings, tags, categories, materials, and styles for the listing ids for 'Hat' or 'Hats'

In [None]:
column_names = ['listing_id','price','views','num_favorers','original_creation_tsz','ending_tsz']

# (table, column) pairs that hold the free-text attributes of a listing.
# These names are fixed literals, so formatting them into the SQL text is safe;
# the listing id itself is always sent as a bound parameter.
TEXT_SOURCES = (
    ('tags', 'tag'),
    ('categories', 'category'),
    ('materials', 'material'),
    ('styles', 'style'),
)


def _fetch_listing_row(listing_id):
    """Return the `column_names` columns of the listings row(s) for `listing_id`."""
    frame = pd.read_sql_query('SELECT * FROM listings WHERE listings.listing_id = %s',
                              con, params=(listing_id,))
    return frame[column_names].copy(deep=True)


def _fetch_joined_text(table, column, listing_id):
    """Return all `column` values of `table` for `listing_id` as one string.

    The values are joined with single spaces and lower-cased. The result is an
    empty string when the listing has no rows in `table`.
    """
    frame = pd.read_sql_query(
        'SELECT * FROM {0} WHERE {0}.listing_id = %s'.format(table),
        con, params=(listing_id,))
    # dropna: a NULL value would otherwise make str.join raise TypeError.
    return " ".join(frame[column].dropna()).lower()


listings = []
tags = []
categories = []
materials = []
styles = []

# Route each table's text to the list that the later cells read.
text_lists = {'tags': tags, 'categories': categories,
              'materials': materials, 'styles': styles}

# NOTE(review): this issues five queries per listing; fetching each table once
# with `WHERE listing_id = ANY(%s)` and grouping in pandas would be much faster.
with tqdm(total=len(id_list), file=sys.stdout) as pbar:
    for i, listing_id in enumerate(id_list):
        listings.append(_fetch_listing_row(listing_id))
        for table, column in TEXT_SOURCES:
            text_lists[table].append(_fetch_joined_text(table, column, listing_id))
        # Advance the bar only once the listing has actually been processed.
        pbar.set_description('processed: %d' % (1 + i))
        pbar.update(1)

Concatenate all listings dataframes into a single dataframe

In [None]:
# Stack the per-listing frames into a single table with a fresh 0..n-1 index.
result = pd.concat(objs=listings, ignore_index=True)

In [None]:
# Print the shape of the listings frame, then the length of each per-listing
# text list, so the sizes can be compared by eye.
print(result.shape)
for text_list in (tags, categories, materials, styles):
    print(len(text_list))

In [None]:
# Show the position and text of every listing that has a non-empty style string.
for idx, style_text in enumerate(styles):
    if not style_text:
        continue
    print(idx, style_text)

Combine all named features into a single list of strings (one space-separated string per listing)

In [None]:
# One space-separated string per listing: its tags, then its materials, then
# its styles.
all_features = [
    " ".join((tags[i], materials[i], styles[i]))
    for i in range(len(tags))
]

In [None]:
# Spot-check the combined feature string of one listing (index 21 appears to be
# an arbitrary sample).
all_features[21]

Create a CountVectorizer for all named features

In [None]:
# Binary bag-of-words: each column records whether a token occurs in a
# listing's combined text, not how many times it occurs.
vectorizer = CountVectorizer(binary=True)
all_features_vector = vectorizer.fit_transform(all_features)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement,
# get_feature_names_out(), and fall back only on releases that predate it.
# tolist() keeps the result a plain list of strings, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    all_features_names = vectorizer.get_feature_names_out().tolist()
else:
    all_features_names = vectorizer.get_feature_names()

Save listings dataframe and features to pickle files for faster reload

In [None]:
# Persist the combined listings frame to disk.
# NOTE(review): the next cell repeats this exact call before saving the
# features, so this cell looks redundant.
result.to_pickle("./listings.pkl")

In [None]:
# Cache both artefacts on disk: the listings frame through pandas, and the raw
# feature strings through pickle at the highest protocol this interpreter supports.
result.to_pickle("./listings.pkl")

with open('all_features.pickle', 'wb') as features_file:
    pickle.dump(all_features, features_file, protocol=pickle.HIGHEST_PROTOCOL)

Open listings dataframe and features from pickle files for faster reload

In [None]:
# Reload the cached artefacts written by the save cell.
# SECURITY: unpickling can execute arbitrary code stored in the file, so only
# load pickle files that this notebook produced itself.

# The frame was written with DataFrame.to_pickle, so read it back with the
# matching pandas reader rather than a raw pickle.load.
# Note: `listings` is rebound here to the combined DataFrame (the data saved
# from `result`), replacing the list of frames built by the query loop.
listings = pd.read_pickle('listings.pkl')

with open('all_features.pickle', 'rb') as handle:
    all_features = pickle.load(handle)

In [None]:
# Rebuild the binary bag-of-words matrix from the reloaded feature strings:
# each column records whether a token occurs in a listing's text.
vectorizer = CountVectorizer(binary=True)
all_features_vector = vectorizer.fit_transform(all_features)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement,
# get_feature_names_out(), and fall back only on releases that predate it.
# tolist() keeps the result a plain list of strings, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    all_features_names = vectorizer.get_feature_names_out().tolist()
else:
    all_features_names = vectorizer.get_feature_names()

In [None]:
# Convert the sparse document-term matrix into a dense NumPy array; the later
# cells slice this array by row and call len() on the slices.
# NOTE(review): the dense copy can be very large for a big vocabulary — confirm
# that it fits in memory.
all_features_matrix = all_features_vector.toarray()

In [None]:
# Vectorise four single-word documents with the fitted vocabulary and report
# how much of the resulting matrix is filled in.
test = vectorizer.transform(['hat', 'crochet', 'baby', 'yarn'])
print('Shape of Sparse Matrix: ', test.shape)
print('Amount of Non-Zero occurrences: ', test.nnz)

# nnz / (rows * cols) is the share of non-zero cells, i.e. the density; the
# sparsity is its complement. The original label reported the density under
# the name "sparsity".
density = 100.0 * test.nnz / (test.shape[0] * test.shape[1])
print('density: %.2f%%' % density)
print('sparsity: %.2f%%' % (100.0 - density))

In [None]:
# Regression target: the price column as a NumPy array. nan_to_num replaces
# NaN with 0.0, so listings with a missing price are treated as priced at 0.
prices = np.nan_to_num(np.asarray(listings['price'].tolist()))

In [None]:
# Sanity check: evaluates to False when no NaN is left in the price array.
np.isnan(prices).any()

In [None]:
# Positional 80/20 split with no shuffling: the first 80% of the rows train the
# model and the remaining rows are held out for testing.
n_samples = all_features_matrix.shape[0]
train_fraction = int(0.8 * n_samples)  # despite the name, this is a row count

X_train = all_features_matrix[:train_fraction]
X_test = all_features_matrix[train_fraction:]
y_train = prices[:train_fraction]
y_test = prices[train_fraction:]

In [None]:
# Report the total number of rows, then the size of the train and test splits.
for size in (n_samples, len(X_train), len(X_test)):
    print(size)

In [None]:
# Lasso (L1-regularised linear regression) of price on the binary token
# features; the printed score is R^2 on the held-out rows.
# `linear_model` is imported in the notebook's top import cell rather than here.
clf = linear_model.Lasso(alpha=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(r2_score(y_test, y_pred))

In [None]:
# Same experiment with an elastic-net penalty (combined L1 and L2
# regularisation); the printed score is R^2 on the held-out rows.
clf = linear_model.ElasticNet(alpha=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(r2_score(y_test, y_pred))