In [1]:
# imports
import re
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Fetch the NLTK resources this notebook relies on: the 'stopwords' corpus
# (read through `stopwords.words('english')` in `text_cleaner`) and 'punkt'.
# NOTE(review): 'punkt' is presumably what `word_tokenize` loads; newer NLTK
# releases may expect 'punkt_tab' instead — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/oliver/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/oliver/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# constants
# Target dtype of every products column, applied with `DataFrame.astype`.
# Text columns use the builtin `str`: `np.str` was only a deprecated alias of
# it and is no longer available in recent NumPy releases.
products_dtype_dict = {
    "productUrl": str,
    "productName": str,
    "productCategory": str,
    "productRating": np.float64,
    "productReviews": np.int32,
    "productPriceSale": np.float64,
    "productPriceOriginal": np.float64,
    "productShop": str
}

# Target dtype of every shops column, applied with `DataFrame.astype`.
# Text columns use the builtin `str`: `np.str` was only a deprecated alias of
# it and is no longer available in recent NumPy releases.
shops_dtype_dict = {
    "shopName": str,
    "noOfItems": np.float64,
    "shopOwner": str,
    "shopAddress": str,
    "noOfSales": np.int32,
    "shopImageLink": str,
    "yearCreated": np.int32
}

# Target dtype of every shop-owner column, applied with `DataFrame.astype`.
# Text columns use the builtin `str`: `np.str` was only a deprecated alias of
# it and is no longer available in recent NumPy releases.
shop_owners_dtype_dict = {
    "shopName": str,
    "shopOwnerName": str,
    "ownerFollowers": np.int32,
    "ownerFollowing": np.int32,
    "ownerLocation": str
}

# Numeric columns that are min-max scaled later in the notebook.
columns_normalize = [
    'productPriceOriginal', 'productPriceSale',
    'productRating', 'productReviews',
    'noOfItems',
    'ownerFollowers', 'ownerFollowing',
]

# Columns kept in the combined dataset: the three descriptive product columns
# followed by every numeric column listed above, in the same order.
features = ['productUrl', 'productCategory', 'productName', *columns_normalize]

In [3]:
# functions
def process_review_count(row_string):
    """Return ``row_string`` as text with every '(', ')' and ',' removed.

    The value is converted with ``str()`` first, so non-string input is
    accepted as well.
    """
    unwanted = str.maketrans('', '', '(),')
    return str(row_string).translate(unwanted)

# Filled on first use so the NLTK stop-word list is loaded once instead of
# once per cleaned string.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English NLTK stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def text_cleaner(text):
    """Normalise a piece of text into space-separated lower-case words.

    Steps, in order: lower-case the text; remove parenthesised fragments;
    remove double quotes; expand contractions found in the module-level
    ``contraction_mapping`` (which must be defined before the first call);
    remove possessive ``'s``; replace every non-letter with a space; drop
    English stop words and words shorter than three characters.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The surviving words joined by single spaces (possibly empty).
    """
    stop_words = _english_stop_words()

    cleaned = text.lower()
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)
    cleaned = re.sub('"', '', cleaned)
    # Contractions are looked up on the already lower-cased tokens.
    cleaned = ' '.join(contraction_mapping.get(t, t) for t in cleaned.split(" "))
    cleaned = re.sub(r"'s\b", "", cleaned)
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned)

    kept_words = [
        word for word in cleaned.split()
        if word not in stop_words and len(word) >= 3
    ]
    return " ".join(kept_words)

def co_occurrence(sentences, window_size):
    """Build a symmetric word co-occurrence matrix.

    Every text is tokenised with ``word_tokenize``. For each token, each of
    the next ``window_size`` tokens of the same text counts as one
    co-occurrence of that (unordered) pair.

    Parameters
    ----------
    sentences : iterable of str
        The texts to scan.
    window_size : int
        How many following tokens are paired with each token.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by the sorted vocabulary; cell
        ``(a, b)`` holds the pair count. The dtype is ``int16`` whenever all
        counts fit (as before) and ``int64`` otherwise, so large counts are
        not truncated.
    """
    pair_counts = defaultdict(int)
    vocab = set()
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        vocab.update(tokens)
        for position, token in enumerate(tokens):
            window = tokens[position + 1 : position + 1 + window_size]
            for neighbour in window:
                # Sort the pair so (a, b) and (b, a) share one counter.
                pair_counts[tuple(sorted([neighbour, token]))] += 1

    vocab = sorted(vocab)
    position_of = {token: idx for idx, token in enumerate(vocab)}

    # Pick the dtype before allocating so a frequent pair cannot overflow int16.
    largest = max(pair_counts.values(), default=0)
    dtype = np.int16 if largest <= np.iinfo(np.int16).max else np.int64

    # Fill a plain array through an index map instead of per-cell DataFrame writes.
    counts = np.zeros((len(vocab), len(vocab)), dtype=dtype)
    for (first, second), count in pair_counts.items():
        row, col = position_of[first], position_of[second]
        counts[row, col] = count
        counts[col, row] = count

    return pd.DataFrame(data=counts, index=vocab, columns=vocab)

In [4]:
products = pd.read_csv('../data/products_final.csv')
# Only a small number of rows have missing values, so dropping them won't
# affect the data; the later processing also needs the number of reviews and
# the rating of every product.
print(products.isna().sum())
products = products.dropna()
print(products.isna().sum())

# Remove thousands separators column-wise (vectorised instead of a row-wise
# apply); `astype(str)` keeps this working if pandas parsed a column as numeric.
for price_column in ('productPriceOriginal', 'productPriceSale'):
    products[price_column] = (
        products[price_column].astype(str).str.replace(',', '', regex=False)
    )
# Review counts additionally carry parentheses; the helper strips those as well.
products['productReviews'] = products['productReviews'].map(process_review_count)

products = products.astype(products_dtype_dict)

productCategory           0
productName               0
productPriceOriginal      0
productPriceSale          0
productRating           930
productReviews          930
productShop               0
productUrl                0
dtype: int64
productCategory         0
productName             0
productPriceOriginal    0
productPriceSale        0
productRating           0
productReviews          0
productShop             0
productUrl              0
dtype: int64


In [5]:
owners = pd.read_csv('../data/shop_owners_final.csv')
# NOTE(review): dropna() without `subset` also removes owners whose only
# missing field is ownerLocation, a column that is not part of `features` —
# confirm this is intended.
print(owners.isna().sum())
owners = owners.dropna()
print(owners.isna().sum())

# Remove thousands separators column-wise (vectorised instead of a row-wise
# apply); `astype(str)` keeps this working if pandas parsed a column as numeric.
for count_column in ('ownerFollowing', 'ownerFollowers'):
    owners[count_column] = (
        owners[count_column].astype(str).str.replace(',', '', regex=False)
    )

owners = owners.astype(shop_owners_dtype_dict)

ownerFollowers      0
ownerFollowing      0
ownerLocation     170
shopName          178
shopOwnerName       0
dtype: int64
ownerFollowers    0
ownerFollowing    0
ownerLocation     0
shopName          0
shopOwnerName     0
dtype: int64


In [6]:
shops = pd.read_csv('../data/shops_final.csv')
# Missing values in every column are replaced by the string "-1"; the final
# `astype` call converts it to -1 in the numeric columns.
shops = shops.fillna(value="-1")

# Remove thousands separators column-wise (vectorised instead of a row-wise apply).
shops['noOfSales'] = shops['noOfSales'].astype(str).str.replace(',', '', regex=False)

shops = shops.astype(shops_dtype_dict)

In [7]:
# Keep only the shops that have a matching owner record (inner join on the shop name).
shops_with_owners = shops.merge(owners, how='inner', on='shopName')
shops_with_owners.head()

Unnamed: 0,noOfItems,noOfSales,shopAddress,shopImageLink,shopName,shopOwner,yearCreated,ownerFollowers,ownerFollowing,ownerLocation,shopOwnerName
0,1974.0,3753,-1,https://i.etsystatic.com/isla/8d68ec/33669914/...,LeftoverStuff,Kathy,2008,334,893,"Gosport IN, United States",Kathy
1,1649.0,4946,"Perry Park, Kentucky",https://i.etsystatic.com/isla/6e9a83/23009003/...,HazelCatkins,Hazel Catkins,2006,70,61,"Perry Park, KY, United States",Hazel Catkins
2,185.0,4295,-1,https://i.etsystatic.com/isla/7ae0af/35870117/...,LilylongtoothToys,DogboneArt,2006,55,0,"holland township, nj, United States",DogboneArt
3,157.0,-1,"Montreal, Canada",https://i.etsystatic.com/isla/eadddc/38741905/...,neawear,Janick,2006,568,129,"Montreal, QC, Canada",Janick
4,35.0,-1,"California, United States",https://i.etsystatic.com/isla/6ce016/18552484/...,amiamour,amiamour,2006,590,13,"California, United States",amiamour


In [8]:
# Attach shop and owner information to every product (inner join on the shop
# name), then keep only the feature columns.
data = products.merge(
    shops_with_owners,
    how='inner',
    left_on='productShop',
    right_on='shopName',
)[features]
data.isna().sum()

productUrl              0
productCategory         0
productName             0
productPriceOriginal    0
productPriceSale        0
productRating           0
productReviews          0
noOfItems               0
ownerFollowers          0
ownerFollowing          0
dtype: int64

In [9]:
# Min-max scale the numeric feature columns in place of their raw values.
scaler = MinMaxScaler()
data[columns_normalize] = scaler.fit_transform(X=data[columns_normalize])

# Replace the category labels by integer codes.
le = LabelEncoder()
data['productCategory'] = le.fit_transform(data['productCategory'])

In [10]:
# 1 when the sale price is positive, else 0 (vectorised instead of a row-wise apply).
# NOTE(review): productPriceSale has already been min-max scaled above, so
# "> 0" means "above the column minimum"; that equals "has a sale price" only
# if the smallest raw sale price is 0 — confirm.
data['isOnSale'] = (data['productPriceSale'] > 0).astype('int64')

In [18]:
# Write the combined feature table to disk (the DataFrame index is written too)
# and preview the first rows.
# NOTE(review): this cell carries execution count 18 and its recorded output
# contains `productNameProcessed`, a column that is only created further down
# in the notebook. Running the cells top to bottom from a fresh kernel would
# export the table without that column — consider moving this cell below the
# name-processing cell.
data.to_csv('../data/combined_data_for_jana.csv')
data.head()

Unnamed: 0,productUrl,productCategory,productName,productPriceOriginal,productPriceSale,productRating,productReviews,noOfItems,ownerFollowers,ownerFollowing,isOnSale,productNameProcessed
0,https://www.etsy.com/listing/475757830/stackab...,4,"Stackable Name Ring, dainty name ring, persona...",0.014372,0.0,0.97075,0.07075,0.004885,0.009943,0.146296,0,stackable name ring dainty name ring personali...
1,https://www.etsy.com/listing/83420567/stacking...,4,"Stacking Ring, custom made silver ring persona...",0.014372,0.0,0.97075,0.07075,0.004885,0.009943,0.146296,0,stacking ring custom made silver ring personal...
2,https://www.etsy.com/listing/168092030/word-ch...,4,"Word Charms, hand stamped jewelry, personalize...",0.008473,0.0,0.97075,0.07075,0.004885,0.009943,0.146296,0,word charms hand stamped jewelry personalized ...
3,https://www.etsy.com/listing/249246208/minimal...,4,Minimalist Black And Gold Earrings Gold Dangle...,0.018662,0.0,0.971985,0.014374,0.005061,0.004306,0.253704,0,minimalist black gold earrings gold dangle ear...
4,https://www.etsy.com/listing/490120699/silver-...,4,Silver Stud Earrings Sapphire Earrings Septemb...,0.017053,0.0,0.971985,0.014374,0.005061,0.004306,0.253704,0,silver stud earrings sapphire earrings septemb...


In [12]:
# NAME NLP
# Lookup table from English contractions to their expanded form. It is read as
# a global by `text_cleaner`, so this cell has to run before `text_cleaner`
# is called for the first time.
# NOTE: `text_cleaner` lower-cases its input before the lookup, so the keys
# that contain upper-case letters (e.g. "I'd") can never match there.
contraction_mapping = {
    "ain't": "is not","aren't": "are not","can't": "cannot","'cause": "because","could've": "could have","couldn't": "could not",
    "didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hasn't": "has not","haven't": "have not",
    "he'd": "he would","he'll": "he will","he's": "he is","how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
    "I'd": "I would","I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","i'd": "i would",
    "i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have","isn't": "is not","it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
    "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
    "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
    "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
    "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
    "you're": "you are", "you've": "you have"
}

In [13]:
# Clean every product name; mapping over the single column avoids building a
# row object per record as a row-wise DataFrame.apply does.
data['productNameProcessed'] = data['productName'].map(text_cleaner)

In [14]:
# Word co-occurrence counts over the cleaned product names, pairing each word
# with the three words that follow it.
matrix = co_occurrence(sentences=data['productNameProcessed'], window_size=3)

In [15]:
# Reduce every row of the co-occurrence matrix to 50 components.
svd = TruncatedSVD(random_state=42, n_iter=10, n_components=50)

# Vocabulary in the row order of the co-occurrence matrix, and its size.
vocab = list(matrix.index)
n = len(vocab)

reduced_matrix = svd.fit_transform(matrix)

In [16]:
# Sanity check of the reduced matrix dimensions (rows, components).
reduced_matrix.shape

(5203, 50)

In [17]:
# Min-max scale every SVD component column.
m_scale = MinMaxScaler()
reduced_matrix = m_scale.fit_transform(reduced_matrix)

# One row per vocabulary word, one integer-labelled column per component. The
# column count is taken from the array itself so it stays correct if the
# number of SVD components is changed.
df = pd.DataFrame(
    data=reduced_matrix,
    index=vocab,
    columns=list(range(reduced_matrix.shape[1])),
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
aaa,1.576406e-05,0.425884,0.084931,0.226367,0.489225,0.289486,0.197898,0.287899,0.414669,0.263883,...,0.467368,0.486797,0.504283,0.414087,0.448050,0.499439,0.492157,0.396598,0.393797,0.417032
abalone,2.272307e-05,0.425857,0.082636,0.226689,0.489217,0.291059,0.192947,0.280184,0.414855,0.264728,...,0.483268,0.482093,0.487881,0.412501,0.450634,0.498266,0.496306,0.399308,0.386591,0.413224
abc,1.647746e-05,0.425892,0.083037,0.232623,0.489207,0.288918,0.193758,0.281003,0.414760,0.264046,...,0.455705,0.473090,0.502225,0.410894,0.452893,0.497804,0.490860,0.401719,0.391319,0.398406
abduction,7.116110e-09,0.425891,0.078959,0.227280,0.489207,0.290963,0.193508,0.280579,0.414733,0.264892,...,0.480516,0.483080,0.495084,0.411053,0.452278,0.498672,0.493684,0.400651,0.391661,0.405872
abraham,7.792486e-08,0.425891,0.078973,0.227287,0.489207,0.290986,0.193511,0.280584,0.414731,0.264851,...,0.480549,0.483143,0.494936,0.411016,0.451845,0.498725,0.493691,0.401207,0.391882,0.405415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zircon,6.083798e-06,0.425890,0.085788,0.225498,0.489208,0.289151,0.192127,0.278474,0.414703,0.267166,...,0.485065,0.481440,0.492300,0.410016,0.447338,0.498772,0.492248,0.393258,0.392325,0.407903
zjz,9.487501e-06,0.425879,0.079477,0.227391,0.489213,0.291898,0.193233,0.280557,0.414769,0.263968,...,0.481417,0.484291,0.494800,0.410873,0.453434,0.499080,0.493580,0.402179,0.393012,0.408208
zombicorn,2.071358e-06,0.425892,0.080601,0.231284,0.489208,0.289199,0.193349,0.280279,0.414732,0.265242,...,0.481729,0.483471,0.497236,0.409289,0.452537,0.495373,0.484139,0.401246,0.392881,0.404905
zombie,1.775365e-05,0.425873,0.082361,0.229167,0.489219,0.289614,0.194342,0.281698,0.414855,0.264274,...,0.478787,0.479639,0.493147,0.410024,0.449010,0.497602,0.494166,0.407566,0.392002,0.400758
