In [8]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
pd.options.mode.chained_assignment = None  # default='warn'

In [9]:
import numpy as np

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/gracewu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gracewu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def tokenize(column):
    """Tokenizes a Pandas dataframe column and returns a list of tokens.

    Args:
        column: Pandas dataframe column (i.e. df['text']).

    Returns:
        tokens (list): Tokenized list, i.e. [Donald, Trump, tweets]

    """

    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]  

def remove_stopwords(tokenized_column):
    """Return a list of tokens with English stopwords removed. 

    Args:
        column: Pandas dataframe column of tokenized data from tokenize()

    Returns:
        tokens (list): Tokenized list with stopwords removed.

    """
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if not word in stops]

def apply_stemming(tokenized_column):
    """Return a list of tokens with Porter stemming applied.

    Args:
        column: Pandas dataframe column of tokenized data with stopwords removed.

    Returns:
        tokens (list): Tokenized list with words Porter stemmed.

    """

    stemmer = PorterStemmer() 
    return [stemmer.stem(word) for word in tokenized_column]

def rejoin_words(tokenized_column):
    """Rejoins a tokenized word list into a single string. 
    
    Args:
        tokenized_column (list): Tokenized column of words. 
        
    Returns:
        string: Single string of untokenized words. 
    """
    
    return ( " ".join(tokenized_column))

In [37]:
df1 = pd.read_csv("./amazon1.csv")
cols1 = ['Uniq Id', 'Product Name', 'Category', 'About Product', 'Image']
prod_df1 = df1[cols1]

In [None]:
df1.dropna()
for j in range(1, len(cols1)-1):
    df1['tokenized'] = df1.apply(lambda x: tokenize(str(x[cols1[j]])), axis=1)
    df1['stopwords_removed'] = df1.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
    df1['porter_stemmed'] = df1.apply(lambda x: apply_stemming(x['stopwords_removed']), axis=1)
    df1['rejoined'] = df1.apply(lambda x: rejoin_words(x['porter_stemmed']), axis=1)
    df1[cols1[j]] = df1['rejoined']
    df1 = df1.drop(['tokenized','stopwords_removed', 'porter_stemmed', 'rejoined'], axis=1)


In [41]:
df1["About Product"] = df1.apply(lambda x: rejoin_words(tokenize(str(x['About Product']))[6:]), axis=1) 

## Getting images for just the first dataset

In [42]:
df1['Image'] = df1.apply(lambda x: str(x['Image']).split('|')[:-1], axis=1) 


In [43]:
df1 = df1.explode("Image", ignore_index=True)

In [44]:
df1['Image'] = df1['Image'].astype(str)

In [10]:
from PIL import Image
import requests
from io import BytesIO

In [21]:
df1.index.name = 'ID'

In [23]:
exception = []

In [24]:
for i in range(0, len(df1)):
    url = df1["Image"][i]
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img = img.resize((512,512))
        img.save("./amazon_img/" + str(i) + ".jpg")
    except:
        exception += [i]

In [36]:
len(df1)

33993

In [46]:
'''
from pathlib import Path

exception = []
for i in range(0, len(prod_df[0])):
    my_file = Path("./amazon_img/" + str(i) + '.jpg')
    my_file2 = Path("./toy_img/" + str(i) + '.jpg')
    if not my_file.is_file() and not my_file2.is_file():
        exception += [i]
'''

In [47]:
len(exception)

249

In [48]:
df1 = df1.drop(exception)

In [49]:
len(df1)

33744

In [50]:
df1['text'] = df1[["Product Name", "Category", "About Product" ]].apply(" ".join, axis=1)

Concatenate text column, make categories column a list so we can filter it

In [21]:
df1[cols1[2]] = df1.apply(lambda x: tokenize(str(x[cols1[2]])), axis=1)
df1['temp'] = df1.apply(lambda x: rejoin_words(x[cols1[2]]), axis=1)
df1['text'] = df1[1:4].apply(" ".join, axis=1)
df1 = df1.drop(['temp'], axis=1)

In [54]:
# Example code for filtering dataset based on category
filter_cat = "toy"
mask = df1["Category"].apply(lambda x: filter_cat in x)
masked = df1[mask]

In [55]:
len(masked)

23024