# __Walmart - Data Cleaning and Tagging__

# Setup

In [2]:
#import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import KNNImputer

import os

from scipy.sparse import coo_matrix

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

## Import the dataset

In [3]:
data = pd.read_csv('walmart_clothing_data.csv')
data.head()

Unnamed: 0,Uniq Id,User Id,Crawl Timestamp,Product Url,Product Name,Description,List Price,Sale Price,Brand,Item Number,Gtin,Package Size,Category,Postal Code,Available,Rating,Rating Count
0,bc645d7939917c8c8a51cab46ac93ce3,10001,2019-12-19 06:08:33 +0000,https://www.walmart.com/ip/Champro-Adult-First...,Champro Adult First Down Two-Tone Football Jer...,We aim to show you accurate product informati...,25.86,25.86,Champro,,760853595,,Sports & Outdoors | Sports Fan Shop | Clothing...,,True,2.9,2.0
1,65e53225e9ed8cacfd17c81a41c84335,10141,2019-12-19 07:27:21 +0000,https://www.walmart.com/ip/Adoretex-Men-s-Guar...,Adoretex Mens Guard Swim Board Short Swim Trun...,We aim to show you accurate product informati...,26.39,26.39,Adoretex,,293242353,,Clothing | Mens Clothing | Mens Swimwear | Men...,,True,3.9,63.0
2,fce147ac19d2f32a0179d9f883644b91,10074,2019-12-19 08:47:05 +0000,https://www.walmart.com/ip/Alex-And-Ani-Caroli...,Alex And Ani Carolina Panthers Football Charm ...,We aim to show you accurate product informati...,12.6,12.6,Alex and Ani,576235964.0,317067830,,Sports & Outdoors | Sports | Football Gear & E...,,True,1.0,1.0
3,572ab19ce683fbdfb2ee2b9512a48951,10182,2019-12-18 23:12:03 +0000,https://www.walmart.com/ip/Tokelau-ScudoPro-Sh...,Tokelau ScudoPro Short Sleeve Cycling Jersey f...,We aim to show you accurate product informati...,42.99,42.99,ScudoPro,,135871674,,Clothing | Mens Clothing | Mens Graphic Tees,,True,5.0,1.0
4,06b646bb5475ef339047309cf5fac206,10160,2019-12-19 06:42:15 +0000,https://www.walmart.com/ip/Men-s-White-Knit-Pu...,Mens White Knit Pullover Golf Polo Shirt - Large,We aim to show you accurate product informati...,6.49,6.49,Christmas Central,,278242398,,Sports & Outdoors | Sports | Golf Equipment | ...,,True,0.0,0.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 949 entries, 0 to 948
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Uniq Id          949 non-null    object 
 1   User Id          949 non-null    int64  
 2   Crawl Timestamp  949 non-null    object 
 3   Product Url      949 non-null    object 
 4   Product Name     949 non-null    object 
 5   Description      949 non-null    object 
 6   List Price       949 non-null    float64
 7   Sale Price       949 non-null    float64
 8   Brand            943 non-null    object 
 9   Item Number      191 non-null    float64
 10  Gtin             949 non-null    int64  
 11  Package Size     0 non-null      float64
 12  Category         949 non-null    object 
 13  Postal Code      0 non-null      float64
 14  Available        949 non-null    bool   
 15  Rating           949 non-null    float64
 16  Rating Count     949 non-null    float64
dtypes: bool(1), floa

# Preprocessing

## Handle missing values

In [5]:
# Check for missing values
data.isna().sum()

Uniq Id              0
User Id              0
Crawl Timestamp      0
Product Url          0
Product Name         0
Description          0
List Price           0
Sale Price           0
Brand                6
Item Number        758
Gtin                 0
Package Size       949
Category             0
Postal Code        949
Available            0
Rating               0
Rating Count         0
dtype: int64

In [6]:
# Missing values as percentages
round(data.isna().sum() / data.shape[0] * 100, 2)

Uniq Id              0.00
User Id              0.00
Crawl Timestamp      0.00
Product Url          0.00
Product Name         0.00
Description          0.00
List Price           0.00
Sale Price           0.00
Brand                0.63
Item Number         79.87
Gtin                 0.00
Package Size       100.00
Category             0.00
Postal Code        100.00
Available            0.00
Rating               0.00
Rating Count         0.00
dtype: float64

In [7]:
# Remove columns with more than 75% missing values
data_drp_cols = data.copy()

data_drp_cols = data_drp_cols.drop(['Item Number', 'Package Size', 'Postal Code', 'Crawl Timestamp', 'Gtin'], axis=1)

In [8]:
data_drp_cols.isna().sum()

Uniq Id         0
User Id         0
Product Url     0
Product Name    0
Description     0
List Price      0
Sale Price      0
Brand           6
Category        0
Available       0
Rating          0
Rating Count    0
dtype: int64

## Data Imputation

In [9]:
# Impute the missing values in Brand column
# Product names of samples with missing Brand value
data_drp_cols[data_drp_cols['Brand'].isna()]['Product Name']

267    Flying Fisherman Short Sleeve Traveler Tee, Gr...
282              Adams 512 1-1/5â€ Football Belt Purple
380    Flying Fisherman Short Sleeve Traditions Tee, ...
538                   Adams 510 1â€ Football Belt Kelly
842    Caddis Systems Promo Youth Breathable Stocking...
843    CARLTON GLOBAL Fishing Wader Boot Hanger Strap...
Name: Product Name, dtype: object

In [10]:
# Function to get the Brand from Product Name
def get_brand_from_name(product_name):
    target_words = product_name.split()[:2]
    imputed_name = []
    for word in target_words:
        if word.isalpha():
            imputed_name.append(word)
    return ' '.join(imputed_name).strip()

data_imputed = data_drp_cols.copy()
names = data_imputed[data_imputed['Brand'].isna()]['Product Name'].apply(get_brand_from_name)
data_imputed.loc[data_imputed[data_imputed['Brand'].isna()].index, 'Brand'] = data_imputed[data_imputed['Brand'].isna()]['Product Name'].apply(get_brand_from_name)

In [11]:
# Before imputing
data_drp_cols[data_drp_cols['Brand'].isna()].head(3)

Unnamed: 0,Uniq Id,User Id,Product Url,Product Name,Description,List Price,Sale Price,Brand,Category,Available,Rating,Rating Count
267,98c291fb7538989c491e6740adcbf06c,10108,https://www.walmart.com/ip/Flying-Fisherman-Tr...,"Flying Fisherman Short Sleeve Traveler Tee, Gr...",We aim to show you accurate product informati...,11.76,11.76,,Sports & Outdoors | Outdoor Sports | Fishing |...,True,3.8,7.0
282,f383f2b8b92749656062ab43b0a06a17,10035,https://www.walmart.com/ip/Adams-512-1-1-5-Foo...,Adams 512 1-1/5â€ Football Belt Purple,We aim to show you accurate product informati...,19.76,19.76,,Sports & Outdoors | Sports | Football Gear & E...,True,3.9,11.0
380,38daddb3ba47079deb2870fb5c885d1a,10004,https://www.walmart.com/ip/Flying-Fisherman-Tr...,"Flying Fisherman Short Sleeve Traditions Tee, ...",We aim to show you accurate product informati...,14.74,14.74,,Sports & Outdoors | Outdoor Sports | Fishing |...,True,1.0,1.0


In [12]:
# After imputing
data_imputed.loc[data_drp_cols[data_drp_cols['Brand'].isna()].index].head(3)

Unnamed: 0,Uniq Id,User Id,Product Url,Product Name,Description,List Price,Sale Price,Brand,Category,Available,Rating,Rating Count
267,98c291fb7538989c491e6740adcbf06c,10108,https://www.walmart.com/ip/Flying-Fisherman-Tr...,"Flying Fisherman Short Sleeve Traveler Tee, Gr...",We aim to show you accurate product informati...,11.76,11.76,Flying Fisherman,Sports & Outdoors | Outdoor Sports | Fishing |...,True,3.8,7.0
282,f383f2b8b92749656062ab43b0a06a17,10035,https://www.walmart.com/ip/Adams-512-1-1-5-Foo...,Adams 512 1-1/5â€ Football Belt Purple,We aim to show you accurate product informati...,19.76,19.76,Adams,Sports & Outdoors | Sports | Football Gear & E...,True,3.9,11.0
380,38daddb3ba47079deb2870fb5c885d1a,10004,https://www.walmart.com/ip/Flying-Fisherman-Tr...,"Flying Fisherman Short Sleeve Traditions Tee, ...",We aim to show you accurate product informati...,14.74,14.74,Flying Fisherman,Sports & Outdoors | Outdoor Sports | Fishing |...,True,1.0,1.0


## Clean the description

In [30]:
# Function to clean the description
def clean_description(des, des_nlp):
    """ 
    Remove unnessary parts from the description, 
    concatenate tokens seperated by '-' to create new compact words,
    Seperate words that are worngly concatenated.

    Args
        des: description of the sample

    Return
        str: cleaned description
    """
    # Remove the unnessasary part
    des = des.split('|')[1:]
    des = ' '.join(des).strip()
    
    # Seperate wrongly concatenated words
    sep_str = ''
    for token in des_nlp(des):
        text = token.text
        if text.isalnum() and len(text) > 1 and not text[1:].islower() and not text[1:].isupper():
            for idx, char in enumerate(text):
                if char.isupper():
                    modded_tokens = ' '.join([text[:idx], text[idx:]])
                    sep_str = ' '.join([sep_str, modded_tokens])
        else:
            sep_str = ' '.join([sep_str, text])

    # Concatenate tokens seperated by '-' to create compat words
    word_list = []
    words_to_remove = []
    for idx, token in enumerate(des_nlp(sep_str)):
        text = token.text
        if text == '-':
            mx = max(enumerate(des_nlp(sep_str)))[0]
            if (idx + 1 <= mx) and (idx - 1 >= 0):
                concat_tokens = ''.join([des_nlp(sep_str)[idx - 1].text, des_nlp(sep_str)[idx + 1].text])
                word_list.append(concat_tokens)
                #concat_str = ' '.join([concat_str, concat_tokens])
                words_to_remove.append(des_nlp(sep_str)[idx - 1].text)
                words_to_remove.append(des_nlp(sep_str)[idx + 1].text)
        else:
            word_list.append(text)
            #concat_str = ' '.join([concat_str, text])
    # Remove words previously seperated by '-'
    cleaned_word_list = [word for word in word_list if word not in words_to_remove]
    concat_str = ' '.join(cleaned_word_list)
    concat_st = concat_str.strip()
    
    return concat_str

In [31]:
data_cleaned_des = data_imputed.copy()

des_nlp = spacy.load("en_core_web_sm")
data_cleaned_des['Filtered Description'] = data_cleaned_des['Description'].apply(lambda x: clean_description(x, des_nlp))

In [32]:
data_cleaned_des

Unnamed: 0,Uniq Id,User Id,Product Url,Product Name,Description,List Price,Sale Price,Brand,Category,Available,Rating,Rating Count,Filtered Description
0,bc645d7939917c8c8a51cab46ac93ce3,10001,https://www.walmart.com/ip/Champro-Adult-First...,Champro Adult First Down Two-Tone Football Jer...,We aim to show you accurate product informati...,25.86,25.86,Champro,Sports & Outdoors | Sports Fan Shop | Clothing...,True,2.9,2.0,Dazzle polyester double ply cowl Heavy spand...
1,65e53225e9ed8cacfd17c81a41c84335,10141,https://www.walmart.com/ip/Adoretex-Men-s-Guar...,Adoretex Mens Guard Swim Board Short Swim Trun...,We aim to show you accurate product informati...,26.39,26.39,Adoretex,Clothing | Mens Clothing | Mens Swimwear | Men...,True,3.9,63.0,This Mens Swimwear Racer features a durable ...
2,fce147ac19d2f32a0179d9f883644b91,10074,https://www.walmart.com/ip/Alex-And-Ani-Caroli...,Alex And Ani Carolina Panthers Football Charm ...,We aim to show you accurate product informati...,12.60,12.60,Alex and Ani,Sports & Outdoors | Sports | Football Gear & E...,True,1.0,1.0,Alex And Ani Carolina Panthers Football Char...
3,572ab19ce683fbdfb2ee2b9512a48951,10182,https://www.walmart.com/ip/Tokelau-ScudoPro-Sh...,Tokelau ScudoPro Short Sleeve Cycling Jersey f...,We aim to show you accurate product informati...,42.99,42.99,ScudoPro,Clothing | Mens Clothing | Mens Graphic Tees,True,5.0,1.0,"Features Soft elastics on sleeves , pockets ..."
4,06b646bb5475ef339047309cf5fac206,10160,https://www.walmart.com/ip/Men-s-White-Knit-Pu...,Mens White Knit Pullover Golf Polo Shirt - Large,We aim to show you accurate product informati...,6.49,6.49,Christmas Central,Sports & Outdoors | Sports | Golf Equipment | ...,True,0.0,0.0,Featuring everything you need to travel or p...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
944,3e4d423eca09a043684f1677cff75a6c,10040,https://www.walmart.com/ip/Men-s-OPP-Scent-Con...,Mens OPP Scent Control Camo Short Sleeve Tee S...,We aim to show you accurate product informati...,5.42,5.42,Mossy Oak,Sports & Outdoors | Outdoor Sports | Hunting |...,True,0.0,0.0,Whether you re looking for your next piece o...
945,3109c38241b3cc0584412051bc5cba40,10018,https://www.walmart.com/ip/R4B-Donna-Women-s-S...,R4B Donna - Womens Shoe W/ BOA - White/Emerald...,We aim to show you accurate product informati...,134.97,134.97,Fizik,Clothing | Shoes | Womens Shoes | All Womens S...,False,3.1,17.0,The R4B Donna offers a carefully crafted ble...
946,79c5af3a7dd93ee217cdd44b468d909b,10182,https://www.walmart.com/ip/Adidas-3-Stripes-1-...,Adidas 3 Stripes 1/4 Zip Layering Jacket,We aim to show you accurate product informati...,34.95,34.95,adidas,Sports & Outdoors | Sports | Soccer | Soccer C...,True,3.9,5.0,Adidas ¼ Zip Layering Jacket The Adidas 3Str...
947,6f377fb23da9129d8c5c25014dd8b96b,10134,https://www.walmart.com/ip/Under-Armour-Kids-L...,"Under Armour Kids Leadoff RM Baseball Cleats, ...",We aim to show you accurate product informati...,34.99,34.99,Under Armour,Clothing | Fashion Brands | Under Armour | Kids,False,3.3,3.0,Under Armour Kids Leadoff RM Baseball Cleats...


## Clean the categories

In [37]:
# Function to clean the categories
def clean_categories(cat, cat_nlp):
    # Concatenate tokens seperated by '-' to create compat words
    tokens = cat_nlp(cat)
    word_list = []
    words_to_remove = []
    
    for idx, token in enumerate(tokens):
        text = token.text
        if text == '-':
            mx = max(enumerate(tokens))[0]
            if (idx + 1 <= mx) and (idx - 1 >= 0):
                concat_tokens = ''.join([tokens[idx - 1].text, tokens[idx + 1].text])
                #concat_str = ' '.join([concat_str, concat_tokens])
                word_list.append(concat_tokens)
                words_to_remove.append(tokens[idx - 1].text)
                words_to_remove.append(tokens[idx + 1].text)
        else:
            word_list.append(text)
            #concat_str = ' '.join([concat_str, text])
    # Remove words previously seperated by '-'
    cleaned_word_list = [word for word in word_list if word not in words_to_remove]
    concat_str = ' '.join(cleaned_word_list)
    concat_str = concat_str.strip()

    return concat_str

In [248]:
data_cleaned_cat = data_cleaned_des.copy()

cat_nlp = spacy.load("en_core_web_sm")
data_cleaned_cat['Filtered Category'] = data_cleaned_cat['Category'].apply(lambda x: clean_categories(x, cat_nlp))

In [249]:
columns_ordered = ['Uniq Id', 'User Id', 'Product Url', 'Product Name', 'Description',
       'Filtered Description', 'List Price', 'Sale Price', 'Brand', 'Category',
       'Filtered Category', 'Available', 'Rating', 'Rating Count']

data_cleaned_cat = data_cleaned_cat[columns_ordered]

In [250]:
data_cleaned_cat.head(3)

Unnamed: 0,Uniq Id,User Id,Product Url,Product Name,Description,Filtered Description,List Price,Sale Price,Brand,Category,Filtered Category,Available,Rating,Rating Count
0,bc645d7939917c8c8a51cab46ac93ce3,10001,https://www.walmart.com/ip/Champro-Adult-First...,Champro Adult First Down Two-Tone Football Jer...,We aim to show you accurate product informati...,Dazzle polyester double ply cowl Heavy spand...,25.86,25.86,Champro,Sports & Outdoors | Sports Fan Shop | Clothing...,Sports & Outdoors | Sports Fan Shop | Clothing...,True,2.9,2.0
1,65e53225e9ed8cacfd17c81a41c84335,10141,https://www.walmart.com/ip/Adoretex-Men-s-Guar...,Adoretex Mens Guard Swim Board Short Swim Trun...,We aim to show you accurate product informati...,This Mens Swimwear Racer features a durable ...,26.39,26.39,Adoretex,Clothing | Mens Clothing | Mens Swimwear | Men...,Clothing | Mens Clothing | Mens Swimwear | Men...,True,3.9,63.0
2,fce147ac19d2f32a0179d9f883644b91,10074,https://www.walmart.com/ip/Alex-And-Ani-Caroli...,Alex And Ani Carolina Panthers Football Charm ...,We aim to show you accurate product informati...,Alex And Ani Carolina Panthers Football Char...,12.6,12.6,Alex and Ani,Sports & Outdoors | Sports | Football Gear & E...,Sports & Outdoors | Sports | Football Gear & E...,True,1.0,1.0


In [251]:
print(f'Cat: {data_cleaned_cat['Category'][0]}\nCleaned: {data_cleaned_cat['Filtered Category'][0]}')

Cat: Sports & Outdoors | Sports Fan Shop | Clothing Fan Shop | T-Shirts Fan Shop
Cleaned: Sports & Outdoors | Sports Fan Shop | Clothing Fan Shop | TShirts Fan Shop


In [252]:
# Checkpoint
data_cleaned = data_cleaned_cat.copy()

# Tagging

In [253]:
nlp = spacy.load("en_core_web_sm")

def clean_and_extract_tags(text):
    doc = nlp(text.lower())
    # Remove stopwords and non alphabetic words
    tags1 = [token for token in doc if token.text.isalpha() and token.text not in STOP_WORDS]
    # Select words with specific POS tags and lemmatize
    # JJ, JJR, JJS - adjective; NN, NNS - noun; NNP, NNPS - noun, proper
    pos_tags = ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS']
    tags2 = [token.lemma_ for token in tags1 if token.tag_ in pos_tags]

    return ', '.join(tags2)

In [254]:
# Create a temporary dataframe copy for tagging
temp_df = data_cleaned.copy()
columns_to_extract_tags_from = ['Product Name', 'Filtered Description', 'Brand', 'Filtered Category']

for column in columns_to_extract_tags_from:
    temp_df[column] = temp_df[column].apply(clean_and_extract_tags)

In [255]:
# Concatenate the cleaned tags from all relevant columns
temp_df['Tags'] = temp_df[columns_to_extract_tags_from].apply(lambda row: ', '.join(row), axis=1)
# Remove duplicates
temp_df['Tags'] = temp_df['Tags'].apply(lambda t: ', '.join({token for token in ' '.join(t.split(',')).split()}))

In [256]:
# Add the tags to a separtate dataframe
data_with_tags = data_cleaned.copy()
data_with_tags['Tags'] = temp_df['Tags']
data_with_tags.head()

Unnamed: 0,Uniq Id,User Id,Product Url,Product Name,Description,Filtered Description,List Price,Sale Price,Brand,Category,Filtered Category,Available,Rating,Rating Count,Tags
0,bc645d7939917c8c8a51cab46ac93ce3,10001,https://www.walmart.com/ip/Champro-Adult-First...,Champro Adult First Down Two-Tone Football Jer...,We aim to show you accurate product informati...,Dazzle polyester double ply cowl Heavy spand...,25.86,25.86,Champro,Sports & Outdoors | Sports Fan Shop | Clothing...,Sports & Outdoors | Sports Fan Shop | Clothing...,True,2.9,2.0,"sport, durability, twotone, spandex, clothing,..."
1,65e53225e9ed8cacfd17c81a41c84335,10141,https://www.walmart.com/ip/Adoretex-Men-s-Guar...,Adoretex Mens Guard Swim Board Short Swim Trun...,We aim to show you accurate product informati...,This Mens Swimwear Racer features a durable ...,26.39,26.39,Adoretex,Clothing | Mens Clothing | Mens Swimwear | Men...,Clothing | Mens Clothing | Mens Swimwear | Men...,True,3.9,63.0,"swim, leg, mens, clothing, logo, smooth, slip,..."
2,fce147ac19d2f32a0179d9f883644b91,10074,https://www.walmart.com/ip/Alex-And-Ani-Caroli...,Alex And Ani Carolina Panthers Football Charm ...,We aim to show you accurate product informati...,Alex And Ani Carolina Panthers Football Char...,12.6,12.6,Alex and Ani,Sports & Outdoors | Sports | Football Gear & E...,Sports & Outdoors | Sports | Football Gear & E...,True,1.0,1.0,"shape, sport, rafaelian, important, clothing, ..."
3,572ab19ce683fbdfb2ee2b9512a48951,10182,https://www.walmart.com/ip/Tokelau-ScudoPro-Sh...,Tokelau ScudoPro Short Sleeve Cycling Jersey f...,We aim to show you accurate product informati...,"Features Soft elastics on sleeves , pockets ...",42.99,42.99,ScudoPro,Clothing | Mens Clothing | Mens Graphic Tees,Clothing | Mens Clothing | Mens Graphic Tees,True,5.0,1.0,"design, famous, advanced, phone, jerseys, mult..."
4,06b646bb5475ef339047309cf5fac206,10160,https://www.walmart.com/ip/Men-s-White-Knit-Pu...,Mens White Knit Pullover Golf Polo Shirt - Large,We aim to show you accurate product informati...,Featuring everything you need to travel or p...,6.49,6.49,Christmas Central,Sports & Outdoors | Sports | Golf Equipment | ...,Sports & Outdoors | Sports | Golf Equipment | ...,True,0.0,0.0,"measurement, sport, mens, instruction, button,..."


In [257]:
sample = data_with_tags['Tags'][1]
sample

'swim, leg, mens, clothing, logo, smooth, slip, polyester, closure, racer, seam, guard, builtin, waistband, coverage, maximum, moderate, unrestricted, small, left, inseam, movement, men, flat, drag, lightweight, pocket, elastic, comfortable, short, size, x, velcro, adoretex, durable, cord, key, construction, long, quick, support, draw, strong, comfort, mesh, liner, drying, cargo, trunk, large, red, board'

# Generate Product IDs

In [261]:
product_key = 157890
prod_ids = [i for i in range(0, data_with_tags.shape[0], 1)]
prod_ids = pd.Series(prod_ids) + product_key

In [262]:
# Assign new product ids to the product dataset
data_prod_ids_set = data_with_tags.copy()
data_prod_ids_set['Uniq Id'] = prod_ids
data_prod_ids_set = data_prod_ids_set.drop(['User Id'], axis=1)
data_prod_ids_set.columns = ['Prod Id', 'Product Url', 'Product Name', 'Description',
       'Filtered Description', 'List Price', 'Sale Price', 'Brand', 'Category',
       'Filtered Category', 'Available', 'Rating', 'Rating Count', 'Tags']

In [263]:
data_prod_ids_set.head()

Unnamed: 0,Prod Id,Product Url,Product Name,Description,Filtered Description,List Price,Sale Price,Brand,Category,Filtered Category,Available,Rating,Rating Count,Tags
0,157890,https://www.walmart.com/ip/Champro-Adult-First...,Champro Adult First Down Two-Tone Football Jer...,We aim to show you accurate product informati...,Dazzle polyester double ply cowl Heavy spand...,25.86,25.86,Champro,Sports & Outdoors | Sports Fan Shop | Clothing...,Sports & Outdoors | Sports Fan Shop | Clothing...,True,2.9,2.0,"sport, durability, twotone, spandex, clothing,..."
1,157891,https://www.walmart.com/ip/Adoretex-Men-s-Guar...,Adoretex Mens Guard Swim Board Short Swim Trun...,We aim to show you accurate product informati...,This Mens Swimwear Racer features a durable ...,26.39,26.39,Adoretex,Clothing | Mens Clothing | Mens Swimwear | Men...,Clothing | Mens Clothing | Mens Swimwear | Men...,True,3.9,63.0,"swim, leg, mens, clothing, logo, smooth, slip,..."
2,157892,https://www.walmart.com/ip/Alex-And-Ani-Caroli...,Alex And Ani Carolina Panthers Football Charm ...,We aim to show you accurate product informati...,Alex And Ani Carolina Panthers Football Char...,12.6,12.6,Alex and Ani,Sports & Outdoors | Sports | Football Gear & E...,Sports & Outdoors | Sports | Football Gear & E...,True,1.0,1.0,"shape, sport, rafaelian, important, clothing, ..."
3,157893,https://www.walmart.com/ip/Tokelau-ScudoPro-Sh...,Tokelau ScudoPro Short Sleeve Cycling Jersey f...,We aim to show you accurate product informati...,"Features Soft elastics on sleeves , pockets ...",42.99,42.99,ScudoPro,Clothing | Mens Clothing | Mens Graphic Tees,Clothing | Mens Clothing | Mens Graphic Tees,True,5.0,1.0,"design, famous, advanced, phone, jerseys, mult..."
4,157894,https://www.walmart.com/ip/Men-s-White-Knit-Pu...,Mens White Knit Pullover Golf Polo Shirt - Large,We aim to show you accurate product informati...,Featuring everything you need to travel or p...,6.49,6.49,Christmas Central,Sports & Outdoors | Sports | Golf Equipment | ...,Sports & Outdoors | Sports | Golf Equipment | ...,True,0.0,0.0,"measurement, sport, mens, instruction, button,..."


In [265]:
# # Export
# data_prod_ids_set.to_csv('data_with_tags.csv', index=False)

In [267]:
list(data_prod_ids_set.columns.values)

['Prod Id',
 'Product Url',
 'Product Name',
 'Description',
 'Filtered Description',
 'List Price',
 'Sale Price',
 'Brand',
 'Category',
 'Filtered Category',
 'Available',
 'Rating',
 'Rating Count',
 'Tags']

In [268]:
data_prod_ids_set.dtypes

Prod Id                   int64
Product Url              object
Product Name             object
Description              object
Filtered Description     object
List Price              float64
Sale Price              float64
Brand                    object
Category                 object
Filtered Category        object
Available                  bool
Rating                  float64
Rating Count            float64
Tags                     object
dtype: object