# Predicting Resale Value of Knives from a Texas Government Surplus Store
## Using Machine Learning to Support an Ebay Store's Financial Success

### Data Obtainment Notebook


This notebook displays the code used to collect and process data from eBay using two of eBay's public APIs and by scraping their proprietary web app, "Terapeak".

In [2]:
from ebaysdk.finding import Connection
import requests
from ebaysdk.shopping import Connection as Shopping
import pandas as pd 
import  json
import numpy as np
import re
# import preprocess_ddey117 as pp
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
import ast

import seaborn as sns 

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Define Necessary Functions

In [3]:
#This function is a helper function created for the "knife_request" below. 
#It unpacks some of the nested data from eBay API calls 
#It also creates the new feature "converted_price"
#"converted_price" is the price of the item for sale plus shipping cost.
def prepare_df(df):
    """Flatten the nested Finding API fields of ``df`` into numeric columns.

    For every row the nested ``sellingStatus``, ``shippingInfo`` and
    ``condition`` values are unpacked into:

    * ``price_in_US``     -- ``sellingStatus.convertedCurrentPrice.value``
    * ``shipping_cost``   -- ``shippingInfo.shippingServiceCost.value``,
      or 0.0 when the listing carries no shipping cost
    * ``condition``       -- ``condition.conditionId``, or 0.0 when absent
      (this overwrites the raw nested ``condition`` column)
    * ``converted_price`` -- ``price_in_US + shipping_cost``

    Rows are then de-duplicated on ``itemId`` (first occurrence kept) and
    the index is reset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw ``sellingStatus``, ``shippingInfo``,
        ``condition`` and ``itemId`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    price_list = []
    ship_price_list = []
    condition_list = []

    # Read the rows of the frame that was passed in (not a module-level
    # list), so the derived columns always line up with ``df``.
    for row in df.to_dict('records'):
        price_list.append(
            float(row['sellingStatus']['convertedCurrentPrice']['value'])
        )

        # Missing or malformed nested values fall back to 0.0.
        try:
            shipping = float(row['shippingInfo']['shippingServiceCost']['value'])
        except (KeyError, TypeError, ValueError):
            shipping = 0.0
        ship_price_list.append(shipping)

        try:
            condition = float(row['condition']['conditionId'])
        except (KeyError, TypeError, ValueError):
            condition = 0.0
        condition_list.append(condition)

    df['shipping_cost'] = ship_price_list
    df['price_in_US'] = price_list
    df['condition'] = condition_list

    # create new feature 'converted price'
    df['converted_price'] = df['shipping_cost'] + df['price_in_US']
    df.drop_duplicates(subset=['itemId'], keep='first', inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

# Purchase price (US dollars) of each brand's knives at the surplus store.
# Insertion order is significant: prepare_brands picks a brand by position.
bucket_dict = dict(
    benchmade=45.0,
    buck=20.0,
    case=20.0,
    crkt=15.0,
    kershaw=15.0,
    sog=15.0,
    spyderco=30.0,
    victorinox=20.0,
)


def prepare_brands(df, bucket_dict_position, overhead_cost=3):
    """Add brand, cost, profit and ROI columns for one brand's listings.

    Helper for ``knife_request``. All monetary columns are in US dollars.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings with ``title`` and ``converted_price`` columns; modified
        in place.
    bucket_dict_position : int
        Position of the brand inside ``bucket_dict``.
    overhead_cost : number, default 3
        Flat overhead charged against every knife.

    Returns
    -------
    pandas.DataFrame
        ``df`` with a lower-cased ``title`` and the new ``brand``, ``cost``,
        ``profit`` and ``ROI`` (percent) columns.
    """
    # Titles are normalised to lower case before anything else happens.
    df.title = df.title.map(str.lower)

    brand_name, brand_cost = list(bucket_dict.items())[bucket_dict_position]
    df['brand'] = str(brand_name)
    df['cost'] = float(brand_cost)

    # 87% of the sale price is kept; purchase cost and overhead come off that.
    kept_revenue = df['converted_price'] * .87
    df['profit'] = kept_revenue - df['cost'] - overhead_cost

    money_in = df['cost'] + overhead_cost
    df['ROI'] = (df['profit'] / money_in) * 100.0

    return df
# Help organize paginated data from API calls
def prepare_data(data_list):
    """Keep only the fields of interest from raw Finding API items.

    Parameters
    ----------
    data_list : iterable of dict
        Item dictionaries, e.g. the list found under
        ``searchResult -> item`` in a ``findItemsAdvanced`` response.

    Returns
    -------
    list of dict
        Exactly one new dictionary per input item, holding the keys listed
        in ``keys`` below. A key the item does not carry maps to ``None``.
    """
    keys = ('itemId', 'title', 'galleryURL',
            'viewItemURL', 'autoPay', 'postalCode',
            'sellingStatus', 'shippingInfo', 'listingInfo',
            'returnsAccepted', 'condition', 'topRatedListing',
            'galleryPlusPictureURL', 'pictureURLLarge',
            'pictureURLSuperSize')

    # One record per item: the record is built completely before it is
    # collected, so an item is never emitted more than once.
    return [{key: item.get(key) for key in keys} for item in data_list]
#main function for making findingAPI calls to eBay
def _knife_search_request(brand, page_number):
    """Return the findItemsAdvanced payload for one result page of ``brand``."""
    return {
        'categoryId': 48818,
        'itemFilter': [
            {'name': 'ListingType', 'value': 'FixedPrice'}
        ],
        'aspectFilter': [
            {'aspectName': 'Brand', 'aspectValueName': brand}
        ],
        'outputSelector': ['PictureURLLarge', 'PictureURLSuperSize'],
        'paginationInput': {
            'entriesPerPage': 100,
            'pageNumber': page_number,
        },
    }


def knife_request(Brand, dict_pos, max_pages=100):
    """Download all fixed-price listings of one brand via the Finding API.

    The first response is used both for its items and for
    ``paginationOutput.totalPages``; the remaining pages are then requested
    in order over the same connection.

    Parameters
    ----------
    Brand : str
        Value of the eBay ``Brand`` aspect to filter on.
    dict_pos : int
        Position of the brand in ``bucket_dict`` (passed to
        ``prepare_brands``).
    max_pages : int, default 100
        Upper bound on the number of pages requested. 100 is the cap the
        original code used (presumably the Finding API's pagination
        limit -- confirm against the eBay documentation).

    Returns
    -------
    pandas.DataFrame
        Listings after ``prepare_df`` and ``prepare_brands`` processing.
    """
    api = Connection(config_file='ebay.yaml', debug=False, siteid="EBAY-US")

    first_page = api.execute(
        'findItemsAdvanced', _knife_search_request(Brand, 1)
    ).dict()
    total_pages = int(first_page['paginationOutput']['totalPages'])

    # Pages are 1-based and the last page is included, so the loop runs to
    # pages_to_request + 1.
    pages_to_request = min(total_pages, max_pages)

    full_dataset = []
    for page in range(1, pages_to_request + 1):
        if page == 1:
            # Already fetched above; no need to ask for it twice.
            response_dict = first_page
        else:
            response_dict = api.execute(
                'findItemsAdvanced', _knife_search_request(Brand, page)
            ).dict()

        # A page without results has no 'item' entry; treat it as empty
        # instead of aborting the whole download.
        results_list_of_dicts = (
            (response_dict.get('searchResult') or {}).get('item') or []
        )

        # extend (not append) keeps full_dataset a flat list of records
        full_dataset.extend(prepare_data(results_list_of_dicts))

    # Number of raw records collected before de-duplication.
    display(len(full_dataset))

    df = pd.DataFrame(full_dataset)
    df = prepare_df(df)
    df = prepare_brands(df, dict_pos)

    return df
#Used to prepare data from eBays shopping API
#Shopping API used to collect more detailed info
#about individual knives
def prepare_dataIds(data_list):
    """Keep only the fields of interest from raw Shopping API items.

    Parameters
    ----------
    data_list : iterable of dict
        Item dictionaries, e.g. the ``Item`` list of a ``GetMultipleItems``
        response.

    Returns
    -------
    list of dict
        Exactly one new dictionary per input item, holding the keys listed
        in ``keys`` below. A key the item does not carry maps to ``None``.
    """
    keys = ('ItemID', 'GalleryURL', 'PictureURL',
            'Location', 'ConvertedCurrentPrice',
            'Title', 'ItemSpecifics',
            'Country', 'ConditionID')

    # One record per item: the record is built completely before it is
    # collected, so an item is never emitted more than once.
    return [{key: item.get(key) for key in keys} for item in data_list]
#The Shopping API accepts a maximum of 20 itemIDs per call.
#This function was created to automate
#making API calls in chunks of 20 unique itemIds.
def process_list(my_list):
    """Fetch item specifics for one batch of item IDs.

    Calls the Shopping API's ``GetMultipleItems`` with ``my_list`` and adds
    the prepared records to the module-level ``full_dataset`` list, which
    the caller must have created beforehand.

    Parameters
    ----------
    my_list : list
        Item IDs to look up in a single call.

    Returns
    -------
    list
        The module-level ``full_dataset`` list, after it has been extended.
    """
    api = Shopping(config_file='ebay.yaml', debug=False, siteid="EBAY-US")
    request = {
               'itemID': my_list,
               'IncludeSelector': 'ItemSpecifics'
              }
    response_dict = api.execute('GetMultipleItems', request).dict()

    # A batch may come back without any 'Item' entry; skip it rather than
    # abort the whole crawl with a KeyError.
    results_list_of_dicts = response_dict.get('Item') or []

    # Guard against a lone item arriving as a single mapping instead of a
    # list of mappings.
    if isinstance(results_list_of_dicts, dict):
        results_list_of_dicts = [results_list_of_dicts]

    # extend (not append) keeps full_dataset a flat list of records
    full_dataset.extend(prepare_dataIds(results_list_of_dicts))

    return full_dataset

# Purchase price (US dollars) of each brand's knives at the surplus store.
# Insertion order is significant: prepare_tera_df picks a brand by position.
bucket_dict = {'benchmade': 45.0,
               'buck': 20.0,
               'case': 20.0,
               'crkt': 15.0,
               'kershaw': 15.0,
               'sog': 15.0,
               'spyderco': 30.0,
               'victorinox': 20.0
              }


def prepare_tera_df(df, x, overhead_cost=3):
    """Reformat scraped Terapeak data and add the profit columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped sales with string ``price_in_US`` and ``shipping_cost``
        columns (values such as ``"$1,234.50"``). Modified in place.
    x : int
        Position of the brand inside ``bucket_dict``.
    overhead_cost : number, default 3
        Flat overhead charged against every knife.

    Returns
    -------
    pandas.DataFrame
        ``df`` with numeric ``price_in_US`` / ``shipping_cost`` and the new
        ``converted_price``, ``profit``, ``ROI`` (percent), ``brand`` and
        ``cost`` columns.
    """
    brand, cost = list(bucket_dict.items())[x]

    for column in ('price_in_US', 'shipping_cost'):
        # regex=False: "$" must be removed as a literal character; as a
        # regular expression it is an end-of-string anchor and would leave
        # the dollar sign in place.
        cleaned = (df[column]
                   .str.replace("$", "", regex=False)
                   .str.replace(",", "", regex=False))
        df[column] = cleaned.apply(float)

    df['converted_price'] = (df['price_in_US'] + df['shipping_cost'])

    # 87% of the sale price is kept; purchase cost and overhead come off that.
    df['profit'] = ((df['converted_price']*.87) - cost - overhead_cost)
    # ROI is measured against everything spent on the knife (purchase cost
    # plus overhead), the same definition prepare_brands uses.
    df['ROI'] = (df['profit']/(cost + overhead_cost))*100.0

    df['brand'] = brand
    df['cost'] = cost

    return df

# helper function with "transform_item_specifics"
def fix(col):
    """Collapse a list of name/value dictionaries into one flat dictionary.

    Every dictionary with exactly two entries contributes
    ``first value -> second value``; dictionaries of any other size are
    ignored. Later duplicates of a name overwrite earlier ones.
    """
    dd = dict()
    for d in col:
        values = list(d.values())
        if len(values) == 2:
            dd[values[0]] = values[1]
    return dd


#function for extracting item specifics from Shopping API data
def transform_item_specifics(df, perc=65.0):
    """Expand the ``ItemSpecifics`` column into one column per specific.

    Side effects on ``df`` (in place): rows without ``ItemSpecifics`` are
    dropped, ``ItemSpecifics`` is replaced by a flat ``{name: value}``
    dictionary, and an ``item_list`` column holding the raw
    ``NameValueList`` is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ItemSpecifics`` column whose cells are dictionaries
        (or their string representation, as read back from CSV) containing
        a ``NameValueList`` entry.
    perc : float, default 65.0
        Drops a column unless it has at least
        ``int((100 - perc) / 100 * n_rows + 1)`` non-null values.

    Returns
    -------
    pandas.DataFrame
        One row per remaining row of ``df`` and carrying the same index
        labels, so ``df.join(result)`` lines the rows up correctly.
    """
    df.dropna(subset=['ItemSpecifics'], inplace=True)

    # Cells read back from CSV are strings; cells that are already parsed
    # are left untouched.
    df['ItemSpecifics'] = df['ItemSpecifics'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['item_list'] = df['ItemSpecifics'].apply(lambda x: x['NameValueList'])

    # A single specific arrives as one dict instead of a list of dicts.
    df['ItemSpecifics'] = df['item_list'].apply(
        lambda pairs: [pairs] if isinstance(pairs, dict) else pairs)

    df['ItemSpecifics'] = df['ItemSpecifics'].apply(fix)

    specs = pd.json_normalize(df['ItemSpecifics'].tolist())
    # json_normalize numbers its rows 0..n-1, while df keeps the labels
    # that survived dropna; copy them over so a later join stays aligned.
    specs.index = df.index

    min_count = int(((100-perc)/100)*specs.shape[0] + 1)
    mod_df = specs.dropna(axis=1,
                          thresh=min_count)

    return mod_df

# This function removes noisy data
#lots/sets/groups of knives can
#confuse the model from predicting
#the appropriate value of individual knives
def data_cleaner(df):
    """Drop listings whose title points to a lot, group or set of knives.

    The ``title`` column is lower-cased first; every row whose title
    matches one of the patterns below is then removed. ``df`` is modified
    in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text ``title`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object without the matching rows.
    """
    # Raw strings keep the backslash escapes intact for the regex engine.
    patterns = (
        re.compile(r'(?<!-\S)lot(?![^\s.,:?!])'),
        re.compile(r'(group)'),
        re.compile(r'(?<!-\S)set(?![^\s.,?!])'),
    )

    df['title'] = df['title'].str.lower()

    # A missing title cannot match anything and is kept.
    flagged = pd.Series(
        [isinstance(title, str) and any(p.search(title) for p in patterns)
         for title in df['title']],
        index=df.index,
        dtype=bool,
    )

    # Dropping by mask works whether or not any title matched.
    df.drop(flagged[flagged].index, inplace=True)

    return df

In [4]:
bucket_dict

{'benchmade': 45.0,
 'buck': 20.0,
 'case': 20.0,
 'crkt': 15.0,
 'kershaw': 15.0,
 'sog': 15.0,
 'spyderco': 30.0,
 'victorinox': 20.0}

Beginning of API calls for listed data. To be merged with item specific data using ebay itemIds.

### Domain Understanding: Cost Breakdown
- padded envelopes: \$0.50 per knife
- flat-rate shipping: \$4.45 per knife
- brand knife at surplus store: 15, 20, 30, or 45 dollars per knife
- overhead expenses (gas, cleaning supplies, sharpening supplies, etc.): \$3
- eBay's commission, with 13\% being a reasonable approximation

## Listed Data

Running functions to call the Finding API and return datasets of fixed-price knife listings (category ID 48818) listed on eBay in the last 90 days. (TODO: explain how eBay's listing rules work.)

```
# Download the listings of each brand. The second argument is the brand's
# position in bucket_dict; 'Case' and 'Case XX' both use position 2.
bench_df = knife_request('Benchmade', 0)
buck_df = knife_request('Buck', 1)
case_df = knife_request('Case', 2)
df_caseXX = knife_request('Case XX', 2)
df_crkt = knife_request("CRKT", 3)
df_sog = knife_request('SOG', 5)
df_spyderco = knife_request('Spyderco', 6)


# Save one CSV per brand.
# NOTE(review): the Benchmade frame is written to df_bench1.csv, while the
# loading cells further down read listed_data/df_bench.csv -- confirm which
# file name is intended.
bench_df.to_csv('listed_data/df_bench1.csv', index=False)
buck_df.to_csv('listed_data/df_buck.csv', index=False)
case_df.to_csv('listed_data/df_case.csv', index=False)
df_caseXX.to_csv('listed_data/df_CaseXX.csv', index=False)
df_crkt.to_csv('listed_data/df_crkt.csv', index=False)
df_sog.to_csv('listed_data/df_sog.csv', index=False)
df_spyderco.to_csv('listed_data/df_spyderco.csv', index=False)
```

Kershaw and Victorinox data were requested using the Finding API below, after tweaking the pagination through trial and error to maximize the data returned.

```
# Download the fixed-price Kershaw listings page by page. The page range
# was settled on by hand for this brand.
api = Connection(config_file='ebay.yaml', debug=False, siteid="EBAY-US")

full_dataset = []
for page in range(1, 57):
    request = {
                'categoryId': 48818,
                'itemFilter': [
                                {'name': 'ListingType', 'value': 'FixedPrice'}
                              ],
                'aspectFilter': [
                                  {'aspectName': 'Brand', 'aspectValueName': 'Kershaw'}],

                'outputSelector': ['PictureURLLarge', 'PictureURLSuperSize'],

                'paginationInput': {
                                    'entriesPerPage': 100,
                                    'pageNumber': page
                                    },
                }

    response_dict = api.execute('findItemsAdvanced', request).dict()

    # the listings of this page
    results_list_of_dicts = response_dict['searchResult']['item']

    # extend (not append) keeps full_dataset a flat list of records
    full_dataset.extend(prepare_data(results_list_of_dicts))

# Build the frame once, after every page has been collected, instead of
# rebuilding it from the growing list on each iteration.
df = pd.DataFrame(full_dataset)

df_kershaw = prepare_df(df)
df_kershaw = prepare_brands(df_kershaw, 4)
df_kershaw.to_csv('listed_data/df_kershaw.csv', index=False)

```


```
# Download the fixed-price Victorinox listings, one page per request.
full_dataset = []
for page in range(1, 86):

    api = Connection(config_file='ebay.yaml', debug=False, siteid="EBAY-US")

    pagination = {'entriesPerPage': 100, 'pageNumber': page}
    request = {
        'categoryId': 48818,
        'itemFilter': [{'name': 'ListingType', 'value': 'FixedPrice'}],
        'aspectFilter': [{'aspectName': 'Brand', 'aspectValueName': 'Victorinox'}],
        'outputSelector': ['PictureURLLarge', 'PictureURLSuperSize'],
        'paginationInput': pagination,
    }

    response = api.execute('findItemsAdvanced', request)
    page_items = response.dict()['searchResult']['item']

    # keep full_dataset a flat list of records
    full_dataset.extend(prepare_data(page_items))

# Flatten the nested fields, add the brand columns (slot 7) and save.
df_victorinox = prepare_brands(prepare_df(pd.DataFrame(full_dataset)), 7)
df_victorinox.to_csv('listed_data/df_victorinox.csv', index=False)
```

Start of the API call section that uses the item IDs from the previously listed datasets to get item-specific data from eBay. This returns more descriptive information about the knives, pulled from a container on the website that sellers must complete to post a listing.

In [39]:
# Load the per-brand listing files saved by the Finding API section.
(df_bench, df_buck, df_case,
 df_caseXX, df_crkt, df_kersh,
 df_sog, df_spyd, df_vict) = (
    pd.read_csv(path)
    for path in (
        "listed_data/df_bench.csv",
        "listed_data/df_buck.csv",
        "listed_data/df_case.csv",
        "listed_data/df_CaseXX.csv",
        "listed_data/df_crkt.csv",
        "listed_data/df_kershaw.csv",
        "listed_data/df_sog.csv",
        "listed_data/df_spyderco.csv",
        "listed_data/df_victorinox.csv",
    )
)

In [6]:
# Every per-brand listing frame, in brand order.
df_list = [
    df_bench, df_buck, df_case, df_caseXX, df_crkt,
    df_kersh, df_sog, df_spyd, df_vict,
]

# Remove the galleryPlusPictureURL column from each frame in place.
for dataframe in df_list:
    dataframe.drop(columns='galleryPlusPictureURL', inplace=True)
    

In [7]:
# Plain Python lists of item IDs, one per brand, for the Shopping API calls.
benchIds = df_bench['itemId'].tolist()
buckIds = df_buck['itemId'].tolist()
caseIds = df_case['itemId'].tolist()
caseXXIds = df_caseXX['itemId'].tolist()
crktIds = df_crkt['itemId'].tolist()
kershawIds = df_kersh['itemId'].tolist()
sogIds = df_sog['itemId'].tolist()
spydIds = df_spyd['itemId'].tolist()
victIds = df_vict['itemId'].tolist()

ShoppingAPI call to return benchmade item specific data.
```
# process_list adds its results to the global full_dataset; start it empty
# and send the Benchmade IDs 20 at a time.
full_dataset = []
for start in range(0, len(benchIds), 20):
    process_list(benchIds[start:start + 20])

bench = pd.DataFrame(full_dataset).drop_duplicates(subset=['ItemID'])
bench.info()
```

ShoppingAPI call to return buck item specific data.
```
# process_list adds its results to the global full_dataset; start it empty
# and send the Buck IDs 20 at a time.
full_dataset = []
for start in range(0, len(buckIds), 20):
    process_list(buckIds[start:start + 20])

buck = pd.DataFrame(full_dataset).drop_duplicates(subset=['ItemID'])
buck.info()
```

ShoppingAPI call to return case brand item specific data.
```
# process_list adds its results to the global full_dataset; start it empty
# and send the Case IDs 20 at a time.
full_dataset = []
for start in range(0, len(caseIds), 20):
    process_list(caseIds[start:start + 20])

# Note: this rebinds df_case, the name the Case listing frame was loaded under.
df_case = pd.DataFrame(full_dataset).drop_duplicates(subset=['ItemID'])
df_case.info()
```

ShoppingAPI call to return caseXX brand item specific data.
```
# process_list adds its results to the global full_dataset; start it empty
# and send the Case XX IDs 20 at a time.
full_dataset = []
for start in range(0, len(caseXXIds), 20):
    process_list(caseXXIds[start:start + 20])

# Note: this rebinds df_caseXX, the name the Case XX listing frame was loaded under.
df_caseXX = pd.DataFrame(full_dataset).drop_duplicates(subset=['ItemID'])
df_caseXX.info()
```

ShoppingAPI call to return crkt item specific data.
```
# process_list adds its results to the global full_dataset; start it empty
# and send the CRKT IDs 20 at a time.
full_dataset = []
for start in range(0, len(crktIds), 20):
    process_list(crktIds[start:start + 20])

crkt = pd.DataFrame(full_dataset).drop_duplicates(subset=['ItemID'])
crkt.info()
```

ShoppingAPI call to return kershaw item specific data.
```
# process_list adds its results to the global full_dataset; start it empty
# and send the Kershaw IDs 20 at a time.
full_dataset = []
for start in range(0, len(kershawIds), 20):
    process_list(kershawIds[start:start + 20])

kershaw = pd.DataFrame(full_dataset).drop_duplicates(subset=['ItemID'])
kershaw.info()
```

ShoppingAPI call to return SOG item specific data.
```
# process_list adds its results to the global full_dataset; start it empty
# and send the SOG IDs 20 at a time.
full_dataset = []
for start in range(0, len(sogIds), 20):
    process_list(sogIds[start:start + 20])

sog = pd.DataFrame(full_dataset).drop_duplicates(subset=['ItemID'])
sog.info()
```

ShoppingAPI call to return spyderco item specific data.
```
# process_list adds its results to the global full_dataset; start it empty
# and send the Spyderco IDs 20 at a time.
full_dataset = []
for start in range(0, len(spydIds), 20):
    process_list(spydIds[start:start + 20])

spyd = pd.DataFrame(full_dataset).drop_duplicates(subset=['ItemID'])
spyd.info()
```

ShoppingAPI call to return victorinox item specific data.
```
# process_list adds its results to the global full_dataset; start it empty
# and send the Victorinox IDs 20 at a time.
full_dataset = []
for start in range(0, len(victIds), 20):
    process_list(victIds[start:start + 20])

vict = pd.DataFrame(full_dataset).drop_duplicates(subset=['ItemID'])
vict.info()
```

```
# Save the item-specific frame of each brand. Only frames that are built in
# this notebook are written; these are the nine files the loading cell below
# reads back.
bench.to_csv("listed_data/benchIds.csv", index=False)
buck.to_csv("listed_data/buckIds.csv", index=False)
df_case.to_csv("listed_data/caseIds.csv", index=False)
df_caseXX.to_csv("listed_data/caseXXIds.csv", index=False)
crkt.to_csv("listed_data/crktIds.csv", index=False)
kershaw.to_csv("listed_data/kershawIds.csv", index=False)
sog.to_csv("listed_data/sogIds.csv", index=False)
spyd.to_csv("listed_data/spydIds.csv", index=False)
vict.to_csv("listed_data/victIds.csv", index=False)
```

Beginning of prep to merge the original listed data with the item-specific data, which was requested using a separate API, for more complete details about all listings gathered.

In [40]:
# Load the per-brand item-specific files saved by the Shopping API section.
(bench, buck, case,
 caseXX, crkt, kershaw,
 sog, spyd, vict) = (
    pd.read_csv(path)
    for path in (
        "listed_data/benchIds.csv",
        "listed_data/buckIds.csv",
        "listed_data/caseIds.csv",
        "listed_data/caseXXIds.csv",
        "listed_data/crktIds.csv",
        "listed_data/kershawIds.csv",
        "listed_data/sogIds.csv",
        "listed_data/spydIds.csv",
        "listed_data/victIds.csv",
    )
)

In [24]:
# Stack the item-specific frames of all nine brands on top of each other.
Ids_df = pd.concat(
    objs=[bench, buck, case, caseXX, crkt, kershaw, sog, spyd, vict]
)

In [25]:
# Item-specific frames whose columns are inspected here (vict is not in
# this list).
df_list = [
    bench, buck, case, caseXX,
    crkt, kershaw, sog, spyd,
]

# Show the column index of each frame to compare them by eye.
for dataframe in df_list:
    display(dataframe.columns)
    

Index(['ItemID', 'GalleryURL', 'PictureURL', 'Location', 'ConvertedCurrentPrice', 'Title', 'ItemSpecifics', 'Country', 'ConditionID'], dtype='object')

Index(['ItemID', 'GalleryURL', 'PictureURL', 'Location', 'ConvertedCurrentPrice', 'Title', 'ItemSpecifics', 'Country', 'ConditionID'], dtype='object')

Index(['ItemID', 'GalleryURL', 'PictureURL', 'Location', 'ConvertedCurrentPrice', 'Title', 'ItemSpecifics', 'Country', 'ConditionID'], dtype='object')

Index(['ItemID', 'GalleryURL', 'PictureURL', 'Location', 'ConvertedCurrentPrice', 'Title', 'ItemSpecifics', 'Country', 'ConditionID'], dtype='object')

Index(['ItemID', 'GalleryURL', 'PictureURL', 'Location', 'ConvertedCurrentPrice', 'Title', 'ItemSpecifics', 'Country', 'ConditionID'], dtype='object')

Index(['ItemID', 'GalleryURL', 'PictureURL', 'Location', 'ConvertedCurrentPrice', 'Title', 'ItemSpecifics', 'Country', 'ConditionID'], dtype='object')

Index(['ItemID', 'GalleryURL', 'PictureURL', 'Location', 'ConvertedCurrentPrice', 'Title', 'ItemSpecifics', 'Country', 'ConditionID'], dtype='object')

Index(['ItemID', 'GalleryURL', 'PictureURL', 'Location', 'ConvertedCurrentPrice', 'Title', 'ItemSpecifics', 'Country', 'ConditionID'], dtype='object')

In [26]:
# All nine item-specific frames.
df_list = [
    bench, buck, case, caseXX, crkt,
    kershaw, sog, spyd, vict,
]

for dataframe in df_list:
    # Rename to the column names the listing frames use.
    dataframe.rename(columns={'Title': 'title', 'ItemID': 'itemId'},
                     inplace=True)

    dataframe.drop(columns=['ConditionID', 'ConvertedCurrentPrice'],
                   inplace=True)

    # Lower-case the titles.
    dataframe['title'] = dataframe['title'].str.lower()

In [27]:
# Merge each brand's listing frame with its item-specific frame. No keys
# are given, so pandas merges on the columns both frames share.
(bench_merged, buck_merged, case_merged,
 caseXX_merged, crkt_merged, kershaw_merged,
 spyd_merged, sog_merged, vict_merged) = (
    listed.merge(specifics)
    for listed, specifics in (
        (df_bench, bench),
        (df_buck, buck),
        (df_case, case),
        (df_caseXX, caseXX),
        (df_crkt, crkt),
        (df_kersh, kershaw),
        (df_spyd, spyd),
        (df_sog, sog),
        (df_vict, vict),
    )
)

In [28]:
# Expand the ItemSpecifics column of every merged frame into its own frame.
(bench_spec, buck_spec, case_spec,
 caseXX_spec, crkt_spec, kershaw_spec,
 sog_spec, spyd_spec, vict_spec) = map(
    transform_item_specifics,
    (bench_merged, buck_merged, case_merged,
     caseXX_merged, crkt_merged, kershaw_merged,
     sog_merged, spyd_merged, vict_merged),
)

In [29]:
# The item-specific frames of all nine brands, collected so that later
# cells can loop over them.
specs_list = [bench_spec, buck_spec,
              case_spec, caseXX_spec,
              crkt_spec, kershaw_spec,
              sog_spec, spyd_spec,
              vict_spec]

In [30]:
# Print the column summary (non-null counts and dtypes) of each frame.
for dataframe in specs_list:
    dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979 entries, 0 to 1978
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Blade Material                 1008 non-null   object
 1   Model                          1696 non-null   object
 2   Product Line                   772 non-null    object
 3   Opening Mechanism              1105 non-null   object
 4   Number of Blades               1180 non-null   object
 5   Handle Material                1301 non-null   object
 6   Blade Type                     1042 non-null   object
 7   Brand                          1947 non-null   object
 8   Color                          1431 non-null   object
 9   Type                           1497 non-null   object
 10  Country/Region of Manufacture  1166 non-null   object
 11  Lock Type                      831 non-null    object
 12  Blade Edge                     1045 non-null   object
 13  Dex

In [31]:
# Remove the 'Brand' item specific from every frame in place.
for dataframe in specs_list:
    dataframe.drop(columns='Brand', inplace=True)

In [33]:
# Attach the item-specific columns to each merged frame (index join).
(tot_bench, tot_buck, tot_case,
 tot_caseXX, tot_crkt, tot_kershaw,
 tot_sog, tot_spyd, tot_vict) = (
    merged.join(spec)
    for merged, spec in (
        (bench_merged, bench_spec),
        (buck_merged, buck_spec),
        (case_merged, case_spec),
        (caseXX_merged, caseXX_spec),
        (crkt_merged, crkt_spec),
        (kershaw_merged, kershaw_spec),
        (sog_merged, sog_spec),
        (spyd_merged, spyd_spec),
        (vict_merged, vict_spec),
    )
)

In [34]:
# Write every combined per-brand frame to its own CSV file.
for frame, path in (
    (tot_bench, 'listed_data/total_list_bench.csv'),
    (tot_buck, 'listed_data/total_list_buck.csv'),
    (tot_case, 'listed_data/total_list_case.csv'),
    (tot_caseXX, 'listed_data/total_list_caseXX.csv'),
    (tot_crkt, 'listed_data/total_list_crkt.csv'),
    (tot_kershaw, 'listed_data/total_list_kershaw.csv'),
    (tot_sog, 'listed_data/total_list_sog.csv'),
    (tot_spyd, 'listed_data/total_list_spyd.csv'),
    (tot_vict, 'listed_data/total_list_vict.csv'),
):
    frame.to_csv(path, index=False)

In [46]:
# Reload the untouched per-brand listing files ...
(df_bench, df_buck, df_case,
 df_caseXX, df_crkt, df_kersh,
 df_sog, df_spyd, df_vict) = (
    pd.read_csv(path)
    for path in (
        "listed_data/df_bench.csv",
        "listed_data/df_buck.csv",
        "listed_data/df_case.csv",
        "listed_data/df_CaseXX.csv",
        "listed_data/df_crkt.csv",
        "listed_data/df_kershaw.csv",
        "listed_data/df_sog.csv",
        "listed_data/df_spyderco.csv",
        "listed_data/df_victorinox.csv",
    )
)

# ... and the untouched per-brand item-specific files.
(bench, buck, case,
 caseXX, crkt, kershaw,
 sog, spyd, vict) = (
    pd.read_csv(path)
    for path in (
        "listed_data/benchIds.csv",
        "listed_data/buckIds.csv",
        "listed_data/caseIds.csv",
        "listed_data/caseXXIds.csv",
        "listed_data/crktIds.csv",
        "listed_data/kershawIds.csv",
        "listed_data/sogIds.csv",
        "listed_data/spydIds.csv",
        "listed_data/victIds.csv",
    )
)

In [47]:
# Stack the listing frames of all brands into one frame.
listed_df = pd.concat([df_bench,df_buck,
                       df_case,df_caseXX,
                       df_crkt,df_kersh,
                       df_sog,df_spyd,
                       df_vict])

listed_df.drop('galleryPlusPictureURL', axis=1, inplace=True)

# Stack the item-specific frames of all brands the same way.
Ids_df = pd.concat([bench,buck,
                   case,caseXX,
                   crkt,kershaw,
                   sog,spyd,vict])



# Use the listing frame's column names so the two frames share merge keys.
Ids_df.rename({'Title': 'title',
               'ItemID': 'itemId'},
               axis=1,inplace=True)
    
Ids_df.drop(['ConditionID','ConvertedCurrentPrice'], 
             axis=1, inplace=True)




Ids_df['title'] = Ids_df['title'].str.lower()


# No keys are given, so the merge uses the columns both frames share.
df_merged = listed_df.merge(Ids_df)

# transform_item_specifics also changes df_merged in place: rows without
# ItemSpecifics are dropped and an 'item_list' column is added.
df_spec = transform_item_specifics(df_merged, perc=65.0)

df_spec.drop('Brand', axis=1, inplace=True)

# Index join of the listings with their item-specific columns.
tot_listed_df = df_merged.join(df_spec)

# Remove lot/group/set listings, then the raw nested columns.
listed_knives = data_cleaner(tot_listed_df).copy()
listed_knives.drop(['sellingStatus', 'shippingInfo', 
                    'GalleryURL', 'ItemSpecifics', 
                    'item_list', 'listingInfo'], 
                    axis=1, inplace=True)
# Keep every listing whose condition is not 1000.0 (presumably eBay's
# "new" condition ID -- confirm).
listed_used_knives = listed_knives.loc[listed_knives['condition'] != 1000.0]
listed_used_knives.reset_index(drop=True, inplace=True)

In [48]:
listed_used_knives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11855 entries, 0 to 11854
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   itemId                         11855 non-null  int64  
 1   title                          11855 non-null  object 
 2   galleryURL                     11854 non-null  object 
 3   viewItemURL                    11855 non-null  object 
 4   autoPay                        11855 non-null  bool   
 5   postalCode                     11576 non-null  object 
 6   returnsAccepted                11855 non-null  bool   
 7   condition                      11854 non-null  float64
 8   topRatedListing                11855 non-null  bool   
 9   pictureURLLarge                11208 non-null  object 
 10  pictureURLSuperSize            11159 non-null  object 
 11  shipping_cost                  11855 non-null  float64
 12  price_in_US                    11855 non-null 

In [49]:
# Remove the 'Original/Reproduction' item specific in place.
listed_knives.drop(columns=['Original/Reproduction'], inplace=True)

In [50]:
# Save the cleaned listings. This writes listed_knives (all conditions),
# not the listed_used_knives subset built above.
listed_knives.to_csv("listed_data/listed_knives_df.csv", index=False)

End of the section for obtaining listed data from eBay APIs. Below is the start of processing data scraped from eBay's seller-exclusive website. This data goes back 2 years and is filtered to include only used knives with final sale values. The listed data above only goes back 90 days and only shows listings currently up for sale.

In [51]:
# Load every scraped Terapeak file; sold_list keeps the frames in read order.
sold_list = [
    pd.read_csv(path)
    for path in (
        "terapeak_data/bench_scraped2.csv",
        "terapeak_data/buck_scraped2.csv",
        "terapeak_data/buck_scraped2_reversed.csv",
        "terapeak_data/case_scraped2.csv",
        "terapeak_data/caseXX_scraped2.csv",
        "terapeak_data/caseXX2_reversed.csv",
        "terapeak_data/crkt_scraped.csv",
        "terapeak_data/kershaw_scraped2.csv",
        "terapeak_data/kershaw_scraped2_reversed.csv",
        "terapeak_data/SOG_scraped2.csv",
        "terapeak_data/spyd_scraped2.csv",
        "terapeak_data/vict_scraped.csv",
        "terapeak_data/vict_reversed.csv",
    )
]

# Give each frame its own name as well.
(sold_bench, sold_buck1, sold_buck2,
 sold_case, sold_caseXX1, sold_caseXX2,
 sold_crkt, sold_kershaw1, sold_kershaw2,
 sold_sog, sold_spyd, sold_vict1,
 sold_vict2) = sold_list

In [52]:
# Label every scraped frame so its summary can be printed under a heading.
df_dict = {'benchmade': sold_bench,
           'buck1': sold_buck1,
           'buck2': sold_buck2,
           'case': sold_case,
           'caseXX1': sold_caseXX1,
           'caseXX2': sold_caseXX2,
           'crkt': sold_crkt,
           'kershaw1': sold_kershaw1,
           'kershaw2': sold_kershaw2,
           'sog': sold_sog,
           'spyderco': sold_spyd,
           'vict1': sold_vict1,
           'vict2': sold_vict2}


for key, val in df_dict.items():
    print(key)
    # DataFrame.info() prints its report itself and returns None; passing
    # that result to display() only added a stray "None" after each report.
    val.info()

benchmade
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8791 entries, 0 to 8790
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        8791 non-null   object
 1   url          1843 non-null   object
 2   date_sold    8791 non-null   object
 3   price_in_US  8791 non-null   object
 4   shipping_    8791 non-null   object
 5   Text         8791 non-null   object
dtypes: object(6)
memory usage: 412.2+ KB


None

buck1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        9999 non-null   object
 1   url          3659 non-null   object
 2   date_sold    9999 non-null   object
 3   price_in_US  9999 non-null   object
 4   shipping_    9999 non-null   object
 5   Text         9999 non-null   object
dtypes: object(6)
memory usage: 468.8+ KB


None

buck2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8918 entries, 0 to 8917
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        8918 non-null   object
 1   url          4 non-null      object
 2   date_sold    8918 non-null   object
 3   price_in_US  8918 non-null   object
 4   shipping_    8918 non-null   object
 5   Text         8918 non-null   object
dtypes: object(6)
memory usage: 418.2+ KB


None

case
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        9999 non-null   object
 1   url          2198 non-null   object
 2   date_sold    9999 non-null   object
 3   price_in_US  9999 non-null   object
 4   shipping_    9999 non-null   object
 5   Text         9999 non-null   object
dtypes: object(6)
memory usage: 468.8+ KB


None

caseXX1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        9999 non-null   object
 1   url          3309 non-null   object
 2   date_sold    9999 non-null   object
 3   price_in_US  9999 non-null   object
 4   shipping_    9999 non-null   object
 5   Text         9999 non-null   object
dtypes: object(6)
memory usage: 468.8+ KB


None

caseXX2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8494 entries, 0 to 8493
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        8494 non-null   object
 1   url          10 non-null     object
 2   date_sold    8494 non-null   object
 3   price_in_US  8494 non-null   object
 4   shipping_    8494 non-null   object
 5   Text         8494 non-null   object
dtypes: object(6)
memory usage: 398.3+ KB


None

crkt
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6742 entries, 0 to 6741
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        6742 non-null   object
 1   url          1272 non-null   object
 2   date_sold    6742 non-null   object
 3   price_in_US  6742 non-null   object
 4   shipping_    6742 non-null   object
 5   Text         6742 non-null   object
dtypes: object(6)
memory usage: 316.2+ KB


None

kershaw1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        10000 non-null  object
 1   url          3370 non-null   object
 2   date_sold    10000 non-null  object
 3   price_in_US  10000 non-null  object
 4   shipping_    10000 non-null  object
 5   Text         10000 non-null  object
dtypes: object(6)
memory usage: 468.9+ KB


None

kershaw2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9447 entries, 0 to 9446
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        9447 non-null   object
 1   url          11 non-null     object
 2   date_sold    9447 non-null   object
 3   price_in_US  9447 non-null   object
 4   shipping_    9447 non-null   object
 5   Text         9447 non-null   object
dtypes: object(6)
memory usage: 443.0+ KB


None

sog
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4858 entries, 0 to 4857
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        4858 non-null   object
 1   url          900 non-null    object
 2   date_sold    4858 non-null   object
 3   price_in_US  4858 non-null   object
 4   shipping_    4858 non-null   object
 5   Text         4858 non-null   object
dtypes: object(6)
memory usage: 227.8+ KB


None

spyderco
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9206 entries, 0 to 9205
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        9206 non-null   object
 1   url          1936 non-null   object
 2   date_sold    9206 non-null   object
 3   price_in_US  9206 non-null   object
 4   shipping_    9206 non-null   object
 5   Text         9206 non-null   object
dtypes: object(6)
memory usage: 431.7+ KB


None

vict1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7799 entries, 0 to 7798
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        7799 non-null   object
 1   url          6115 non-null   object
 2   date_sold    7799 non-null   object
 3   price_in_US  7799 non-null   object
 4   shipping_    7799 non-null   object
 5   Text         7799 non-null   object
dtypes: object(6)
memory usage: 365.7+ KB


None

vict2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7068 entries, 0 to 7067
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Image        7068 non-null   object
 1   url          20 non-null     object
 2   date_sold    7068 non-null   object
 3   price_in_US  7068 non-null   object
 4   shipping_    7068 non-null   object
 5   Text         7068 non-null   object
dtypes: object(6)
memory usage: 331.4+ KB


None

In [53]:
# NOTE(review): none of the frames loaded above has a 'title' column (the
# scraped title is stored under 'Text'), so this drop raises KeyError on the
# first frame and the loop stops there, as the recorded output shows.
# Presumably a leftover from an earlier export format -- confirm before re-running.
for val in df_dict.values():
    val.drop('title', axis=1, inplace=True)

KeyError: "['title'] not found in axis"

In [54]:
# Give the scraped columns their final names and parse the sale dates.
column_renames = {'Text': 'title', 'shipping_': 'shipping_cost'}
for frame in df_dict.values():
    frame.rename(columns=column_renames, inplace=True)
    frame['date_sold'] = pd.to_datetime(frame['date_sold'])

# Brands scraped in two passes are stacked into a single frame each.
# The original row labels of both halves are kept (no ignore_index).
sold_buck = pd.concat([sold_buck1, sold_buck2])
sold_caseXX = pd.concat([sold_caseXX1, sold_caseXX2])
sold_kershaw = pd.concat([sold_kershaw1, sold_kershaw2])
sold_vict = pd.concat([sold_vict1, sold_vict2])

In [55]:
# Show the column index of every frame to confirm the renames took effect.
for label, frame in df_dict.items():
    print(label)
    display(frame.columns)

benchmade


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

buck1


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

buck2


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

case


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

caseXX1


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

caseXX2


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

crkt


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

kershaw1


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

kershaw2


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

sog


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

spyderco


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

vict1


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

vict2


Index(['Image', 'url', 'date_sold', 'price_in_US', 'shipping_cost', 'title'], dtype='object')

In [56]:
# sold_buck.drop_duplicates(
#     subset = ['date_sold', 'price_in_US', 'shipping_cost'],
#     keep = 'last', inplace=True)

# sold_caseXX.drop_duplicates(
#     subset = ['date_sold', 'price_in_US', 'shipping_cost'],
#     keep = 'last', inplace=True)

# sold_kershaw.drop_duplicates(
#     subset = ['date_sold', 'price_in_US', 'shipping_cost'],
#     keep = 'last', inplace=True)

In [57]:
# Run each brand's sold-listing frame through prepare_tera_df (defined outside
# this section) and rebind the name to the result.
# The second argument is an integer brand code. Judging by the brand counts
# printed after concatenation it maps 0..7 to benchmade, buck, case, crkt,
# kershaw, sog, spyderco, victorinox -- TODO confirm against the helper.
sold_bench = prepare_tera_df(sold_bench, 0)
sold_buck = prepare_tera_df(sold_buck, 1)
sold_case = prepare_tera_df(sold_case, 2)
sold_caseXX = prepare_tera_df(sold_caseXX, 2)  # same brand code as sold_case
sold_crkt = prepare_tera_df(sold_crkt, 3)
sold_kershaw = prepare_tera_df(sold_kershaw, 4)
sold_sog = prepare_tera_df(sold_sog, 5)
sold_spyd = prepare_tera_df(sold_spyd, 6)
sold_vict = prepare_tera_df(sold_vict, 7)

In [58]:
# Lower-case and trim listing titles.
#
# df_dict only references the raw per-file frames (sold_buck1, sold_buck2, ...).
# sold_buck, sold_caseXX, sold_kershaw and sold_vict were built with pd.concat,
# which returns new objects, so normalising the df_dict entries alone never
# reaches the frames that are concatenated into sold_df. The prepared frames
# are therefore normalised explicitly as well; the df_dict pass is kept so the
# raw frames end up in the same state as before. The operation is idempotent,
# so a frame reachable through both collections is unaffected by the repeat.
prepared_frames = (sold_bench, sold_buck, sold_case, sold_caseXX,
                   sold_crkt, sold_kershaw, sold_sog, sold_spyd,
                   sold_vict)

for dataframe in [*df_dict.values(), *prepared_frames]:
    dataframe['title'] = dataframe['title'].str.lower().str.strip()

In [59]:
# Stack every prepared brand frame into one table of sold listings.
prepared_brand_frames = [
    sold_bench,
    sold_buck,
    sold_case,
    sold_caseXX,
    sold_crkt,
    sold_kershaw,
    sold_sog,
    sold_spyd,
    sold_vict,
]
sold_df = pd.concat(prepared_brand_frames)

# Listings per brand in the combined table.
sold_df.brand.value_counts()

case          28492
kershaw       19447
buck          18917
victorinox    14867
spyderco       9206
benchmade      8791
crkt           6742
sog            4858
Name: brand, dtype: int64

In [60]:
# Write the combined sold listings (before data_cleaner is applied) to CSV,
# omitting the index.
sold_df.to_csv("terapeak_data/terapeak_df.csv", index=False)

In [61]:
# Clean the combined sold listings with data_cleaner (defined outside this
# section), take an independent copy, and renumber the rows from zero.
sold_knives = data_cleaner(sold_df).copy().reset_index(drop=True)

In [62]:
# Listings per brand that remain after cleaning.
sold_knives.brand.value_counts()

case          18918
kershaw       12957
buck          12534
victorinox     9437
spyderco       6046
benchmade      5712
crkt           4276
sog            3006
Name: brand, dtype: int64

In [63]:
# Column dtypes and non-null counts of the cleaned frame.
sold_knives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72886 entries, 0 to 72885
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Image            72886 non-null  object        
 1   url              15070 non-null  object        
 2   date_sold        72886 non-null  datetime64[ns]
 3   price_in_US      72886 non-null  float64       
 4   shipping_cost    72886 non-null  float64       
 5   title            72886 non-null  object        
 6   converted_price  72886 non-null  float64       
 7   profit           72886 non-null  float64       
 8   ROI              72886 non-null  float64       
 9   brand            72886 non-null  object        
 10  cost             72886 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(4)
memory usage: 6.1+ MB


In [65]:
# Per-brand exports are disabled below; only the cleaned, combined frame is
# written to disk (without the index).
# sold_bench.to_csv("terapeak_data/tera_bench_prepared.csv", index=False)
# sold_buck.to_csv("terapeak_data/tera_buck_prepared.csv", index=False)
# sold_case.to_csv("terapeak_data/tera_case_prepared.csv", index=False)
# sold_caseXX.to_csv("terapeak_data/tera_caseXX_prepared.csv", index=False)
# sold_crkt.to_csv("terapeak_data/tera_crkt_prepared.csv", index=False)
# sold_kershaw.to_csv("terapeak_data/tera_kershaw_prepared.csv", index=False)
# sold_sog.to_csv("terapeak_data/tera_sog_prepared.csv", index=False)
# sold_spyd.to_csv("terapeak_data/tera_spyd_prepared.csv", index=False)
sold_knives.to_csv("terapeak_data/sold_df.csv", index=False)

The block of code below merged all available Terapeak eBay itemIds (stored under `teraform_data/`) with the appropriate data. This was done in order to call the eBay Shopping API, which only accepts itemIds as input. However, much of the data is older than 90 days and can no longer be accessed through the eBay Shopping API. Therefore, the Terapeak data will unfortunately lack additional item-specific data.  

```
# Item-ID exports, one CSV per brand.
teradf_benchIDs = pd.read_csv("teraform_data/tera_benchmade_itemID.csv")
teradf_buckIDs = pd.read_csv("teraform_data/tera_buck_ItemIDs.csv")
teradf_caseIDs = pd.read_csv("teraform_data/tera_case_itemIDs.csv")
teradf_kershawIDs = pd.read_csv("teraform_data/tera_kershaw_ItemIDs.csv")
teradf_sogIDs = pd.read_csv("teraform_data/tera_sog_ItemIDs.csv")
teradf_spydIDs = pd.read_csv("teraform_data/tera_spyderco_ItemIDs.csv")

dfID_list = [
    teradf_benchIDs,
    teradf_buckIDs,
    teradf_caseIDs,
    teradf_kershawIDs,
    teradf_sogIDs,
    teradf_spydIDs,
]

# Scraper column names -> names used in the rest of the notebook.
id_column_names = {'Field4': 'date_sold',
                   'Data_field': 'itemID',
                   'Title': 'title'}

for id_frame in dfID_list:
    id_frame.rename(columns=id_column_names, inplace=True)

# The kershaw export stores the title under 'item' instead.
teradf_kershawIDs.rename(columns={'item': 'title'}, inplace=True)

# Drop incomplete rows, then store the item IDs as Python ints.
for id_frame in dfID_list:
    id_frame.dropna(inplace=True)
    id_frame['itemID'] = id_frame['itemID'].apply(int)

# Plain lists of item IDs per brand.
tera_benchIds = teradf_benchIDs['itemID'].values.tolist()
tera_buckIds = teradf_buckIDs['itemID'].values.tolist()
tera_caseIds = teradf_caseIDs['itemID'].values.tolist()
tera_kershawIds = teradf_kershawIDs['itemID'].values.tolist()
tera_sogIds = teradf_sogIDs['itemID'].values.tolist()
tera_spydIds = teradf_spydIDs['itemID'].values.tolist()

# Attach the item IDs to the per-brand sold data. teradf_bench, teradf_buck,
# etc. are not defined in this snippet. Only the benchmade merge names its
# key; the others join on whatever columns the two frames share.
idMerge_bench = teradf_bench.merge(teradf_benchIDs, on='Image')
idMerge_buck = teradf_buck.merge(teradf_buckIDs)
idMerge_case = teradf_case.merge(teradf_caseIDs)
idMerge_kershaw = teradf_kershaw.merge(teradf_kershawIDs)
idMerge_spyd = teradf_spyd.merge(teradf_spydIDs)
idMerge_sog = teradf_sog.merge(teradf_sogIDs)

# idMerge_bench.to_csv('teraform_data/tera_bench_idMerge.csv', index=False)
# idMerge_buck.to_csv('teraform_data/tera_buck_idMerge.csv', index=False)
# idMerge_case.to_csv('teraform_data/tera_case_idMerge.csv', index=False)
# idMerge_kershaw.to_csv('teraform_data/tera_kershaw_idMerge.csv', index=False)
# idMerge_spyd.to_csv('teraform_data/tera_spyd_idMerge.csv', index=False)
# idMerge_sog.to_csv('teraform_data/tera_sog_idMerge.csv', index=False)
```

```
# Create a column holding each knife's listed price converted to US dollars.
# The builtin float is used because the np.float alias was removed in
# NumPy 1.24; it was only ever an alias for the builtin.
price_list = [
    float(row['sellingStatus']['convertedCurrentPrice']['value'])
    for row in full_dataset
]

df['price_in_US'] = price_list
```

```
# Attempt to pull the shipping cost straight from the nested JSON dict.
# The builtin float replaces np.float, which was removed in NumPy 1.24.
# NOTE(review): presumably raises KeyError for listings that carry no
# 'shippingServiceCost' entry -- the prepare_df helper at the top of the
# notebook wraps the same lookup in try/except for that reason. Confirm.
shipping_cost_list = [
    float(row['shippingInfo']['shippingServiceCost']['value'])
    for row in full_dataset
]

df['shipping_price'] = shipping_cost_list
```

```
# Pull the shipping cost out of the JSON dict with a regex.
# The pattern is a raw string so that \d and \S reach the regex engine as
# written, instead of relying on Python passing unknown string escapes through
# (which emits a warning on current interpreters).
shipping_pattern = re.compile(r"(\d+\S+\d)")
df['shipping_cost'] = df['shippingInfo'].apply(
    lambda x: shipping_pattern.findall(json.dumps(x)))
# All matches found in one listing are concatenated into a single string.
df['shipping_cost'] = df['shipping_cost'].apply(lambda x: ''.join(x))
# Listings with no numeric match have no usable shipping cost and are dropped.
df.drop(df[df['shipping_cost'] == ''].index, inplace=True)
# Builtin float replaces np.float, which was removed in NumPy 1.24.
df['shipping_cost'] = df['shipping_cost'].apply(float)

# Create new feature 'converted_price': item price plus shipping.
df['converted_price'] = df['shipping_cost'] + df['price_in_US']
df = df.drop_duplicates(subset=['title', 'galleryURL'], keep='first')
display(df.head())
display(df.info())

df.to_csv('data/full_dataset.csv', index=False)
```