In [1]:
from ebaysdk.finding import Connection
import requests
from ebaysdk.shopping import Connection as Shopping
import pandas as pd 
import  json
import numpy as np
import re
# import preprocess_ddey117 as pp
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
import ast

import seaborn as sns 

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
def prepare_df(df):
    price_list = []
    ship_price_list = []
    condition_list = []
    condition = None
    for row in full_dataset:
        listed_price = float(row['sellingStatus']['convertedCurrentPrice']['value'])
        price_list.append(listed_price)
     
        try:
            listed_ship_price = float(row['shippingInfo']['shippingServiceCost']['value'])
            ship_price_list.append(listed_ship_price)
        except: 
            listed_ship_price = 0
            ship_price_list.append(listed_ship_price)

        try:
            condition = float(row['condition']['conditionId'])
            condition_list.append(condition)
        except: 
            conditon = 0
            condition_list.append(condition)

    df['shipping_cost'] = ship_price_list
    df['price_in_US'] = price_list
    df['condition'] = condition_list
    
    #create new feature 'converted price'
    df['converted_price'] = df['shipping_cost'] + df['price_in_US']
    df.drop_duplicates(subset=['itemId'],  keep='first', inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    return df


bucket_dict = {'benchmade': 45.0,
               'buck': 20.0,
               'case': 20.0,
               'crkt': 15.0,
               'kershaw': 15.0,
               'leatherman': 30.0,
               'sog': 15.0,
               'spyderco': 30.0,
               'victorinox': 20.0
              }


def prepare_brands(df, bucket_dict_position, overhead_cost=3):

    df.title = df.title.apply(str.lower)
 
    #remove special characters
#     df.title.apply(pp.remove_special_chars)
    df['brand'] = str(list(bucket_dict.keys())[bucket_dict_position])
    df['cost'] = float(list(bucket_dict.values())[bucket_dict_position])
    df['profit'] = (df['converted_price'] -  df['cost'] - overhead_cost)
    df['ROI'] = (df['profit']/( df['cost'] + overhead_cost))*100.0
    
    return df

def prepare_data(data_list):
    """
    This function takes in a list of dictionaries and prepares it
    for analysis
    """
    
    # Make a new list to hold results
    results = []
    
    for business_data in data_list:
    
        # Make a new dictionary to hold prepared data for this business
        prepared_data = {}
        
        # Extract name, review_count, rating, and price key-value pairs
        # from business_data and add to prepared_data
        # If a key is not present in business_data, add it to prepared_data
        # with an associated value of None
        
        keys = ['itemId', 'title', 'galleryURL', 
                'viewItemURL', 'autoPay', 'postalCode', 
                'sellingStatus', 'shippingInfo', 'listingInfo',
                'returnsAccepted', 'condition', 'topRatedListing',
                'galleryPlusPictureURL','pictureURLLarge', 
                'pictureURLSuperSize']
        
        for key in keys:
            prepared_data[key] = business_data.get(key, None)
            results.append(prepared_data)
    
       
        # Add to list if all values are present
#         if all(prepared_data.values()):
#             results.append(prepared_data)
    
    
    return results

def knife_request(Brand, dict_pos):
    api = Connection(config_file='ebay.yaml', debug=False, siteid="EBAY-US")

    request = {
                'categoryId': 48818,
                'itemFilter': [
                                {'name': 'ListingType', 'value': 'FixedPrice'}
                              ],
                'aspectFilter': [
                                  {'aspectName': 'Brand', 'aspectValueName': Brand}],

                'outputSelector': ['PictureURLLarge', 'PictureURLSuperSize'],


                'paginationInput': {
                                    'entriesPerPage': 100,
                                    'pageNumber': 1

                                    },

                }

    #     request['paginationInput']['pageNumber'] = page

    response = api.execute('findItemsAdvanced', request)


    response_pages = response.dict()

    full_dataset = []
    
    total_pages = int(response_pages['paginationOutput']['totalPages'])

    if total_pages > 100:
        pages_to_request = 100
        
    else:
        pages_to_request = total_pages - 1
        
        

    for page in range(1, pages_to_request):
        # Add or update the "offset" key-value pair in url_params

        # Make the query and get the response

        api = Connection(config_file='ebay.yaml', debug=False, siteid="EBAY-US")

        request = {
                'categoryId': 48818,
                'itemFilter': [
                                {'name': 'ListingType', 'value': 'FixedPrice'}
                              ],
                'aspectFilter': [
                                  {'aspectName': 'Brand', 'aspectValueName': Brand}],

                'outputSelector': ['PictureURLLarge', 'PictureURLSuperSize'],


                'paginationInput': {
                                    'entriesPerPage': 100,
                                    'pageNumber': page

                                    },

                }


        response = api.execute('findItemsAdvanced', request)

        #save the response as a json dict
        response_dict = response.dict()


        #index dict to appropriate index
        results_list_of_dicts = response_dict['searchResult']['item']

        # Call the prepare_data function to get a list of processed data
        prepared_knives = prepare_data(results_list_of_dicts)

        # Extend full_dataset with this list (don't append, or you'll get
        # a list of lists instead of a flat list)
        full_dataset.extend(prepared_knives)

    # Check the length of the full dataset. It will be up to `total`,
    # potentially less if there were missing values
    display(len(full_dataset))
    
    df = pd.DataFrame(full_dataset)
    
    price_list = []
    ship_price_list = []
    condition_list = []
    condition = None
    for row in full_dataset:
        try:
            listed_price = float(row['sellingStatus']['convertedCurrentPrice']['value'])
            price_list.append(listed_price)
        except:
            listed_price = "Na"
            price_list.append(listed_price)
        try:
            listed_ship_price = float(row['shippingInfo']['shippingServiceCost']['value'])
            ship_price_list.append(listed_ship_price)
        except: 
            listed_ship_price = 0
            ship_price_list.append(listed_ship_price)
        try:
            condition = float(row['condition']['conditionId'])
            condition_list.append(condition)
        except: 
            conditon = 0
            condition_list.append(condition)

    df['shipping_cost'] = ship_price_list
    df['price_in_US'] = price_list
    df['condition'] = condition_list
    
    #create new feature 'converted price'
    df['converted_price'] = df['shipping_cost'] + df['price_in_US']
    df.drop_duplicates(subset=['itemId'],  keep='first', inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    df = prepare_brands(df, dict_pos)
    
    return df

def prepare_dataIds(data_list):
    """
    This function takes in a list of dictionaries and prepares it
    for analysis
    """
    
    # Make a new list to hold results
    results = []
    
    for business_data in data_list:
    
        # Make a new dictionary to hold prepared data for this business
        prepared_data = {}
        
        # Extract name, review_count, rating, and price key-value pairs
        # from business_data and add to prepared_data
        # If a key is not present in business_data, add it to prepared_data
        # with an associated value of None
        
        keys = ['ItemID','GalleryURL','PictureURL',
                'Location','ConvertedCurrentPrice',
                'Title','ItemSpecifics', 
                'Country','ConditionID']
        
        for key in keys:
            prepared_data[key] = business_data.get(key, None)
            results.append(prepared_data)
    
       
        # Add to list if all values are present
#         if all(prepared_data.values()):
#             results.append(prepared_data)
    
    
    return results

def process_list(my_list):
 
    api = Shopping(config_file='ebay.yaml', debug=False, siteid="EBAY-US")
    request = {
               'itemID': my_list,
               'IncludeSelector': 'ItemSpecifics'
              }
    response = api.execute('GetMultipleItems', request)

    

    #save the response as a json dict
    response_dict = response.dict()



    #index dict to appropriate index
    results_list_of_dicts = response_dict['Item']

    # Call the prepare_data function to get a list of processed data
    prepared_knives = prepare_dataIds(results_list_of_dicts)

    # Extend full_dataset with this list (don't append, or you'll get
    # a list of lists instead of a flat list)
    full_dataset.extend(prepared_knives)
    
    return full_dataset

bucket_dict = {'benchmade': 45.0,
               'buck': 20.0,
               'case': 20.0,
               'crkt': 15.0,
               'kershaw': 15.0,
               'leatherman': 30.0, 
               'sog': 15.0,
               'spyderco': 30.0,
               'victorinox': 20.0
              }

#x = position of bucket_dictionary
def prepare_tera_df(df, x, overhead_cost=3):
    df['price_in_US'] = df['price_in_US'].str.replace("$", "")
    df['price_in_US'] = df['price_in_US'].str.replace(",", "")
    df['price_in_US'] = df['price_in_US'].apply(float)
    
    df['shipping_cost'] = df['shipping_cost'].str.replace("$", "")
    df['shipping_cost'] = df['shipping_cost'].str.replace(",", "")
    df['shipping_cost'] = df['shipping_cost'].apply(float)
    
    df['converted_price'] = (df['price_in_US'] + df['shipping_cost'])
    
    df['profit'] = (df['converted_price'] - list(bucket_dict.values())[x] - overhead_cost)
    df['ROI'] = (df['profit']/(list(bucket_dict.values())[x]))*100.0
    
    df['brand'] = list(bucket_dict.keys())[x]
    df['cost'] = list(bucket_dict.values())[x]

    
    return df


def fix(col):
    dd = dict()
    for d in col:
        values = list(d.values())
        if len(values) == 2:
            dd[values[0]] = values[1]
    return dd


def transform_item_specifics(df, perc=90.0):

    df.dropna(subset=['ItemSpecifics'], inplace=True)
    df['ItemSpecifics'] = df['ItemSpecifics'].apply(lambda x: ast.literal_eval(x))
    df['item_list'] = df['ItemSpecifics'].apply(lambda x: x['NameValueList'])

    df['ItemSpecifics'] = df['ItemSpecifics'].apply(lambda x: [x['NameValueList']] if isinstance(x['NameValueList'], dict) else x['NameValueList'])

    df['ItemSpecifics'] = df['ItemSpecifics'].apply(fix)

    df = pd.json_normalize(df['ItemSpecifics'])

    min_count =  int(((100-perc)/100)*df.shape[0] + 1)
    mod_df = df.dropna(axis=1, 
                       thresh=min_count)

    return mod_df


def data_cleaner(df):
    lot = re.compile('(?<!-\S)lot(?![^\s.,:?!])')
    disp = re.compile('(display)')
    box = re.compile('(box)')
    group = re.compile('(group)')
    is_set = re.compile('(?<!-\S)set(?![^\s.,?!])')
    df['title'] = df['title'].str.lower()
    trim_list = [lot,disp,box,group,is_set]
    for item in trim_list:
        df.loc[df['title'].apply(lambda x: re.search(item, x)).notnull(), 'trim'] = 1 
    to_drop = df.loc[df['trim'] == 1].index
    df.drop(to_drop, inplace=True)
    df.drop('trim', axis=1, inplace=True)
    
    return df

In [4]:
# bench_df = knife_request('Benchmade', 0)
# buck_df = knife_request('Buck', 1)
# case_df = knife_request('Case', 2)
# df_caseXX = knife_request('Case XX', 2)
# df_crkt = knife_request("CRKT", 3)
# # df_leatherman = knife_request('Leatherman', 5)
# df_sog = knife_request('SOG', 6)
# df_spyderco = knife_request('Spyderco', 7)

Beginning of API calls for listed data. To be merged with item specific data using ebay itemIds.

### Domain Understading: Cost Breakdown
- padded envelopes: \$0.50 per knife
- flatrate shipping: \$4.45 per knife
- brand knife at surplus store: 15, 20, 30, or 45 dollars per knife
- overhead expenses (gas, cleaning suplies, sharpening supplies, etc): $7

Running functions to call the Finding API and return datasets for cat () knives for sale listed on ebay in the last 90 days. (explain how ebay rules work)

```
bench_df = knife_request('Benchmade', 0)
buck_df = knife_request('Buck', 1)
case_df = knife_request('Case', 2)
df_caseXX = knife_request('Case XX', 2)
df_crkt = knife_request("CRKT", 3)
df_leatherman = knife_request('Leatherman', 5)
df_sog = knife_request('SOG', 6)
df_spyderco = knife_request('Spyderco', 7)


bench_df.to_csv('data/df_bench1.csv', index=False)
buck_df.to_csv('data/df_buck.csv', index=False)
case_df.to_csv('data/df_case.csv', index=False)
df_caseXX.to_csv('data/df_CaseXX.csv', index=False)
df_crkt.to_csv('data/df_crkt.csv', index=False)
df_leatherman.to_csv('data/df_leatherman.csv', index=False)
df_sog.to_csv('data/df_sog.csv', index=False)
df_spyderco.to_csv('data/df_spyderco.csv', index=False)
```

Kershaw and victorinox data was requested using the FindingAPI below after tweaking some pagination through trial and error to maximize data.

```
full_dataset = []
for page in range(1, 57):
#         # Add or update the "offset" key-value pair in url_params

#         # Make the query and get the response

    api = Connection(config_file='ebay.yaml', debug=False, siteid="EBAY-US")

    request = {
                'categoryId': 48818,
                'itemFilter': [
                                {'name': 'ListingType', 'value': 'FixedPrice'}
                              ],
                'aspectFilter': [
                                  {'aspectName': 'Brand', 'aspectValueName': 'Kershaw'}],

                'outputSelector': ['PictureURLLarge', 'PictureURLSuperSize'],


                'paginationInput': {
                                    'entriesPerPage': 100,
                                    'pageNumber': page

                                    },

                }

        #     request['paginationInput']['pageNumber'] = page

    response = api.execute('findItemsAdvanced', request)

    #save the response as a json dict
    response_dict = response.dict()

    #index dict to appropriate index
    results_list_of_dicts = response_dict['searchResult']['item']

    # Call the prepare_data function to get a list of processed data
    prepared_knives = prepare_data(results_list_of_dicts)

    # Extend full_dataset with this list (don't append, or you'll get
    # a list of lists instead of a flat list)
    full_dataset.extend(prepared_knives)

    # Check the length of the full dataset. It will be up to `total`,
    # potentially less if there were missing values

    df = pd.DataFrame(full_dataset)
    
df_kershaw = prepare_df(df)
df_kershaw = prepare_brands(df_kershaw, 4)
df_kershaw.to_csv('data/df_kershaw.csv', index=False)

```


```
full_dataset = []
for page in range(1, 86):

    api = Connection(config_file='ebay.yaml', debug=False, siteid="EBAY-US")

    request = {
                'categoryId': 48818,
                'itemFilter': [
                                {'name': 'ListingType', 'value': 'FixedPrice'}
                              ],
                'aspectFilter': [
                                  {'aspectName': 'Brand', 'aspectValueName': 'Victorinox'}],

                'outputSelector': ['PictureURLLarge', 'PictureURLSuperSize'],


                'paginationInput': {
                                    'entriesPerPage': 100,
                                    'pageNumber': page

                                    },

                }

    response = api.execute('findItemsAdvanced', request)

    response_dict = response.dict()

    results_list_of_dicts = response_dict['searchResult']['item']

    prepared_knives = prepare_data(results_list_of_dicts)

    full_dataset.extend(prepared_knives)
    
df_victorinox = pd.DataFrame(full_dataset)
df_victorinox = prepare_df(df_victorinox)
df_victorinox = prepare_brands(df_victorinox, 8)
df_victorinox.to_csv('data/df_victorinox.csv', index=False)
```

end of API call for listed data. 

start of API call section using IDs from preview listed datasets to get Item Specific data from ebay. This will return more descriptive information about the knives, pulling from a container on the website that sellers must complete to post a listing. 

In [5]:
df_bench = pd.read_csv("listed_data/df_bench.csv")
df_buck = pd.read_csv("listed_data/df_buck.csv")
df_case = pd.read_csv("listed_data/df_case.csv")
df_caseXX = pd.read_csv("listed_data/df_CaseXX.csv")
df_crkt = pd.read_csv("listed_data/df_crkt.csv")
df_kershaw = pd.read_csv("listed_data/df_kershaw.csv")
# df_leatherman = pd.read_csv("listed_data/df_leatherman.csv")
df_sog = pd.read_csv("listed_data/df_sog.csv")
df_spyderco = pd.read_csv("listed_data/df_spyderco.csv")
# df_victorinox = pd.read_csv("listed_data/df_victorinox.csv")

In [6]:
benchIds = df_bench.itemId.values.tolist()
buckIds = df_buck.itemId.values.tolist()
caseIds = df_case.itemId.values.tolist()
caseXXIds = df_caseXX.itemId.values.tolist()
crktIds = df_crkt.itemId.values.tolist()
kershawIds = df_kershaw.itemId.values.tolist()
# leathIds = df_leatherman.itemId.values.tolist()
sogIds = df_sog.itemId.values.tolist()
spydIds = df_spyderco.itemId.values.tolist()
# victIds = df_victorinox.itemId.values.tolist()

In [8]:
#ShoppingAPI call to return benchmade item specific data.

full_dataset = []
for i in range(0, len(benchIds), 20):
    process_list(benchIds[i:i+20])

bench = pd.DataFrame(full_dataset)
bench.drop_duplicates(subset=['ItemID'], inplace=True)
bench.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1658 entries, 0 to 14913
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ItemID                 1658 non-null   object
 1   GalleryURL             1643 non-null   object
 2   PictureURL             1658 non-null   object
 3   Location               1658 non-null   object
 4   ConvertedCurrentPrice  1658 non-null   object
 5   Title                  1658 non-null   object
 6   ItemSpecifics          1647 non-null   object
 7   Country                1658 non-null   object
 8   ConditionID            1593 non-null   object
dtypes: object(9)
memory usage: 129.5+ KB


In [9]:
# ShoppingAPI call to return buck item specific data.
full_dataset = []
for i in range(0, len(buckIds), 20):
    process_list(buckIds[i:i+20])

buck = pd.DataFrame(full_dataset)
buck.drop_duplicates(subset=['ItemID'], inplace=True)
buck.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2707 entries, 0 to 24354
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ItemID                 2707 non-null   object
 1   GalleryURL             2689 non-null   object
 2   PictureURL             2707 non-null   object
 3   Location               2707 non-null   object
 4   ConvertedCurrentPrice  2707 non-null   object
 5   Title                  2707 non-null   object
 6   ItemSpecifics          2669 non-null   object
 7   Country                2707 non-null   object
 8   ConditionID            2385 non-null   object
dtypes: object(9)
memory usage: 211.5+ KB


In [10]:
# ShoppingAPI call to return case brand item specific data.

full_dataset = []
for i in range(0, len(caseIds), 20):
    process_list(caseIds[i:i+20])

df_case = pd.DataFrame(full_dataset)
df_case.drop_duplicates(subset=['ItemID'], inplace=True)
df_case.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9040 entries, 0 to 81351
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ItemID                 9040 non-null   object
 1   GalleryURL             8933 non-null   object
 2   PictureURL             9038 non-null   object
 3   Location               9040 non-null   object
 4   ConvertedCurrentPrice  9040 non-null   object
 5   Title                  9040 non-null   object
 6   ItemSpecifics          8885 non-null   object
 7   Country                9040 non-null   object
 8   ConditionID            7643 non-null   object
dtypes: object(9)
memory usage: 706.2+ KB


In [11]:
# ShoppingAPI call to return caseXX brand item specific data.

full_dataset = []
for i in range(0, len(caseXXIds), 20):
    process_list(caseXXIds[i:i+20])

df_caseXX = pd.DataFrame(full_dataset)
df_caseXX.drop_duplicates(subset=['ItemID'], inplace=True)
df_caseXX.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7406 entries, 0 to 66645
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ItemID                 7406 non-null   object
 1   GalleryURL             7400 non-null   object
 2   PictureURL             7406 non-null   object
 3   Location               7406 non-null   object
 4   ConvertedCurrentPrice  7406 non-null   object
 5   Title                  7406 non-null   object
 6   ItemSpecifics          7260 non-null   object
 7   Country                7406 non-null   object
 8   ConditionID            6879 non-null   object
dtypes: object(9)
memory usage: 578.6+ KB


In [12]:
# ShoppingAPI call to return crkt item specific data.

full_dataset = []
for i in range(0, len(crktIds), 20):
    process_list(crktIds[i:i+20])

crkt = pd.DataFrame(full_dataset)
crkt.drop_duplicates(subset=['ItemID'], inplace=True)
crkt.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1423 entries, 0 to 12798
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ItemID                 1423 non-null   object
 1   GalleryURL             1410 non-null   object
 2   PictureURL             1423 non-null   object
 3   Location               1423 non-null   object
 4   ConvertedCurrentPrice  1423 non-null   object
 5   Title                  1423 non-null   object
 6   ItemSpecifics          1394 non-null   object
 7   Country                1423 non-null   object
 8   ConditionID            1367 non-null   object
dtypes: object(9)
memory usage: 111.2+ KB


In [16]:
# ShoppingAPI call to return kershaw item specific data.

full_dataset = []
for i in range(0, len(kershawIds), 20):
    process_list(kershawIds[i:i+20])

kershaw = pd.DataFrame(full_dataset)
kershaw.drop_duplicates(subset=['ItemID'], inplace=True)
kershaw.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5295 entries, 0 to 47646
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ItemID                 5295 non-null   object
 1   GalleryURL             5225 non-null   object
 2   PictureURL             5295 non-null   object
 3   Location               5295 non-null   object
 4   ConvertedCurrentPrice  5295 non-null   object
 5   Title                  5295 non-null   object
 6   ItemSpecifics          5247 non-null   object
 7   Country                5295 non-null   object
 8   ConditionID            5145 non-null   object
dtypes: object(9)
memory usage: 413.7+ KB


ShoppingAPI call to return leatherman item specific data.
```
full_dataset = []
for i in range(0, len(leathIds), 20):
    process_list(leathIds[i:i+20])

leath = pd.DataFrame(full_dataset)
leath.drop_duplicates(subset=['ItemID'], inplace=True)
leath.info()
```

In [17]:
# ShoppingAPI call to return SOG item specific data.

full_dataset = []
for i in range(0, len(sogIds), 20):
    process_list(sogIds[i:i+20])

sog = pd.DataFrame(full_dataset)
sog.drop_duplicates(subset=['ItemID'], inplace=True)
sog.info()


KeyError: 'access_token'

In [18]:
sog = pd.DataFrame(full_dataset)
sog.drop_duplicates(subset=['ItemID'], inplace=True)
sog.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380 entries, 0 to 3411
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ItemID                 380 non-null    object
 1   GalleryURL             375 non-null    object
 2   PictureURL             380 non-null    object
 3   Location               380 non-null    object
 4   ConvertedCurrentPrice  380 non-null    object
 5   Title                  380 non-null    object
 6   ItemSpecifics          374 non-null    object
 7   Country                380 non-null    object
 8   ConditionID            372 non-null    object
dtypes: object(9)
memory usage: 29.7+ KB


In [20]:
# ShoppingAPI call to return spyderco item specific data.

# full_dataset = []
# for i in range(0, len(spydIds), 20):
#     process_list(spydIds[i:i+20])

KeyError: 'access_token'

ShoppingAPI call to return victorinox item specific data.
```
full_dataset = []
for i in range(0, len(victIds), 20):
    process_list(victIds[i:i+20])
    
vict = pd.DataFrame(full_dataset)
vict.drop_duplicates(subset=['ItemID'], inplace=True)
vict.info()
```

```
bench.to_csv("listed_data/benchIds.csv", index=False)
buck.to_csv("listed_data/buckIds.csv", index=False)
df_case.to_csv("listed_data/caseIds.csv", index=False)
df_caseXX.to_csv("listed_data/caseXXIds.csv", index=False)
crkt.to_csv("listed_data/crktIds.csv", index=False)
kershaw.to_csv("listed_data/kershawIds.csv", index=False)
leath.to_csv("listed_data/leathIds.csv", index=False)
sog.to_csv("listed_data/sogIds.csv", index=False)
spyd.to_csv("listed_data/spydIds.csv", index=False)
vict.to_csv("listed_data/victIds.csv", index=False)
```

Beginning of prep to merge original listed data with item specific data requested using a seperate API for more complete details about all listings gathered.

In [None]:
bench = pd.read_csv("listed_data/benchIds.csv")
buck = pd.read_csv("listed_data/buckIds.csv")
case = pd.read_csv("listed_data/caseIds.csv")
caseXX = pd.read_csv("listed_data/caseXXIds.csv")
crkt = pd.read_csv("listed_data/crktIds.csv")
kershaw = pd.read_csv("listed_data/kershawIds.csv")
# leath = pd.read_csv("listed_data/leathIds.csv")
sog = pd.read_csv("listed_data/sogIds.csv")
spyd = pd.read_csv("listed_data/spydIds.csv")
# vict = pd.read_csv("listed_data/victIds.csv")

In [None]:
df_list = [bench,buck,
           case,caseXX,
           crkt,kershaw,
           leath,sog,
           spyd,vict]

for dataframe in df_list:
    dataframe.rename({'Title': 'title',
                      'ItemID': 'itemId'},
                     axis=1,inplace=True)
    
    dataframe.drop(['ConditionID','ConvertedCurrentPrice'], 
                   axis=1, inplace=True)
    dataframe['title'] = dataframe['title'].str.lower()

In [None]:
#merge Item Specific dataframes with original listed data using itemIds and title
bench_merged = df_bench.merge(bench)
buck_merged = df_buck.merge(buck)
case_merged = df_case.merge(case)
caseXX_merged = df_caseXX.merge(caseXX)
crkt_merged = df_crkt.merge(crkt)
kershaw_merged = df_kershaw.merge(kershaw)
# leath_merged = df_leatherman.merge(leath)
spyd_merged = df_spyderco.merge(spyd)
sog_merged = df_sog.merge(sog)
# vict_merged = df_victorinox.merge(vict)

In [None]:
bench_spec = transform_item_specifics(bench_merged, perc=75.0)
buck_spec = transform_item_specifics(buck_merged, perc=75.0)
case_spec = transform_item_specifics(case_merged, perc=75.0)
caseXX_spec = transform_item_specifics(caseXX_merged, perc=75.0)
crkt_spec = transform_item_specifics(crkt_merged, perc=75.0)
kershaw_spec = transform_item_specifics(kershaw_merged, perc=75.0)
# leath_spec = transform_item_specifics(leath_merged, perc=75.0)
sog_spec = transform_item_specifics(sog_merged, perc=75.0)
spyd_spec = transform_item_specifics(spyd_merged, perc=75.0)
# vict_spec = transform_item_specifics(vict_merged, perc=75.0)

In [None]:
specs_list = [bench_spec, buck_spec,
              case_spec, caseXX_spec,
              crkt_spec, kershaw_spec,
              leath_spec, sog_spec,
              spyd_spec, vict_spec]

In [None]:
for dataframe in specs_list:
    dataframe.info()

In [None]:

for dataframe in specs_list:
    dataframe.rename({'Brand': 'specBrand'}, axis=1, inplace=True)

In [None]:
tot_bench = bench_merged.join(bench_spec)
tot_buck = buck_merged.join(buck_spec)
tot_case = case_merged.join(case_spec)
tot_caseXX = caseXX_merged.join(caseXX_spec)
tot_crkt = crkt_merged.join(crkt_spec)
tot_kershaw = kershaw_merged.join(kershaw_spec)
tot_leath = leath_merged.join(leath_spec)
tot_sog = sog_merged.join(sog_spec)
tot_spyd = spyd_merged.join(spyd_spec)
tot_vict = vict_merged.join(vict_spec)

In [None]:
tot_bench.to_csv('listed_data/total_list_bench.csv', index=False)
tot_buck.to_csv('listed_data/total_list_buck.csv', index=False)
tot_case.to_csv('listed_data/total_list_case.csv', index=False)
tot_caseXX.to_csv('listed_data/total_list_caseXX.csv', index=False)
tot_crkt.to_csv('listed_data/total_list_crkt.csv', index=False)
tot_kershaw.to_csv('listed_data/total_list_kershaw.csv', index=False)
tot_leath.to_csv('listed_data/total_list_leath.csv', index=False)
tot_sog.to_csv('listed_data/total_list_sog.csv', index=False)
tot_spyd.to_csv('listed_data/total_list_spyd.csv', index=False)
tot_vict.to_csv('listed_data/total_list_vict.csv', index=False)

In [None]:
# tot_bench = pd.read_csv('listed_data/total_list_bench.csv')
# tot_buck = pd.read_csv('listed_data/total_list_buck.csv')
# tot_case = pd.read_csv('listed_data/total_list_case.csv')
# tot_caseXX = pd.read_csv('listed_data/total_list_caseXX.csv')
# tot_crkt = pd.read_csv('listed_data/total_list_crkt.csv')
# tot_kershaw = pd.read_csv('listed_data/total_list_kershaw.csv')
# tot_leath = pd.read_csv('listed_data/total_list_leath.csv')
# tot_sog = pd.read_csv('listed_data/total_list_sog.csv')
# tot_spyd = pd.read_csv('listed_data/total_list_spyd.csv')
# tot_vict = pd.read_csv('listed_data/total_list_vict.csv')

In [None]:
listed_knives = pd.concat([tot_bench, tot_buck,
                           tot_case, tot_caseXX,
                           tot_crkt, tot_kershaw,
                           tot_sog,tot_spyd])

listed_knives = data_cleaner(listed_knives).copy()
listed_knives.reset_index(drop=True, inplace=True)

In [None]:
listed_knives.drop(['sellingStatus', 'shippingInfo', 
                    'galleryPlusPictureURL', 
                    'GalleryURL', 'ItemSpecifics', 
                    'item_list', 'listingInfo', 
                    'Vintage', 'Modified Item'], 
                    axis=1, inplace=True)

In [None]:
listed_knives.to_csv("listed_data/listed_knives_df.csv", index=False)

In [None]:
teradf_bench = pd.read_csv("teraform_data/bench_scraped.csv")
teradf_buck1 = pd.read_csv("teraform_data/buck_scraped.csv")
teradf_buck2 = pd.read_csv("teraform_data/buck_reversed.csv")
teradf_case = pd.read_csv("teraform_data/case_scraped_Copy.csv")
teradf_caseXX1 = pd.read_csv("teraform_data/caseXX_scraped.csv")
teradf_caseXX2 = pd.read_csv("teraform_data/caseXX_reversed.csv")
teradf_crkt = pd.read_csv("teraform_data/crkt_scraped.csv")
teradf_kershaw = pd.read_csv("teraform_data/Kershaw_scraped.csv")
teradf_sog = pd.read_csv("teraform_data/SOG_scraped.csv")
teradf_spyd = pd.read_csv("teraform_data/spyd_scraped.csv")

In [None]:
for dataframe in [teradf_buck2,teradf_caseXX2]:
    dataframe.drop(['Field4', 'shipping_'], 
                   axis=1, inplace=True)

In [None]:
df_dict = {'benchmade': teradf_bench, 
           'buck1': teradf_buck1,
           'buck2': teradf_buck2,
           'case':teradf_case,
           'caseXX1':teradf_caseXX1,
           'caseXX2':teradf_caseXX2,
           'crkt':teradf_crkt,
           'kershaw':teradf_kershaw,
           'sog':teradf_sog, 
           'spyderco':teradf_spyd}
          

for key,val in df_dict.items():
    print(key)
    display(val.head())

In [None]:
for val in df_dict.values():
    val.rename({'Field4': 'date_sold',
                'shipping_': 'shipping_cost'},
               axis=1, inplace=True)

In [None]:
for key,val in df_dict.items():
    print(key)
    display(val.columns)

In [None]:
for val in df_dict.values():
    val['date_sold'] = pd.to_datetime(val['date_sold'])

In [None]:
teradf_buck = pd.concat([teradf_buck1,teradf_buck2])
teradf_caseXX = pd.concat([teradf_caseXX1,teradf_caseXX2])

In [None]:
teradf_buck.drop_duplicates(
    subset = ['date_sold', 'price_in_US', 'shipping_cost'],
    keep = 'last', inplace=True)

teradf_case.drop_duplicates(
    subset = ['date_sold', 'price_in_US', 'shipping_cost'],
    keep = 'last', inplace=True)

In [None]:
bucket_dict

In [None]:
teradf_bench =  prepare_tera_df(teradf_bench, 0)
teradf_buck =  prepare_tera_df(teradf_buck, 1)
teradf_case =  prepare_tera_df(teradf_case, 2)
teradf_caseXX =  prepare_tera_df(teradf_caseXX, 2)
teradf_crkt =  prepare_tera_df(teradf_crkt, 3)
teradf_kershaw =  prepare_tera_df(teradf_kershaw, 4)
teradf_sog =  prepare_tera_df(teradf_sog, 6)
teradf_spyd =  prepare_tera_df(teradf_spyd, 7)

In [None]:
# teradf_bench = prepare_tera_df(teradf_bench, 0)
# teradf_buck = prepare_tera_df(teradf_buck, 1)
# teradf_case = prepare_tera_df(teradf_case, 2)
# teradf_crkt = prepare_tera_df(teradf_crkt, 3)
# teradf_kershaw = prepare_tera_df(teradf_kershaw, 4)
# teradf_leath = prepare_tera_df(teradf_leath, 5)
# teradf_sog = prepare_tera_df(teradf_sog, 6)
# teradf_spyd = prepare_tera_df(teradf_spyd, 7)
# teradf_vict = prepare_tera_df(teradf_vict, 8)

In [None]:
for dataframe in df_dict.values():
    dataframe['title'] = dataframe['title'].str.lower()
    dataframe['title'] = dataframe['title'].str.strip()

In [None]:
tera_df = pd.concat([teradf_bench, teradf_buck,
                     teradf_case, teradf_caseXX, 
                     teradf_crkt, teradf_kershaw,
                     teradf_sog, teradf_spyd])     

In [None]:
tera_df.info()

In [None]:
tera_df['brand'].value_counts()

Beginning of API calls for listed data. To be merged with item specific data using ebay itemIds.

In [None]:
# teradf_bench.to_csv("teraform_data/tera_bench_prepared.csv", index=False)
# teradf_buck.to_csv("teraform_data/tera_buck_prepared.csv", index=False)
# teradf_case.to_csv("teraform_data/tera_case_prepared.csv", index=False)
# teradf_crkt.to_csv("teraform_data/tera_crkt_prepared.csv", index=False)
# teradf_kershaw.to_csv("teraform_data/tera_kershaw_prepared.csv", index=False)
# teradf_leath.to_csv("teraform_data/tera_leatherman_prepared.csv", index=False)
# teradf_sog.to_csv("teraform_data/tera_sog_prepared.csv", index=False)
# teradf_spyd.to_csv("teraform_data/tera_spyd_prepared.csv", index=False)
# teradf_vict.to_csv("teraform_data/tera_victorinox_prepared.csv", index=False)
# tera_df.to_csv("teraform_data/teraform_df.csv", index=False)

In [None]:
# teradf_bench = pd.read_csv("teraform_data/tera_bench_prepared.csv")
# teradf_buck = pd.read_csv("teraform_data/tera_buck_prepared.csv")
# teradf_case = pd.read_csv("teraform_data/tera_case_prepared.csv")
# teradf_crkt = pd.read_csv("teraform_data/tera_crkt_prepared.csv")
# teradf_kershaw = pd.read_csv("teraform_data/tera_kershaw_prepared.csv")
# teradf_leath = pd.read_csv("teraform_data/tera_leatherman_prepared.csv")
# teradf_sog = pd.read_csv("teraform_data/tera_sog_prepared.csv")
# teradf_spyd = pd.read_csv("teraform_data/tera_spyd_prepared.csv")
# teradf_vict = pd.read_csv("teraform_data/tera_victorinox_prepared.csv")

The below block of code merged all available teraform ebay itemIds with the appropriate data. This was done in order to call the ebay Shopping API that will only accept itemIds as input. However, much of the data is older than 90 days and can no longer be accessed using the ebay Shopping API. Therefore, the teraform data will unfortunatly lack additional item specific data.  

```
teradf_benchIDs = pd.read_csv("teraform_data/tera_benchmade_itemID.csv")
teradf_buckIDs = pd.read_csv("teraform_data/tera_buck_ItemIDs.csv")
teradf_caseIDs = pd.read_csv("teraform_data/tera_case_itemIDs.csv")
teradf_kershawIDs = pd.read_csv("teraform_data/tera_kershaw_ItemIDs.csv")
teradf_sogIDs = pd.read_csv("teraform_data/tera_sog_ItemIDs.csv")
teradf_spydIDs = pd.read_csv("teraform_data/tera_spyderco_ItemIDs.csv")

dfID_list = [teradf_benchIDs,teradf_buckIDs,
             teradf_caseIDs, teradf_kershawIDs,
             teradf_sogIDs, teradf_spydIDs]

for dataframe in dfID_list:
    dataframe.rename({'Field4': 'date_sold',
                      'Data_field': 'itemID',
                      'Title': 'title'}, 
                       axis=1, inplace=True)
    
teradf_kershawIDs.rename({'item': 'title'}, 
                       axis=1, inplace=True)
                       
for dataframe in dfID_list:
    dataframe.dropna(inplace=True)
    
    
for dataframe in dfID_list:
    dataframe.rename({'Field4': 'date_sold',
                      'Data_field': 'itemID',
                      'Title': 'title'}, 
                       axis=1, inplace=True)
    dataframe.dropna(inplace=True)
    dataframe['itemID'] = dataframe['itemID'].apply(int)

teradf_kershawIDs.rename({'item': 'title'}, 
                       axis=1, inplace=True)

tera_benchIds = teradf_benchIDs.itemID.values.tolist()
tera_buckIds = teradf_buckIDs.itemID.values.tolist()
tera_caseIds = teradf_caseIDs.itemID.values.tolist()
tera_kershawIds = teradf_kershawIDs.itemID.values.tolist()
tera_sogIds = teradf_sogIDs.itemID.values.tolist()
tera_spydIds = teradf_spydIDs.itemID.values.tolist()

idMerge_bench = teradf_bench.merge(teradf_benchIDs, on='Image')
idMerge_buck = teradf_buck.merge(teradf_buckIDs)
idMerge_case = teradf_case.merge(teradf_caseIDs)
idMerge_kershaw = teradf_kershaw.merge(teradf_kershawIDs)
idMerge_spyd = teradf_spyd.merge(teradf_spydIDs)
idMerge_sog = teradf_sog.merge(teradf_sogIDs)

# idMerge_bench.to_csv('teraform_data/tera_bench_idMerge.csv', index=False)
# idMerge_buck.to_csv('teraform_data/tera_buck_idMerge.csv', index=False)
# idMerge_case.to_csv('teraform_data/tera_case_idMerge.csv', index=False)
# idMerge_kershaw.to_csv('teraform_data/tera_kershaw_idMerge.csv', index=False)
# idMerge_spyd.to_csv('teraform_data/tera_spyd_idMerge.csv', index=False)
# idMerge_sog.to_csv('teraform_data/tera_sog_idMerge.csv', index=False)
```

In [None]:
# def extract_color(line):
#     pattern = re.compile("Color\s*\S+\S+\s*\S+\S+\s\S(\w+)")
#     if re.findall(pattern,str(line)):

#         match = re.findall(pattern,str(line))[0]

#     else:

#         match = 'NA'
        
#     return match

In [None]:
# df3['color'] = df3.ItemSpecifics.apply(extract_color)

In [None]:
# def extract_blade_type(line):
#     pattern = re.compile("Blade Type\s*\S+\S+\s*\S+\S+\s\S(\w+)")
#     if re.findall(pattern,str(line)):

#         match = re.findall(pattern,str(line))[0]

#     else:

#         match = 'NA'
        
#     return match

In [None]:
# df3['blade_type'] = df3.ItemSpecifics.apply(extract_blade_type)

In [None]:
# df3['blade_type'].value_counts()[:20]

In [None]:
# def extract_manufacture_region(line):
#     pattern = re.compile("Country/Region of Manufacture\s*\S+\S+\s*\S+\S+\s\S(\w+)")
#     if re.findall(pattern,str(line)):
#         match = re.findall(pattern,str(line))[0]
#     else:
#         match = 'NA'
#     return match
        


In [None]:
# df3['region_of_Manufacture'] = df3.ItemSpecifics.apply(extract_manufacture_region)

In [None]:
# df3['region_of_Manufacture'].value_counts()

In [None]:
# def extract_handle_material(line):
#     pattern = re.compile("Handle Material\s*\S+\S+\s*\S+\S+\s\S(\w+)")
#     if re.findall(pattern,str(line)):
#         match = re.findall(pattern,str(line))[0]
#     else:
#         match = 'NA'
#     return match
        


In [None]:
# df3['handle_material'] = df3.ItemSpecifics.apply(extract_handle_material)

In [None]:
# df3['handle_material'].value_counts()[:50]

In [None]:
# def extract_lock_type(line):
#     pattern = re.compile("Lock Type\s*\S+\S+\s*\S+\S+\s\S(\w+)")
#     if re.findall(pattern,str(line)):
#         match = re.findall(pattern,str(line))[0]
#     else:
#         match = 'NA'
#     return match

# df3['lock_type'] = df3.ItemSpecifics.apply(extract_lock_type)

# df3['lock_type'].value_counts()

In [None]:
# def extract_blade_edge(line):
#     pattern = re.compile("Blade Edge\s*\S+\S+\s*\S+\S+\s\S(\w+)")
#     if re.findall(pattern,str(line)):
#         match = re.findall(pattern,str(line))[0]
#     else:
#         match = 'NA'
#     return match
        
# df3['blade_edge'] = df3.ItemSpecifics.apply(extract_blade_edge)

# df3['blade_edge'].value_counts()

In [None]:
# def extract_dexterity(line):
#     pattern = re.compile("Dexterity\s*\S+\S+\s*\S+\S+\s\S(\w+)")
#     if re.findall(pattern,str(line)):
#         match = re.findall(pattern,str(line))[0]
#     else:
#         match = 'NA'
#     return match
        
# df3['dexterity'] = df3.ItemSpecifics.apply(extract_dexterity)

# df3['dexterity'].value_counts()

In [None]:
# df3.to_csv('data/item_specifics_df.csv')

In [None]:
# root='C:/Users/12108/Documents/GitHub/Neural_Network_Predicting_Reseller_Success_Ebay/nn_images2/'

In [None]:
# # import pandas as pd
# import matplotlib.pyplot  as plt
# from PIL import Image
# from pathlib import Path
# # import imagesize
# import numpy as np

# # Get the Image Resolutions
# imgs = [img.name for img in Path(root).iterdir() if img.suffix == ".jpg"]
# img_meta = {}
# for f in imgs: img_meta[str(f)] = imagesize.get(root+f)

# # Convert it to Dataframe and compute aspect ratio
# img_meta_df = pd.DataFrame.from_dict([img_meta]).T.reset_index().set_axis(['FileName', 'Size'], axis='columns', inplace=False)
# img_meta_df[["Width", "Height"]] = pd.DataFrame(img_meta_df["Size"].tolist(), index=img_meta_df.index)
# img_meta_df["Aspect Ratio"] = round(img_meta_df["Width"] / img_meta_df["Height"], 2)

# print(f'Total Nr of Images in the dataset: {len(img_meta_df)}')
# img_meta_df.head()

In [None]:
# # Visualize Image Resolutions

# fig = plt.figure(figsize=(8, 8))
# ax = fig.add_subplot(111)
# points = ax.scatter(img_meta_df.Width, img_meta_df.Height, color='blue', alpha=0.5, picker=True)
# ax.set_title("Image Resolution")
# ax.set_xlabel("Width", size=14)
# ax.set_ylabel("Height", size=14)

In [None]:
# # Visualize Image Resolutions

# fig = plt.figure(figsize=(8, 8))
# ax = fig.add_subplot(111)
# points = ax.scatter(img_meta_df.Width, img_meta_df.Height, color='blue', alpha=0.5, s=img_meta_df["Aspect Ratio"]*100, picker=True)
# ax.set_title("Image Resolution")
# ax.set_xlabel("Width", size=14)
# ax.set_ylabel("Height", size=14)

```
#Create row for converted Price of Knives in US dollars
price_list = []
for row in full_dataset:
    listed_price = np.float(row['sellingStatus']['convertedCurrentPrice']['value'])
    price_list.append(listed_price)
    
df['price_in_US'] = price_list
```

```
#atttempt to pull shipping cost from json dict
shipping_cost_list = []
for row in full_dataset:
    shipping_cost = np.float(row['shippingInfo']['shippingServiceCost']['value'])
    shipping_cost_list.append(shipping_cost)
    
df['shipping_price'] = shipping_cost_list
```

```
#pull shipping cost from json dict with regex 
df['shipping_cost'] = df['shippingInfo'].apply(lambda x: re.findall("(\d+\S+\d)", json.dumps(x)))
df['shipping_cost'] = df['shipping_cost'].apply(lambda x: ''.join(x))
df.drop(df[df['shipping_cost'] == ''].index, inplace=True)
df['shipping_cost'] = df['shipping_cost'].apply(lambda x: np.float(x))

#create new feature 'converted price'
df['converted_price'] = df['shipping_cost'] + df['price_in_US']
df = df.drop_duplicates(subset=['title', 'galleryURL'], keep='first')
display(df.head())
display(df.info())

df.to_csv('data/full_dataset.csv', index=False)
```

## Data Preparation

Describe and justify the process for preparing the data for analysis.

***
Questions to consider:
* Were there variables you dropped or created?
* How did you address missing values or outliers?
* Why are these choices appropriate given the data and the business problem?
***


# here you run your code to clean the data

```
import code.data_cleaning as dc

full_dataset = dc.full_clean()
```

## Data Modeling
Describe and justify the process for analyzing or modeling the data.

***
Questions to consider:
* How did you analyze or model the data?
* How did you iterate on your initial approach to make it better?
* Why are these choices appropriate given the data and the business problem?
***
# here you run your code to model the data


## Evaluation
Evaluate how well your work solves the stated business problem.

***
Questions to consider:
* How do you interpret the results?
* How well does your model fit your data? How much better is this than your baseline model?
* How confident are you that your results would generalize beyond the data you have?
* How confident are you that this model would benefit the business if put into use?
***


## Conclusions
Provide your conclusions about the work you've done, including any limitations or next steps.

***
Questions to consider:
* What would you recommend the business do as a result of this work?
* What are some reasons why your analysis might not fully solve the business problem?
* What else could you do in the future to improve this project?
***



In [None]:
# data_knife_dir = 'knife_images'
# data_profit_dir = 'data/profit'
# new_dir = 'split'

In [None]:
# os.mkdir(new_dir)

In [None]:
# train_folder = os.path.join(new_dir, 'train')
# train_profit = os.path.join(train_folder, 'profit')
# os.mkdir(train_folder)
# os.mkdir(train_profit)

# test_folder = os.path.join(new_dir, 'test')
# test_profit = os.path.join(test_folder, 'profit')
# os.mkdir(test_folder)
# os.mkdir(test_profit)


# val_folder = os.path.join(new_dir, 'validation')
# val_profit = os.path.join(val_folder, 'profit')
# os.mkdir(val_folder)
# os.mkdir(val_profit)

In [None]:
# val_profit

In [None]:
# # train knife regression images
# #80% of data
# imgs = knife_images[:5620]
# for img in imgs:
#     origin = os.path.join(data_knife_dir, img)
#     destination = os.path.join(train_profit, img)
#     shutil.copyfile(origin, destination)
    
# # test knife regression images
# #10% of data
# imgs = knife_images[5620:6322]
# for img in imgs:
#     origin = os.path.join(data_knife_dir, img)
#     destination = os.path.join(test_profit, img)
#     shutil.copyfile(origin, destination)
    
    
# # validation knife regression images
# #10% of data
# imgs = knife_images[6322:]
# for img in imgs:
#     origin = os.path.join(data_knife_dir, img)
#     destination = os.path.join(val, img)
#     shutil.copyfile(origin, destination)

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model
from keras.preprocessing import image
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Dropout, Conv2D, Dense, Flatten, GlobalMaxPooling2D, MaxPooling2D, BatchNormalization

img_array = cv2.imread('knife_images/918.jpg')  # convert to array

img_rgb = cv2.resize(img_array,(256,256),3)
plt.imshow(img_rgb)  # graph it
plt.show();


In [None]:
def image_checker(index,):
    img_array = cv2.imread('knife_images/'+str(index)+'.jpg')  
    img_rgb = cv2.resize(img_array,(256,256),3)
    plt.imshow(img_rgb)  # graph it
    plt.show();

In [None]:
top_benchmade_index[:50]

In [None]:
image_checker(6158)

In [None]:
image_checker(2286)

In [None]:
image_checker(1879)

In [None]:
image_checker(4326)

In [None]:
image_checker(6094)

In [None]:
#final processing steps for images

image_list = []
for x in range(len(df_CNN_regression)):
    
    img_array = cv2.imread('knife_images/'+str(x)+'.jpg')  # convert to array
    img_rgb = cv2.resize(img_array,(256,256),3)  # resize
    img_rgb = np.array(img_rgb).astype(np.float64)/255.0  # scaling
    image_list.append(img_rgb)
   
    # img_rgb = np.expand_dims(img_rgb, axis=0)  # expand dimension



In [None]:
df_CNN_regression['mean_profit']= (df_CNN_regression['profit']/df_CNN_regression['profit'].mean())

In [None]:
df_CNN_regression['mean_profit'].describe()

In [None]:
X = np.array(image_list)

In [None]:
y=  df_CNN_regression['mean_profit']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6, test_size=0.4, random_state=32)# Create the Test and Final Training Datasets

In [None]:
X.shape

In [None]:
y.shape

In [None]:
print("Xtrain:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

In [None]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

print("Xtrain:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)
print("X_val:", X_test.shape)
print("y_val:", y_test.shape)


In [None]:
#small batch

# model = models.Sequential()

# model.add(layers.Conv2D(32, (3, 3), padding='same', activation='relu',
#                         input_shape=(256 ,256, 3)))
# model.add(layers.BatchNormalization())

# model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())

# model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Conv2D(256, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.Conv2D(256, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Flatten())


# model.add(Dense(256, activation='relu'))
# model.add(Dense(128, activation='relu'))
# model.add(Dense(1, activation='linear'))

# model.compile(loss='mean_squared_error',
#               optimizer='Adam',
#                metrics=['mse'])

# history = model.fit(X_train,
#                     y_train,
#                     epochs=30,
#                     batch_size=32,
#                     validation_data=(X_val, y_val))



In [None]:
results_test = model.evaluate(X_test, y_test)

#model.summary()


In [None]:
df_scrub['profit'].mean()

In [None]:
2.2164 * 41.374303202846974

In [None]:
df_scrub.head()

In [None]:
model.summary()

In [None]:
#The model learned patterns wells until epoch 20
#after that the loss spikes signifcantly before dropping again
fig = plt.figure(figsize=(12,8))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.plot
plt.title('model loss')
plt.ylabel('loss( mean square error)')
plt.xlabel('epoch')
plt.legend(['train_mse', 'val_mse'], loc='upper right')
plt.show();

In [None]:
model.save('my_model_batch32.h5')

In [None]:
#a train set of 60% and a val and test size of 20% each 

In [None]:
#this model showed a lot of indication that it was overfit
#need to retry how I split the data 
#Instead of manul indexing, will use 
# from sklearn model_selection train_test_split



# X_train = X[:4918]
# y_train = y[:4918]

# X_train = X[4918:5971]
# y_train = y[4918:5971]

# X_test = X[5971:]
# y_test = y[5971:]


# display(len(X_val)/len(X))
# display(len(X_train)/len(X))
# len(X_test)/len(X)



# model = models.Sequential()

# model.add(layers.Conv2D(32, (3, 3), padding='same', activation='relu',
#                         input_shape=(224 ,224,  3)))
# model.add(layers.BatchNormalization())

# model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())

# model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
# model.add(layers.BatchNormalization())
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Flatten())

# model.add(Dense(512, activation='relu'))
# model.add(Dropout(0.1))

# model.add(Dense(256, activation='relu'))
# model.add(Dense(128, activation='relu'))

# model.add(Dense(1, activation='linear'))

# model.compile(loss='mean_squared_error',
#               optimizer='Adam',
#                metrics=['mse'])
# history = model.fit(X_train,
#                     y_train,
#                     epochs=32,
#                     batch_size=300,
#                     validation_data=(X_val, y_val))




# results_train = model.evaluate(X_test, y_test)

#model.summary()


# model.save('my_model_batch500.h5')

In [None]:
history.history.keys()

In [None]:

fig = plt.figure(figsize=(12,8))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.plot
plt.title('model loss')
plt.ylabel('loss( mean square error)')
plt.xlabel('epoch')
plt.legend(['train_mse', 'val_mse'], loc='upper right')
plt.show();

In [None]:
X_train.shape

In [None]:
# results_train = model.evaluate(X_test, y_test)

#model.summary()


# model.save('my_model_batch500.h5')

In [None]:



# ## Data Preparation

# Describe and justify the process for preparing the data for analysis.

# ***
# Questions to consider:
# * Were there variables you dropped or created?
# * How did you address missing values or outliers?
# * Why are these choices appropriate given the data and the business problem?
# ***


# # here you run your code to clean the data

# ```
# import code.data_cleaning as dc

# full_dataset = dc.full_clean()
# ```

# ## Data Modeling
# Describe and justify the process for analyzing or modeling the data.

# ***
# Questions to consider:
# * How did you analyze or model the data?
# * How did you iterate on your initial approach to make it better?
# * Why are these choices appropriate given the data and the business problem?
# ***
# # here you run your code to model the data


# ## Evaluation
# Evaluate how well your work solves the stated business problem.

# ***
# Questions to consider:
# * How do you interpret the results?
# * How well does your model fit your data? How much better is this than your baseline model?
# * How confident are you that your results would generalize beyond the data you have?
# * How confident are you that this model would benefit the business if put into use?
# ***


# ## Conclusions
# Provide your conclusions about the work you've done, including any limitations or next steps.

# ***
# Questions to consider:
# * What would you recommend the business do as a result of this work?
# * What are some reasons why your analysis might not fully solve the business problem?
# * What else could you do in the future to improve this project?
# ***



# # data_knife_dir = 'knife_images'
# # data_profit_dir = 'data/profit'
# # new_dir = 'split'

# # os.mkdir(new_dir)

# # train_folder = os.path.join(new_dir, 'train')
# # train_profit = os.path.join(train_folder, 'profit')
# # os.mkdir(train_folder)
# # os.mkdir(train_profit)

# # test_folder = os.path.join(new_dir, 'test')
# # test_profit = os.path.join(test_folder, 'profit')
# # os.mkdir(test_folder)
# # os.mkdir(test_profit)


# # val_folder = os.path.join(new_dir, 'validation')
# # val_profit = os.path.join(val_folder, 'profit')
# # os.mkdir(val_folder)
# # os.mkdir(val_profit)

# # val_profit

# # # train knife regression images
# # #80% of data
# # imgs = knife_images[:5620]
# # for img in imgs:
# #     origin = os.path.join(data_knife_dir, img)
# #     destination = os.path.join(train_profit, img)
# #     shutil.copyfile(origin, destination)
    
# # # test knife regression images
# # #10% of data
# # imgs = knife_images[5620:6322]
# # for img in imgs:
# #     origin = os.path.join(data_knife_dir, img)
# #     destination = os.path.join(test_profit, img)
# #     shutil.copyfile(origin, destination)
    
    
# # # validation knife regression images
# # #10% of data
# # imgs = knife_images[6322:]
# # for img in imgs:
# #     origin = os.path.join(data_knife_dir, img)
# #     destination = os.path.join(val, img)
# #     shutil.copyfile(origin, destination)

# import tensorflow as tf
# from tensorflow import keras
# from keras.models import load_model
# from keras.preprocessing import image
# import numpy as np
# import cv2
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
# import tensorflow as tf
# from tensorflow.keras.layers import Input, Dropout, Conv2D, Dense, Flatten, GlobalMaxPooling2D, MaxPooling2D, BatchNormalization

# img_array = cv2.imread('knife_images/918.jpg')  # convert to array

# img_rgb = cv2.resize(img_array,(256,256),3)
# plt.imshow(img_rgb)  # graph it
# plt.show();


# def image_checker(index,):
#     img_array = cv2.imread('knife_images/'+str(index)+'.jpg')  
#     img_rgb = cv2.resize(img_array,(256,256),3)
#     plt.imshow(img_rgb)  # graph it
#     plt.show();

# top_benchmade_index[:50]

# image_checker(6158)

# image_checker(2286)

# image_checker(1879)

# image_checker(4326)

# image_checker(6094)

# #final processing steps for images

# image_list = []
# for x in range(len(df_CNN_regression)):
    
#     img_array = cv2.imread('knife_images/'+str(x)+'.jpg')  # convert to array
#     img_rgb = cv2.resize(img_array,(256,256),3)  # resize
#     img_rgb = np.array(img_rgb).astype(np.float64)/255.0  # scaling
#     image_list.append(img_rgb)
   
#     # img_rgb = np.expand_dims(img_rgb, axis=0)  # expand dimension



# df_CNN_regression['mean_profit']= (df_CNN_regression['profit']/df_CNN_regression['profit'].mean())

# df_CNN_regression['mean_profit'].describe()

# X = np.array(image_list)

# y=  df_CNN_regression['mean_profit']

# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6, test_size=0.4, random_state=32)# Create the Test and Final Training Datasets

# X.shape

# y.shape

# print("Xtrain:", X_train.shape)
# print("y_train:", y_train.shape)
# print("X_test:", X_test.shape)
# print("y_test:", y_test.shape)

# X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# print("Xtrain:", X_train.shape)
# print("y_train:", y_train.shape)
# print("X_test:", X_test.shape)
# print("y_test:", y_test.shape)
# print("X_val:", X_test.shape)
# print("y_val:", y_test.shape)




# #small batch

# # model = models.Sequential()

# # model.add(layers.Conv2D(32, (3, 3), padding='same', activation='relu',
# #                         input_shape=(256 ,256, 3)))
# # model.add(layers.BatchNormalization())

# # model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.MaxPooling2D((2, 2)))

# # model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())

# # model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.MaxPooling2D((2, 2)))

# # model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.MaxPooling2D((2, 2)))

# # model.add(layers.Conv2D(256, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.Conv2D(256, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.MaxPooling2D((2, 2)))

# # model.add(layers.Flatten())


# # model.add(Dense(256, activation='relu'))
# # model.add(Dense(128, activation='relu'))
# # model.add(Dense(1, activation='linear'))

# # model.compile(loss='mean_squared_error',
# #               optimizer='Adam',
# #                metrics=['mse'])

# # history = model.fit(X_train,
# #                     y_train,
# #                     epochs=30,
# #                     batch_size=32,
# #                     validation_data=(X_val, y_val))



# results_test = model.evaluate(X_test, y_test)

# #model.summary()


# df_scrub['profit'].mean()

# 2.2164 * 41.374303202846974

# df_scrub.head()



# model.summary()

# #The model learned patterns wells until epoch 20
# #after that the loss spikes signifcantly before dropping again
# fig = plt.figure(figsize=(12,8))
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.plot
# plt.title('model loss')
# plt.ylabel('loss( mean square error)')
# plt.xlabel('epoch')
# plt.legend(['train_mse', 'val_mse'], loc='upper right')
# plt.show();

# model.save('my_model_batch32.h5')





# #a train set of 60% and a val and test size of 20% each 

# #this model showed a lot of indication that it was overfit
# #need to retry how I split the data 
# #Instead of manul indexing, will use 
# # from sklearn model_selection train_test_split



# # X_train = X[:4918]
# # y_train = y[:4918]

# # X_train = X[4918:5971]
# # y_train = y[4918:5971]

# # X_test = X[5971:]
# # y_test = y[5971:]


# # display(len(X_val)/len(X))
# # display(len(X_train)/len(X))
# # len(X_test)/len(X)



# # model = models.Sequential()

# # model.add(layers.Conv2D(32, (3, 3), padding='same', activation='relu',
# #                         input_shape=(224 ,224,  3)))
# # model.add(layers.BatchNormalization())

# # model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.MaxPooling2D((2, 2)))

# # model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())

# # model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.MaxPooling2D((2, 2)))

# # model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
# # model.add(layers.BatchNormalization())
# # model.add(layers.MaxPooling2D((2, 2)))

# # model.add(layers.Flatten())

# # model.add(Dense(512, activation='relu'))
# # model.add(Dropout(0.1))

# # model.add(Dense(256, activation='relu'))
# # model.add(Dense(128, activation='relu'))

# # model.add(Dense(1, activation='linear'))

# # model.compile(loss='mean_squared_error',
# #               optimizer='Adam',
# #                metrics=['mse'])
# # history = model.fit(X_train,
# #                     y_train,
# #                     epochs=32,
# #                     batch_size=300,
# #                     validation_data=(X_val, y_val))




# # results_train = model.evaluate(X_test, y_test)

# #model.summary()


# # model.save('my_model_batch500.h5')

# results_train = model.evaluate(X_test, y_test)

# model.summary()

# # model.save('my_model_batch500.h5')

# history.history.keys()

# #The model is showing a lot of signs of overfitting 
# fig = plt.figure(figsize=(12,8))
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.plot
# plt.title('model loss')
# plt.ylabel('loss( mean square error)')
# plt.xlabel('epoch')
# plt.legend(['train_mse', 'val_mse'], loc='upper right')
# plt.show();

# X_train.shape

# # results_train = model.evaluate(X_test, y_test)

# #model.summary()


# # model.save('my_model_batch500.h5')

