## Modeling

In [5]:
from sklearn.model_selection import train_test_split
import os
from collections import Counter

import pandas as pd 
import  json
import requests
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import LSTM, Embedding, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling2D
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from keras import models
from keras import layers
import tensorflow as tf

from keras_preprocessing.image import ImageDataGenerator

In [6]:
def apply_iqr_filter(df):
    
    price_Q1 = df['converted_price'].quantile(0.25)
    price_Q3 = df['converted_price'].quantile(0.75)
    price_iqr = price_Q3 - price_Q1

    profit_Q1 = df['profit'].quantile(0.25)
    profit_Q3 = df['profit'].quantile(0.75)
    profit_iqr = profit_Q3 - profit_Q1

    ROI_Q1 = df['ROI'].quantile(0.25)
    ROI_Q3 = df['ROI'].quantile(0.75)
    ROI_iqr = ROI_Q3 - ROI_Q1

    price_upper_limit = price_Q3 + (1.5 * price_iqr)
    price_lower_limit = price_Q1 - (1.5 * price_iqr)

    profit_upper_limit = profit_Q3 + (1.5 * profit_iqr)
    profit_lower_limit = profit_Q1 - (1.5 * profit_iqr)

    ROI_upper_limit = ROI_Q3 + (1.5 * ROI_iqr)
    ROI_lower_limit = ROI_Q1 - (1.5 * ROI_iqr)
    
#     print(f'Brand: {df.brand[0]}')
#     print(f'price upper limit: ${np.round(price_upper_limit,2)}')
#     print(f'price lower limit: ${np.round(price_lower_limit,2)}')
#     print('-----------------------------------')
#     print(f'profit upper limit: ${np.round(profit_upper_limit,2)}')
#     print(f'profit lower limit: ${np.round(profit_lower_limit,2)}')
#     print('-----------------------------------')
#     print(f'ROI upper limit: {np.round(ROI_upper_limit,2)}%')
#     print(f'ROI lower limit: {np.round(ROI_lower_limit,2)}%')
#     print('-----------------------------------')

    
    new_df = df[(df['converted_price'] <= price_upper_limit) &
                (df['converted_price'] >= price_lower_limit) &
                (df['profit'] <= profit_upper_limit) &
                (df['ROI'] <= ROI_upper_limit) &
                (df['profit'] <= profit_upper_limit) &
                (df['ROI'] >= ROI_lower_limit)]
    
    return new_df
#download jpg urls from dataFrame
def download(row):
    filename = os.path.join(root_folder, str(row.name) + im_extension)

# create folder if it doesn't exist
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    url = row.Image
#     print(f"Downloading {url} to {filename}")
    
    try:
        r = requests.get(url, allow_redirects=True)
        with open(filename, 'wb') as f:
            f.write(r.content)
    except:
        print(f'{filename} error')


def cardinality_threshold(column,threshold=0.75,return_categories_list=True):
    #calculate the threshold value using
    #the frequency of instances in column
    threshold_value=int(threshold*len(column))
    #initialize a new list for lower cardinality column
    categories_list=[]
    #initialize a variable to calculate sum of frequencies
    s=0
    #Create a dictionary (unique_category: frequency)
    counts=Counter(column)

    #Iterate through category names and corresponding frequencies after sorting the categories
    #by descending order of frequency
    for i,j in counts.most_common():
        #Add the frequency to the total sum
        s += dict(counts)[i]
        #append the category name to the categories list
        categories_list.append(i)
        #Check if the global sum has reached the threshold value, if so break the loop
        if s >= threshold_value:
            break
      #append the new 'Other' category to list
    categories_list.append('Other')

    #Take all instances not in categories below threshold  
    #that were kept and lump them into the
    #new 'Other' category.
    new_column = column.apply(lambda x: x if x in categories_list else 'Other')

    #Return the transformed column and
    #unique categories if return_categories = True
    if(return_categories_list):
        return new_column,categories_list
    #Return only the transformed column if return_categories=False
    else:
        return new_column
    
    
    
def year_wrangler(row):
    if row['Year'] >= 1960 and row['Year'] < 1970:
        return '60s'
    elif row['Year'] >= 1970 and row['Year'] < 1980:
        return '70s'
    elif row['Year'] >= 1980 and row['Year'] < 1990:
        return '80s'
    elif row['Year'] >= 1990 and row['Year'] < 2000:
        return '90s'
    elif row['Year'] >= 2000 and row['Year'] < 2010:
        return '00s'
    elif row['Year'] >= 2010 and row['Year'] < 2020:
        return '10s'
    elif row['Year'] >= 2020 and row['Year'] < 2030:
        return '20s'
    
    else:
        return 'unknown'
    
def bin_knife_year(knife_df):
    knife_df['construction_year'] = knife_df.apply(lambda row: year_wrangler(row), axis=1)
    return knife_df

In [None]:
# def drop_cols(water_pump_df):
#     to_drop_final = ['id', 'recorded_by', 'num_private',
#           'waterpoint_type_group', 'source',
#           'source_class', 'extraction_type',
#           'extraction_type_group', 'payment_type',
#           'management_group', 'scheme_name',
#           'water_quality', 'quantity_group',
#           'scheme_management', 'longitude',
#           'latitude', 'date_recorded',
#           'amount_tsh', 'gps_height',
#           'region_code', 'district_code']
#           #'population'
    
#     return water_pump_df.drop(columns=to_drop_final, axis=1)

#helper function to bin construction year


# def apply_cardinality_reduct(water_pump_df, reduct_dict):
#     for col, categories_list in reduct_dict.items():
#         water_pump_df[col] = water_pump_df[col].apply(lambda x: x if x in categories_list else 'Other')
#     return water_pump_df
        


# #one_hot_incode categorical data
# def one_hot(water_pump_df):
#     final_cat = ['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region',
#        'lga', 'ward', 'public_meeting', 'permit', 'construction_year',
#        'extraction_type_class', 'management', 'payment', 'quality_group',
#        'quantity', 'source_type', 'waterpoint_type']
    
#     water_pump_df = pd.get_dummies(water_pump_df[final_cat], drop_first=True)
    
#     return water_pump_df

    
    
# #master function for cleaning dataFrame
# def clean_dataFrame(water_pump_df, reduct_dict):
#     water_pump_df = drop_cols(water_pump_df)
#     water_pump_df = bin_construction_year(water_pump_df)
#     water_pump_df = fill_unknowns(water_pump_df)
#     water_pump_df = fill_col_normal_data(water_pump_df)
#     water_pump_df = apply_cardinality_reduct(water_pump_df, reduct_dict)
#     water_pump_df = one_hot(water_pump_df)
    
#     return water_pump_df

# ###############################################
# # The rest of the functions in this section
# #define functions that reduce cardinality
# #by mapping infrequent values ot other
# #the dictionary derived from these functions
# #will be used by my_funk in my master 
# #clean_dataFrame function

# #helper function for reducing cardinality    
# def cardinality_threshold(column,threshold=0.65):
#     #calculate the threshold value using
#     #the frequency of instances in column
#     threshold_value=int(threshold*len(column))
#     #initialize a new list for lower cardinality column
#     categories_list=[]
#     #initialize a variable to calculate sum of frequencies
#     s=0
#     #Create a dictionary (unique_category: frequency)
#     counts=Counter(column)

#     #Iterate through category names and corresponding frequencies after sorting the categories
#     #by descending order of frequency
#     for i,j in counts.most_common():
#         #Add the frequency to the total sum
#         s += dict(counts)[i]
#         #append the category name to the categories list
#         categories_list.append(i)
#         #Check if the global sum has reached the threshold value, if so break the loop
#         if s >= threshold_value:
#             break
#         #append the new 'Other' category to list
#         categories_list.append('Other')

#     #Take all instances not in categories below threshold  
#     #that were kept and lump them into the
#     #new 'Other' category.
#     new_column = column.apply(lambda x: x if x in categories_list else 'Other')
# #     return new_column
#     return categories_list
     
#  #reduces the cardinality of appropriate categories   
# def get_col_val_mapping(water_pump_df):
#     col_threshold_list = [
#         ('funder',0.65), 
#         ('installer', 0.65),
#         ('wpt_name', 0.15),
#         ('subvillage', 0.07),
#         ('lga', 0.6),
#         ('ward', 0.05)
#     ]
    
#     reduct_dict = {}
    
#     for col, thresh in col_threshold_list:
#         reduct_dict[col] = cardinality_threshold(water_pump_df[col],
#                                                    threshold= thresh)
        
#     return reduct_dict

# reduct_dict is a key value mapper that will
# be used for both training and testing sets
# in order to reduce cardinality of the data

In [7]:
listed_df = pd.read_csv('listed_data/listed_knives_df.csv', 
                        dtype={'UPC': str, 
                               'Year': str,
                               'MPN': str})

In [None]:
# df_listed.drop(['shipping_cost', 
#                 'price_in_US','cost',
#                 'Original/Reproduction', 
#                 'specBrand', 'Type'],
#                axis=1,inplace=True)



# str_columns = ['Location', 'Country', 'Model',
#                'Country/Region of Manufacture', 
#                'Blade Material', 'Blade Type',
#                'Blade Edge', 'Dexterity', 
#                'Color', 'Number of Blades',
#                'Opening Mechanism', 'Handle Material', 
#                'Lock Type', 'Blade Range']

# for col in str_columns:
#     df_listed[col] = df_listed[col].str.lower()
    
# pattern = ".*,\s*([^\d,]+?)(?:\s*\d+)?$"
# df_listed['State_or_Province'] = df_listed['Location'].str.extract(pattern)

# pattern = re.compile("(\d{4}$)")
# df_listed['Year'] = df_listed['Year'].str.extract(pattern)

# df_listed['Year'] = df_listed['Year'].fillna(0)

# df_listed['Year'] = df_listed['Year'].astype(int)

# df_listed = bin_knife_year(df_listed)

In [12]:
sold_df = pd.read_csv('terapeak_data/sold_df.csv')

In [13]:
sold_df.brand.value_counts()

case          17337
buck          11384
kershaw       10766
victorinox     9486
spyderco       6165
benchmade      5792
crkt           4292
sog            3018
Name: brand, dtype: int64

In [14]:
sold_df.drop(['price_in_US', 
              'shipping_cost'],
             axis=1, inplace=True)

In [15]:
df_sold = apply_iqr_filter(sold_df).copy()

In [16]:
used_listed = listed_df.loc[listed_df['condition'] != 1000]

In [17]:
used_listed.reset_index(drop=True,inplace=True)

In [18]:
cols = ['title','pictureURLLarge','converted_price','brand','profit','ROI']
used_listed2 = used_listed[cols].copy()
df1 = pd.concat([sold_df, used_listed2]).copy()
df1['Image'].fillna(df1['pictureURLLarge'], inplace=True)

In [19]:
df = apply_iqr_filter(df1).copy()

### Neural network with "title" column as input

In [20]:
df_title = df.drop(['Image', 'url', 
                    'date_sold', 'profit',
                    'ROI', 'brand', 'cost'],
                     axis=1).copy()

In [21]:
df_title.rename({'title': 'data',
                 'converted_price': 'labels'},
                axis=1, inplace=True)

In [22]:
mean_price = df_title['labels'].mean()
mean_price

49.62917081729681

In [23]:
df_title['labels'] = (df_title['labels']/mean_price)
Y = df_title['labels'].values

In [24]:
df_train, df_test, Ytrain, Ytest = train_test_split(df_title['data'], Y, test_size=0.2)

In [25]:
# Convert sentences to sequences
MAX_VOCAB_SIZE = 40000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df_train)
sequences_train = tokenizer.texts_to_sequences(df_train)
sequences_test = tokenizer.texts_to_sequences(df_test)


In [26]:
# get word -> integer mapping
word2idx = tokenizer.word_index
V = len(word2idx)
print('Found %s unique tokens.' % V)

Found 30734 unique tokens.


In [27]:
# pad sequences so that we get a N x T matrix
data_train = pad_sequences(sequences_train)
print('Shape of data train tensor:', data_train.shape)

# get sequence length
T = data_train.shape[1]

Shape of data train tensor: (56537, 43)


In [28]:
data_test = pad_sequences(sequences_test, maxlen=T)
print('Shape of data test tensor:', data_test.shape)

Shape of data test tensor: (14135, 43)


In [29]:
# Create the RNN model

# We get to choose embedding dimensionality
D = 20

# Hidden state dimensionality
M = 15


i = Input(shape=(T,))
x = Embedding(V + 1, D)(i)
x = LSTM(M, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1, activation='linear')(x)

model = Model(i, x)

In [30]:
# Compile and fit
model.compile(
  loss='MSE',
  optimizer='adam',
  metrics=['mae']
)


print('Training model...')
r = model.fit(
  data_train,
  Ytrain,
  epochs=5,
  validation_data=(data_test, Ytest)
)

Training model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 43)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 43, 20)            614700    
_________________________________________________________________
lstm (LSTM)                  (None, 43, 15)            2160      
_________________________________________________________________
global_max_pooling1d (Global (None, 15)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 16        
Total params: 616,876
Trainable params: 616,876
Non-trainable params: 0
_________________________________________________________________


In [None]:
fig = plt.subplots(figsize=(12,8))
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.title("Loss vs val Loss for RNN model on titles (MSE)", fontsize=15)
plt.xlabel("epochs", fontsize=15)
plt.ylabel("loss (mean squared error)", fontsize=15)
plt.legend();
plt.savefig('images/RNN_titles_MSE1.png')

In [None]:
fig = plt.subplots(figsize=(12,8))
plt.plot(r.history['mae'], label='mae')
plt.plot(r.history['val_mae'], label='val_mae')
plt.title("Loss vs val Loss for RNN model on titles (MAE)", fontsize=15)
plt.xlabel("epochs", fontsize=15)
plt.ylabel("loss (mean absolute error)", fontsize=15)
plt.legend();
plt.savefig('images/RNN_titles_MAE1.png')

In [None]:
0.276 * mean_price

In [None]:
# Create the CNN model

# We get to choose embedding dimensionality
D = 20



i = Input(shape=(T,))
x = Embedding(V + 1, D)(i)
x = Conv1D(32, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(64, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1, activation='linear')(x)

model = Model(i, x)

In [None]:
# Compile and fit
model.compile(
  loss='MSE',
  optimizer='adam',
  metrics=['MSE']
)


print('Training model...')
r = model.fit(
  data_train,
  Ytrain,
  epochs=5,
  validation_data=(data_test, Ytest)
)

In [None]:
# Plot loss per iteration
import matplotlib.pyplot as plt
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend();

In [None]:
# Plot accuracy per iteration
plt.plot(r.history['MSE'], label='MSE')
plt.plot(r.history['val_MSE'], label='val_MSE')
plt.legend();

### CNN using images as input

In [None]:
df_imgs = df.drop(['title', 'url', 
                   'date_sold', 'profit',
                   'ROI', 'brand', 'cost',
                   'pictureURLLarge'],
                     axis=1).copy()

In [None]:
df_imgs.dropna(subset=['Image'], inplace=True)

In [None]:
df_imgs.reset_index(drop=True, inplace=True)

In [None]:
df_imgs['file_index'] = df_imgs.index.values
df_imgs['file_index'] = df_imgs['file_index'].astype(str)

In [None]:
df_imgs['filename'] = df_imgs['file_index'] + '.jpg'

In [None]:
# Identify Image Resolutions

# # Import Packages
# import pandas as pd
# import matplotlib.pyplot  as plt
# from PIL import Image
# from pathlib import Path
# import imagesize
# import numpy as np

# # Get the Image Resolutions
# imgs = [img.name for img in Path(root).iterdir() if img.suffix == ".jpg"]
# img_meta = {}
# for f in imgs: img_meta[str(f)] = imagesize.get(root+f)

# # Convert it to Dataframe and compute aspect ratio
# img_meta_df = pd.DataFrame.from_dict([img_meta]).T.reset_index().set_axis(['FileName', 'Size'], axis='columns', inplace=False)
# img_meta_df[["Width", "Height"]] = pd.DataFrame(img_meta_df["Size"].tolist(), index=img_meta_df.index)
# img_meta_df["Aspect Ratio"] = round(img_meta_df["Width"] / img_meta_df["Height"], 2)

# print(f'Total Nr of Images in the dataset: {len(img_meta_df)}')
# img_meta_df.head()



# # Visualize Image Resolutions

# fig = plt.figure(figsize=(8, 8))
# ax = fig.add_subplot(111)
# points = ax.scatter(img_meta_df.Width, img_meta_df.Height, color='blue', alpha=0.5, s=img_meta_df["Aspect Ratio"]*100, picker=True)
# ax.set_title("Image Resolution")
# ax.set_xlabel("Width", size=14)
# ax.set_ylabel("Height", size=14);

In [None]:
def download(row):
    filename = row.filepath

# create folder if it doesn't exist
#     os.makedirs(os.path.dirname(filename), exist_ok=True)

    url = row.Image
#     print(f"Downloading {url} to {filename}")
    
    try:
        r = requests.get(url, allow_redirects=True)
        with open(filename, 'wb') as f:
            f.write(r.content)
    except:
        print(f'{filename} error')

In [None]:
root_folder = 'C:/Users/12108/Documents/GitHub/Neural_Network_Predicting_Reseller_Success_Ebay/nn_images/'
df_imgs['filepath'] = root_folder + df_imgs['filename']

In [None]:
df_imgs['filepath'].sample(2).apply(print)

In [None]:
df_imgs.apply(download, axis=1)

In [None]:
removed_files = []
pathway = 'C:/Users/12108/Documents/GitHub/Neural_Network_Predicting_Reseller_Success_Ebay/nn_images/'
for filename in os.listdir(pathway):
    if filename.endswith('.jpg'):
        try:
            img = Image.open(pathway + filename)  # open the image file
            img.verify()  # verify that it is, in fact an image
        except (IOError, SyntaxError) as e:
            print(filename)
            removed_files.append(filename)
            os.remove(pathway + filename)

In [None]:
to_drop = df_imgs.loc[df_imgs['filename'].isin(removed_files)].index.to_list()

In [None]:
df_imgs.drop(to_drop, inplace=True)

In [None]:
 img_list = os.listdir('C:/Users/12108/Documents/GitHub/Neural_Network_Predicting_Reseller_Success_Ebay/nn_images/')

In [None]:
img_df = df_imgs.loc[df_imgs['filename'].isin(img_list)].copy()

In [None]:
img_df.reset_index(drop=True, inplace=True)

In [None]:
img_df.info()

In [None]:
img_df.rename({'Image': 'data',
               'converted_price': 'labels'},
                axis=1, inplace=True)

In [None]:
median_price = img_df['labels'].median()
median_price

In [None]:
img_df['labels'] = (img_df['labels']/median_price)

In [None]:
Y = img_df['labels'].values

In [None]:
df_train, df_test, Ytrain, Ytest = train_test_split(img_df, Y, test_size=0.20)

In [None]:
datagen=ImageDataGenerator(rescale=1./255.,validation_split=0.20)

In [None]:
train_generator=datagen.flow_from_dataframe(
dataframe=df_train,
directory= None,
x_col="filepath",
y_col="labels",
subset="training",
batch_size=100,
seed=55,
shuffle=True,
class_mode="raw")
    
valid_generator=datagen.flow_from_dataframe(
dataframe=df_train,
directory=None,
x_col="filepath",
y_col="labels",
subset="validation",
batch_size=100,
seed=55,
shuffle=True,
class_mode="raw")

test_datagen=ImageDataGenerator(rescale=1./255.)
test_generator=test_datagen.flow_from_dataframe(
dataframe=df_test,
directory=None,
x_col="filepath",
y_col="labels",
batch_size=100,
seed=55,
shuffle=False,
class_mode="raw")

In [None]:
model = models.Sequential()

model.add(layers.Conv2D(16, (3, 3), padding='same', activation='relu',
                        input_shape=(256 ,256,  3)))
model.add(layers.BatchNormalization())
model.add(layers.Conv2D(16, (3, 3), activation='relu', padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Conv2D(32, (3, 3), padding='same', activation='relu',
                        input_shape=(256 ,256,  3)))
model.add(layers.BatchNormalization())
model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Flatten())

model.add(Dense(512, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(loss='MSE',
              optimizer='Adam',
               metrics=['mae', 'mse'])

In [None]:
summary = model.fit(train_generator, epochs=3, validation_data=valid_generator)

In [None]:
model.evaluate(valid_generator)

In [None]:
test_generator.reset()
pred=model.predict(test_generator,verbose=1)

In [None]:
test_results = model.evaluate(test_generator)

In [None]:
fig = plt.figure(figsize=(12,8))
plt.plot(summary.history['loss'])
plt.plot(summary.history['val_loss'])
plt.plot
plt.title('model loss')
plt.ylabel('loss(mean absolute error)')
plt.xlabel('epoch')
plt.legend(['train_loss', 'val_loss'], loc='upper right')
plt.show();

In [None]:
# # define two sets of inputs
# inputA = Input(shape=(32,))
# inputB = Input(shape=(128,))
# # the first branch operates on the first input
# x = Dense(8, activation="relu")(inputA)
# x = Dense(4, activation="relu")(x)
# x = Model(inputs=inputA, outputs=x)
# # the second branch opreates on the second input
# y = Dense(64, activation="relu")(inputB)
# y = Dense(32, activation="relu")(y)
# y = Dense(4, activation="relu")(y)
# y = Model(inputs=inputB, outputs=y)
# # combine the output of the two branches
# combined = concatenate([x.output, y.output])
# # apply a FC layer and then a regression prediction on the
# # combined outputs
# z = Dense(2, activation="relu")(combined)
# z = Dense(1, activation="linear")(z)
# # our model will accept the inputs of the two branches and
# # then output a single value
# model = Model(inputs=[x.input, y.input], outputs=z)