# Guitar App Data Updater

<h3> This notebook will do the following:</h3>
<ul>
    <li>Import the existing master data from AWS S3 using boto3</li>
    <li>Import the newly web-scraped data from AWS S3 using boto3</li>
    <li>Clean and prepare the new data for modeling using pandas, re, and ast (Abstract Syntax Trees)</li>
    <li>Merge the old data with the new data and upload the result to AWS S3 using boto3</li>
    <li>Build a new model from the new master data, save it as a pkl file, and upload it to AWS S3 using boto3</li>
</ul>

In [None]:
## import required packages (standard library first, then third-party)
import ast
import configparser
import logging
import math
import os
import pickle
import re

import boto3
import boto3.session
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from botocore.exceptions import ClientError
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split



In [None]:
# Set AWS credentials from secret file (kept outside the notebook on purpose)
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail with a clear message here instead of an
# opaque KeyError on config['aws'] below.
if not config.read('aws.ini'):
    raise FileNotFoundError("Could not read the AWS credentials file 'aws.ini'")

AWS_key_id     = config['aws']['aws_access_key_id']
AWS_secret_key = config['aws']['aws_secret_access_key']

# Connection settings shared by the client and the resource created below
aws_connection = dict(
    aws_access_key_id = AWS_key_id,
    aws_secret_access_key = AWS_secret_key,
    region_name = 'us-east-1',
)

# Creating the low level functional client
client = boto3.client('s3', **aws_connection)

# Creating the high level object oriented interface
resource = boto3.resource('s3', **aws_connection)

In [None]:
# sanity check, see if the bucket can be accessed
# The response is indexed like a dict below: its 'Buckets' entry is iterated
# and the 'Name' of every item is printed.
clientResponse = client.list_buckets()
    
# Print the bucket names one by one
print('Getting bucket name..')
# NOTE: `bucket` stays defined as a notebook global after this loop; the model
# upload cell later rebinds the same name to the bucket-name string.
for bucket in clientResponse['Buckets']:
    print(f'Bucket Name: {bucket["Name"]}')

Getting bucket name..
Bucket Name: dataforguitarapp


<b name='import'>Check if the existing model can be pulled from AWS</b>

In [None]:
# reading in pickle data 
# Downloads the object stored under the key 'guitar.pkl' in the
# 'dataforguitarapp' bucket and unpickles it.

response = client.get_object(Bucket='dataforguitarapp', Key='guitar.pkl')

# SECURITY: pickle.loads can execute arbitrary code while deserialising, so this
# is only safe while write access to the bucket is restricted to trusted parties.
body = response['Body'].read()
datamodel = pickle.loads(body)

# verify that the file is an LR model
type(datamodel)

sklearn.linear_model._base.LinearRegression

<b>Get the existing master data which will be merged with the newly scraped data. </b>

In [None]:

# Create the S3 object
# (fetches the current master data set, 'full_data.csv', from the bucket)
obj = client.get_object(
    Bucket = 'dataforguitarapp',
    Key = 'full_data.csv'
)
    
# Read data from the S3 object
# NOTE(review): the info() output of this cell lists an 'Unnamed: 0' column,
# which looks like an index that was written out with the CSV - confirm, and
# consider index_col=0 here (or index=False when the file is written).
main_df = pd.read_csv(obj['Body'])

    
# inspect the data frame
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112938 entries, 0 to 112937
Data columns (total 20 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      112938 non-null  int64 
 1   title           112938 non-null  object
 2   brand           112938 non-null  object
 3   condition       112938 non-null  object
 4   categories      112938 non-null  object
 5   price           112938 non-null  object
 6   pickups         112938 non-null  object
 7   type            112938 non-null  object
 8   model           112938 non-null  object
 9   finish          111638 non-null  object
 10  origin          112938 non-null  object
 11  year            112938 non-null  object
 12  top             112938 non-null  object
 13  handed          112938 non-null  object
 14  neck            112938 non-null  object
 15  product_group   112938 non-null  object
 16  body_type       112938 non-null  object
 17  body_material   112938 non-nu

<b>Get the newly webscraped data</b>

In [None]:
# Create the S3 object
# (fetches the newly scraped rows, 'update_data.csv', from the same bucket)
obj2 = client.get_object(
    Bucket = 'dataforguitarapp',
    Key = 'update_data.csv'
)
    
# Read data from the S3 object
# NOTE(review): no output is recorded for this cell, so the column layout of the
# scraped file cannot be checked against what clean_raw_data expects - confirm.
update_df = pd.read_csv(obj2['Body'])

    
# inspect the data frame
update_df.info()

<b>Clean the new data and prep for modeling</b>

In [None]:
# pandas, ast and re are already imported by the first cell of this notebook;
# numpy, warnings and tqdm are first imported here.
import pandas as pd
import numpy as np

import warnings
# Silences every warning for the rest of the session, including pandas'
# SettingWithCopyWarning that the cleaning helpers below would otherwise emit.
warnings.filterwarnings("ignore")

import ast
import re

from tqdm import tqdm
# Needed for the Series.progress_apply call used in clean_raw_data.
tqdm.pandas()



def clean_year(txt):
    """
    Spell out abbreviated years and decades in a piece of text.

    Every occurrence of an apostrophe-prefixed year or decade listed below
    (for example "'59" or "'70s") is replaced with its four-digit form.
    The substitutions are applied one after another in the order listed.

    input: text as str
    output: the text with the listed abbreviations expanded
    """
    abbreviations = (
        ("'59", '1959'),
        ("'60", '1960'),
        ("'61", '1961'),
        ("'62", '1962'),
        ("'70s", '1970s'),
        ("'60s", '1960s'),
        ("'80s", '1980s'),
        ("'90s", '1990s'),
        ("'00s", '2000s'),
    )
    for short_form, long_form in abbreviations:
        txt = txt.replace(short_form, long_form)
    return txt


def expand_data(df):
    """
    Parse the dict-like strings in the ``more_info`` column into their own frame.

    Each non-null ``more_info`` value has its double quotes replaced by single
    quotes, is passed through ``clean_year`` and is then evaluated with
    ``ast.literal_eval``. Every value that parses becomes one row of the result;
    values that cannot be parsed are counted and reported, not raised.

    input:  df with a ``more_info`` column of strings
    output: DataFrame with one row per parsed record (empty if none parsed).
            As before, every row carries index label 0, and because failed
            records are left out the rows do not line up with ``df``.
    """
    # rows without more_info cannot be parsed, so leave them out up front
    project_data = df[df.more_info.notnull()].copy().reset_index()
    dropped_count = 0  # counter to report if any records were dropped

    # Collect the one-row frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 (and the old bare `except`
    # hid that failure by counting every record as dropped), and growing a
    # frame inside the loop is quadratic.
    parsed_rows = []
    for val in tqdm(project_data.more_info):
        val = clean_year(val.replace('"', "'"))
        try:
            parsed_rows.append(pd.DataFrame([ast.literal_eval(val)]))
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # the failures ast.literal_eval documents for malformed input;
            # anything else is a real bug and should surface
            dropped_count += 1
    print(f'There were {dropped_count} records that could not  be processed')

    if not parsed_rows:
        return pd.DataFrame()
    return pd.concat(parsed_rows)

def clean_price(txt):
    '''
    Turn a price string such as "$1,234.50" into a float.

    Dollar signs and commas are stripped before the conversion.
    '''
    symbols_to_drop = str.maketrans('', '', '$,')
    return float(txt.translate(symbols_to_drop))


def encode_price(df):
    """
    Add a ``price_code`` column that buckets ``price`` into bands.

    Prices below 500 get 0, below 1000 get 1, below 1500 get 2, below 2500
    get 3, below 3500 get 4 and everything else gets 6. The frame is modified
    in place; nothing is returned.
    """
    upper_bounds = (500, 1000, 1500, 2500, 3500)
    below_bound = [df.price < bound for bound in upper_bounds]
    band_codes = list(range(len(upper_bounds)))

    # NOTE(review): the fallback code is 6, so no price ever receives code 5 -
    # confirm whether that gap is intentional.
    df['price_code'] = np.select(below_bound, band_codes, default=6)


# feature origin code
def clean_origin(df):
    """
    Replace the generic 'Asia' origin with a specific country where the model
    name gives it away. The frame is modified in place; nothing is returned.

    Rules, applied in order and only to rows whose origin is still 'Asia':
      * any model containing 'American'                       -> 'United States'
      * Ibanez models containing AG, RG, RX, Prestige or
        Signature                                             -> 'Japan'
      * Ibanez models containing Artcore                      -> 'Korea'

    Expects the columns ``model``, ``origin`` and ``brand``.
    """
    # A single df.loc[mask, 'origin'] assignment is used throughout. The
    # previous chained form, df['origin'].loc[mask] = ..., assigns through an
    # intermediate Series and is not guaranteed to write back to df.

    ## Correcting origin for American Made guitars
    # na=False: a missing model name simply does not match
    american_model = df['model'].str.contains('American', na=False)
    df.loc[american_model & (df['origin'] == 'Asia'), 'origin'] = 'United States'

    ## Correcting origin for Ibanez guitars
    ibanez_rules = (
        ('AG', 'Japan'),
        ('RG', 'Japan'),
        ('RX', 'Japan'),
        ('Prestige', 'Japan'),
        ('Signature', 'Japan'),
        ('Artcore', 'Korea'),
    )
    is_ibanez = df['brand'] == 'Ibanez'
    for keyword, country in ibanez_rules:
        # origin is re-read on every pass, so a row corrected by an earlier
        # rule is never overwritten by a later one
        still_asia = df['origin'] == 'Asia'
        keyword_in_model = df['model'].str.contains(keyword, na=False)
        df.loc[still_asia & is_ibanez & keyword_in_model, 'origin'] = country
    
def extract_origin(df):
    """
    Add an ``origin_code`` column holding a numeric code for the country found
    in ``origin``. The frame is modified in place; nothing is returned.

    The first known country name found in each ``origin`` string is looked up
    in the table below. Rows where no known name is found get NaN.
    """
    origin_dict = {
        'Asia':0,
        'China':0,
        'Russia':0,
        'ChinaIndonesia':1,
        'Indonesia':1,
        # 'Vietnam' used to be listed twice (0, then 1); the later entry won,
        # so 1 is the value that was actually in effect.
        'Vietnam':1,
        'Germany':2,
        'Korea':2,
        'Mexico':3,
        'Japan':3,
        'United States':4,
    }
    # Regex alternation takes the first alternative that matches, so longer
    # names must come first; otherwise 'China' wins over 'ChinaIndonesia' and
    # that entry can never be reached.
    names = sorted(origin_dict, key=len, reverse=True)
    pattern = '(' + '|'.join(re.escape(name) for name in names) + ')'

    # expand=False always yields a Series. The previous extract(...).squeeze()
    # collapsed to a plain string on a one-row frame and then failed on .map.
    df['origin_code'] = df['origin'].str.extract(pattern, expand=False).map(origin_dict)
    
    
    
    
def encode_type(df):
    """
    Add a ``type_code`` column derived from the ``type`` column.

    'Solid Body' -> 3, 'Semi-Hollow' -> 2, 'Hollow Body' -> 1, 'Other' -> 0.
    Any other value also gets 0. The frame is modified in place; nothing is
    returned.
    """
    code_by_type = {
        'Solid Body': 3,
        'Semi-Hollow': 2,
        'Hollow Body': 1,
        'Other': 0,
    }
    is_type = [df.type == type_name for type_name in code_by_type]
    df['type_code'] = np.select(is_type, list(code_by_type.values()))

# encode Condition
def reduce_condition(df):
    """
    Collapse the free-text ``condition`` column to one of six grades.

    The first grade, in the order Mint, Excellent, Very Good, Good, Fair,
    Poor, that occurs in the text replaces it. 'Very Good' is tested before
    'Good' so it is not swallowed by the shorter name. Text containing none
    of the grades, and missing values, become the string '0'. The frame is
    modified in place; nothing is returned.
    """
    values = ['Mint','Excellent','Very Good','Good','Fair','Poor']

    # na=False keeps the masks strictly boolean, which np.select requires
    conditions = [df.condition.str.contains(grade, na=False) for grade in values]

    # np.select's implicit default is the integer 0. Older NumPy coerced it to
    # the string '0' next to string choices; NumPy 2 raises a TypeError
    # instead. Passing '0' explicitly keeps the historical output and works on
    # both.
    df['condition'] = np.select(conditions, values, default='0')
    
def encode_condition(df):
    """
    Add a ``cond_score`` column scoring the text in ``condition``.

    The first grade found in the text, checked in the order below, decides the
    score: Mint 5, Excellent 4, Very Good 3, Good 2, Fair 1, Poor 0.
    Text containing none of the grades scores 3. The frame is modified in
    place; nothing is returned.
    """
    score_by_grade = {
        'Mint': 5,
        'Excellent': 4,
        'Very Good': 3,
        'Good': 2,
        'Fair': 1,
        'Poor': 0,
    }
    grade_found = [df.condition.str.contains(grade) for grade in score_by_grade]
    df['cond_score'] = np.select(grade_found, list(score_by_grade.values()), default=3)


# cleaning up types to prep for encoding body type
def clean_type(df):
    """
    Fill in ``type`` for rows where it is 'unknown', using brand and title.

    Rules run in the order listed and each one only touches rows that are
    still 'unknown', so the first matching rule wins. Rows matching no rule
    stay 'unknown'. The frame is modified in place; nothing is returned.

    Expects the columns ``type``, ``brand`` and ``title``.
    """
    ## cleaning up PRS types (recognised by brand rather than by title)
    df.loc[(df['type'] == 'unknown') & (df['brand'] == 'PRS'), 'type'] = 'PRS style'

    # (text to look for in the title, type to assign)
    title_rules = (
        ('Silver Sky', 'Stratocaster'),
        ## capturing all Fender types
        ('Stratocaster', 'Stratocaster'),
        ('Tele', 'Telecaster'),
        ('Strat', 'Stratocaster'),
        ('Jazzmaster', 'Jazzmaster'),
        ('Jaguar', 'Jaguar'),
        ## capturing all ibanez types
        ('Ibanez', 'Stratocaster'),
        ## capturing all Gibson types
        ('Les Paul', 'Les Paul'),
        ('SG', 'SG'),
        ('Flying', 'Flying V'),
        ('ES', 'ES'),
        ('Casino', 'ES'),
        ('Sheraton', 'ES'),
    )
    for keyword, label in title_rules:
        # Re-evaluated on every pass so rows labelled by an earlier rule are
        # skipped. The old 'Tele' rule wrote `df(df.title...)`, which calls the
        # DataFrame and raised TypeError; the table form removes that typo.
        still_unknown = df['type'] == 'unknown'
        # plain substring match; a missing title never matches
        keyword_in_title = df['title'].str.contains(keyword, regex=False, na=False)
        # single .loc assignment instead of the chained df['type'].loc[...] = ...
        df.loc[still_unknown & keyword_in_title, 'type'] = label

# feature engineer body_code
def reduce_types(df):
    """
    Add a ``body_code`` column holding a numeric code for the guitar shape
    named in ``type``. The frame is modified in place; nothing is returned.

    The first known shape name found in each ``type`` string is looked up in
    the table below. Rows where no known name is found get NaN.
    """
    # NOTE(review): clean_type can assign 'Jaguar' and 'Flying V', neither of
    # which has an entry here, so those rows end up with NaN - confirm the
    # codes they should receive.
    body_dict = {
                'unknown':0,
                'Stratocaster':4,
                'ES':4,
                'PRS style':2,
                'Telecaster':5,
                'Les Paul':6,
                'SG':3,
                'Jazzmaster':1,
                'Coronado':1,
                'Mustang':1,
                'Esquire':2,
                'Cyclone':2,
                    }
    pattern = '(' + '|'.join(re.escape(name) for name in body_dict) + ')'

    # expand=False always yields a Series. The previous extract(...).squeeze()
    # collapsed to a plain string on a one-row frame and then failed on .map.
    df['body_code'] = df['type'].str.extract(pattern, expand=False).map(body_dict)
    
    
def clean_raw_data(df):
    '''
    This function will clean and prepare the webscraped data for modeling
    input: webscraped data as df
    output: clean, model-ready data

    Steps, in order: drop rows without a price, convert price to float, keep
    prices strictly between 300 and 10000, then add the engineered columns
    price_code, origin_code (rows without one are dropped), cond_score,
    type_code and body_code. The input frame is not modified.
    '''
    # copy df first so the caller's frame is left untouched
    model_df = df.copy()

    # NOTE(review): this step used to call `expand`, which is not defined
    # anywhere in the notebook and raised NameError. `expand_data` is the only
    # expansion helper here. Its result was never captured, and it cannot be
    # attached to model_df row-for-row because it drops unparseable records -
    # confirm how the parsed fields are meant to be merged back.
    if 'more_info' in model_df.columns:
        expand_data(model_df)

    # remove data with no price; .copy() gives the helpers below, which add
    # columns in place, a frame of their own to write to
    model_df = model_df[model_df.price != 'unknown'].copy()

    # Clean price
    model_df['price'] = model_df['price'].progress_apply(clean_price)

    # limit data to >300 and <10000
    model_df = model_df[(model_df.price > 300) & (model_df.price < 10000)].copy()

    # encode price
    encode_price(model_df)

    # extract origin and drop nulls (was `clean_corigin`, a typo that raised NameError)
    clean_origin(model_df)
    extract_origin(model_df)
    model_df = model_df[model_df['origin_code'].notna()].copy()

    # reduce and encode condition
    reduce_condition(model_df)
    encode_condition(model_df)

    # 'type_code' is one of the model features (see features_in) but was never
    # produced for the new rows. encode_type gives 0 to every value outside its
    # four body styles, so its position relative to clean_type does not matter.
    encode_type(model_df)

    # clean up types
    clean_type(model_df)

    # reduce and encode type
    reduce_types(model_df)

    return model_df
    

In [None]:
# Run the cleaning pipeline on the newly scraped rows loaded from update_data.csv
clean_updated_df = clean_raw_data(update_df)

<b>Combine the old and new data</b>

In [None]:
# Stack the existing master rows and the cleaned new rows. ignore_index=True
# gives the combined frame a fresh 0..n-1 index instead of repeating the index
# labels of the two inputs.
new_model_df = pd.concat([main_df, clean_updated_df], ignore_index=True)

<b>Create the new model and save pkl file</b>

In [None]:
def runModel_compare(df,features_in):
    """
    Fit a linear regression of ``price`` on ``features_in``, report how well it
    does, and save the fitted model to 'guitar.pkl'.

    input:  df          - frame holding ``price``, every column named in
                          features_in, and title / brand / origin / condition
                          for the comparison table
            features_in - list of column names used as predictors
    output: the fitted LinearRegression (previously nothing was returned, so
            callers had no handle on the model)

    Side effects: draws a regression plot of actual vs. predicted prices,
    prints the scores and a sample comparison table, sets pandas' global
    float display format, and writes 'guitar.pkl' to the working directory.
    """
    X = df[features_in]
    Y = df['price']

    # fixed random_state so the 80/20 split is reproducible
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=101)

    model = LinearRegression()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    ## plotting the data
    # x= / y= are passed by keyword: recent seaborn releases no longer accept
    # the data vectors positionally.
    sns.regplot(
        x=y_test,
        y=predictions,
        scatter_kws={"color": "blue"},
        line_kws={"color": "red"},
    ).set(title=f'Model using {features_in}')

    plt.xlabel("Guitar Prices")
    # the y axis shows the model's predictions, not the features
    plt.ylabel("Predicted Prices")

    mse = mean_squared_error(y_test, predictions)
    rmse = math.sqrt(mse)

    print('--- Scores: ---')
    # note: this is the score on the training split
    print(f'Model Score: {model.score(X_train,y_train)}')
    print(f'MSE: {mse} \nRMSE: {rmse}')

    # statsmodels fit on the same training data for the summary statistics
    X_train_Sm = sm.add_constant(X_train)
    ls = sm.GLS(y_train, X_train_Sm).fit()
    print(f'Adj. R-Squared: {ls.rsquared_adj}')
    print(f'R-Squared: {ls.rsquared}')
    print(f'Log-likelihood: {ls.llf}')
    print('')
    print('--- Comparison ---')

    #manually test accuracy
    df_results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

    # This merge used the undefined global `modeling_data`; the frame being
    # modelled is `df`.
    # NOTE(review): joining on the price value pairs a prediction with every
    # listing that shares that price - confirm this is the intended comparison.
    df_comparison = pd.merge(df_results, df, how="inner", left_on="Actual", right_on="price")

    pred_df = df_comparison[['title',
                   'Predicted',
                   'Actual',
                   'brand',
                   'origin',
                   'condition']]
    pd.options.display.float_format = '{:,.2f}'.format
    compare_vals = pred_df[pred_df.Actual<5000].value_counts().head(20)
    print(compare_vals)
    print('--- Regression Plot --- ')

    # save the model to disk
    with open('guitar.pkl', 'wb') as files:
        pickle.dump(model, files)

    return model

In [None]:
## set which features will go into model
# NOTE(review): price_code is computed from price in encode_price, and price is
# also the target that runModel_compare fits - confirm that feeding a
# target-derived column to the model is intended.
features_in = ['type_code','origin_code','cond_score','body_code','price_code']

## create the model with the new data
# runModel_compare also writes the fitted model to 'guitar.pkl' on disk
runModel_compare(new_model_df,features_in)

<b>Replace old model in AWS with new model using new data </b>

In [None]:
#Creating Session With Boto3.
session = boto3.Session(
    aws_access_key_id = AWS_key_id,
    aws_secret_access_key = AWS_secret_key,
    region_name = 'us-east-1'
)

#Creating S3 Resource From the Session.
s3_resource = session.resource('s3')

bucket='dataforguitarapp'
key= 'guitar.pkl'

# The fitted estimator only exists inside runModel_compare, so the previous
# pickle.dumps(model) raised NameError here. runModel_compare saves the model
# to 'guitar.pkl' in the working directory, so upload exactly those bytes.
with open('guitar.pkl', 'rb') as model_file:
    pickle_byte_obj = model_file.read()

s3_resource.Object(bucket,key).put(Body=pickle_byte_obj)

<b>Replace old master data with new master data using combined data </b>

In [None]:
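# TODO: not implemented yet - upload the combined master data (new_model_df)
# back to the 'dataforguitarapp' bucket so it replaces 'full_data.csv'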

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=110fa109-4784-4901-832e-96131bae9e6e' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>