### Imports, Constants, Common Functions

In [None]:
from IPython.core.getipython import get_ipython
from IPython.display import display, Markdown, Latex
import pandas as pd
import json 

# Flip to True to read inputs from the "_dev" sibling of the results directory.
DEV_MODE = False
# Directory the report reads its input files from.
RESULT_DIR = './analysis_results' + ('_dev' if DEV_MODE else '')
# Shared path prefix of every input file; a step prefix below is appended to it.
RESULT_FILE_PREFIX = f'{RESULT_DIR}/module_11_01.'
# Per-step file-name prefixes. Each ends with '.', so a file name follows directly.
STEP01_DATA_UNDERSTANDING = 'step01.data_understanding.'
STEP02_DATA_PREPARATION = 'step02.data_preparation.'
STEP03_MODELING = 'step03.modeling.'
STEP04_EVALUATION = 'step04.evaluation.'

def writeString2File(string2Write, path, print2Screen = True):
    """Write ``str(string2Write)`` to ``path``, optionally printing it first.

    Args:
        string2Write: Value to persist; it is converted with ``str()``.
        path: Destination file path. An existing file is overwritten.
        print2Screen: When true, print the value to stdout before writing.
    """
    if print2Screen:
        print(string2Write)
    # Pin the encoding so the output does not depend on the platform locale
    # (the default used by open() when no encoding is given).
    with open(path, "w", encoding="utf-8") as text_file:
        text_file.write(str(string2Write))

def readFile(path):
    """Return the entire text content of the file at ``path``."""
    with open(path) as handle:
        return handle.read()

def readJson(path):
    """Read the file at ``path`` via ``readFile`` and return it decoded as JSON."""
    return json.loads(readFile(path))

def addMarkdownImage(name, path, asMarkdown=False):
    """Return markup that embeds the image located at ``path``.

    Args:
        name: Alt text; only used when ``asMarkdown`` is true.
        path: Image location, used verbatim as the link/image target.
        asMarkdown: When true, emit Markdown image syntax; otherwise emit an
            HTML ``<img>`` wrapped in a link that opens in a new tab.

    Returns:
        The markup string.
    """
    if not asMarkdown:
        return f'<a href="{path}" target="_blank"><img src="{path}"/></a>'
    return f'![{name}]({path})'

# Accumulator for the whole report; later cells append Markdown/HTML to it.
content = "# Report: What drives the price of a car?\n\n"

def printDataFrameInfo(name, stage):
    """Build the report fragment summarising one stage's DataFrame statistics.

    Reads ``data.info.txt`` and ``data.describe.txt`` for ``stage`` (located at
    ``RESULT_FILE_PREFIX + stage``) and places them side by side in an HTML
    table, with the ``data.distribution.png`` image in a full-width row below.

    Args:
        name: Section title, rendered as a level-3 Markdown heading and passed
            to ``addMarkdownImage`` as the image name.
        stage: Stage file-name prefix, e.g. ``STEP01_DATA_UNDERSTANDING``.

    Returns:
        The Markdown/HTML string for the section.
    """
    prefix = RESULT_FILE_PREFIX + stage
    rawDfInfo = readFile(prefix + 'data.info.txt')
    rawDfDescribe = readFile(prefix + 'data.describe.txt')
    rawDfStatsImage = addMarkdownImage(name, prefix + 'data.distribution.png')

    out = f'### {name}\n\n'
    out += '<table>'
    out += '<tr><th>info()</th><th>describe()</th></tr>'

    # NOTE(review): the file contents are inserted into <pre> without HTML
    # escaping; any '<' or '&' in them is interpreted as markup. Confirm
    # whether the inputs can contain such characters.
    out += '<tr>'
    out += f'<td><pre>{rawDfInfo}</pre></td>'
    out += f'<td><pre>{rawDfDescribe}</pre></td>'
    out += '</tr>'

    # Each row is closed exactly once (the image row used to get two </tr>).
    out += '<tr>'
    out += f'<td colspan="2">\n{rawDfStatsImage}\n</td>'
    out += '</tr>'
    out += '</table>\n\n'

    return out


### Code

In [None]:
# NOTE: Running the analysis notebook interactively will fail for large dataset
# processing or complex model evaluation. Run it in the background instead:
#   jupyter nbconvert --to notebook --execute used_car_price_analysis.template.ipynb --output=used_car_price_analysis.out.ipynb --ExecutePreprocessor.timeout=-1
# Source: https://www.maksimeren.com/post/screen-and-jupyter-a-way-to-run-long-notebooks-headless/
content += ''.join([
    # Link to the executed workbook plus the headless-run workaround.
    '**Code:** [Data Analysis Workbook](./used_car_price_analysis.out.ipynb)\n\n',
    '**NOTE:** *The processing of the juypter notebook take a long time and often disconnects from the session. In order to run it without monitoring it all the time use the wokraround below from command line.*\n\n',
    '```\n',
    'jupyter nbconvert --to notebook --execute used_car_price_analysis.template.ipynb --output=used_car_price_analysis.out.ipynb --ExecutePreprocessor.timeout=-1\n',
    '```\n',
    '<sub>Source: [screen-and-jupyter-a-way-to-run-long-notebooks-headles](https://www.maksimeren.com/post/screen-and-jupyter-a-way-to-run-long-notebooks-headless/)</sub>\n\n',
    # Shortcut for readers who only want the conclusions.
    'Jump to the good stuff: [Recommendations](#Recommendations)\n\n',
])

### Business Understanding

In [None]:
# Business Understanding: goal statement followed by the planned steps.
content += (
    '## Business Understanding\n\n'
    'We are provided with a dataset of used car prices and features about that particular vehicle. Our final goal will be to identify which \'features\' AND what values of those features most contribute to the final price both positively and negatively.\n\n'
    'Because the data has high dimensionality we will need to make use of transformers to get the data ready for use in regularization. Once data is cleaned and prepared we will then try out multiple linear regression models to find the best one. Once done we will use the coefficients to identify how features contribute to price.\n\n'
    'Once we have found these imortant features we will write up actionable guidance for used car business\n\n'
    '**Steps involved:**\n\n'
    '  - Examine the raw data and identify characterisitics of the data e.g. missing values, unique counts, invalid data...\n'
    '  - Preprocess the data to get it ready for modelling by:\n'
    '    - Identifying which features can be ignored and drop those columns/features\n'
    '    - Identify non-ignorable missing features and either:\n'
    '      - Impute missing values per row\n'
    '      - Drop those rows\n'
    '  - Decide what data transforms/normalization are required for numeric and categorical fields based on above decisions\n'
    '  - Use regularization techniques with multiple (L1, L2,...) linear regression models using and find one with the best peformance for predicting prices\n'
    '  - Repeat steps above if necessary to arrive at final \'best\' model which we will measure by using the one with the lowest Mean Square Error (MSE)\n'
    '  - Analyse the most important features \'selected\' by the model based on the coefficients determined by the previous steps\n'
)

### Data Understanding

In [None]:
# Data Understanding: section heading, then the statistics block for the raw data.
content += '## Data Understanding\n\n'
rawDataSection = printDataFrameInfo('Raw Data Statistics', STEP01_DATA_UNDERSTANDING)
content += rawDataSection

# Analyst notes per dataset column. Each entry's 'notes' list becomes one
# bullet list in the field summary table; insertion order is the row order.
# Every note ends with a comma so adjacent literals cannot silently merge
# into one string.
fieldNotes = {
    'id': {
        'notes': [
            'Not useful for predictions.',
        ],
    },
    'VIN': {
        'notes': [
            'Not useful for predictions.',
        ],
    },
    'price': {
        'notes': [
            'Target Field.',
            'Need to deal with outliers.',
        ],
    },
    'odometer': {
        'notes': [
            'Has an effect on price typically negative as mileage goes up.',
            'Need to deal with outliers.',
            'There are only a small percentage of values missing.',
        ],
    },
    'manufacturer': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here and no easy way to determine them.',
            'There are only a small percentage of values missing.',
        ],
    },
    'model': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here and no easy way to determine them.',
            'There are only a small percentage of values missing.',
            'Free text field and there could be spelling mistakes or variations in order of words that aren\'t easy to normalize.',
        ],
    },
    'type': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here.',
            'Can use manufacturer, model and year to fill in missing values',
        ],
    },
    'drive': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here.',
            'Can use manufacturer, model and year to fill in missing values',
        ],
    },
    'transmission': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here.',
            'There are only a small percentage of values missing.',
        ],
    },
    'size': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here.',
            'Can use manufacturer, model and year to fill in missing values',
        ],
    },
    'cylinders': {
        'notes': [
            'Has an effect on price.',
            'There are lots of empty values here.',
            'Can use manufacturer, model and year to fill in missing values',
        ],
    },
    'fuel': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here.',
            'There are only a small percentage of values missing.',
        ],
    },
    'paint_color': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here and no easy way to determine them.',
        ],
    },
    'condition': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here and no easy way to determine them.',
        ],
    },
    'title_status': {
        'notes': [
            'Has an effect on price.',
            'There are empty values here.',
            'There are only a small percentage of values missing.',
        ],
    },
    'year': {
        'notes': [
            'Has an effect on price typically positive as value goes up since it\'s a newer car.',
            'Need to deal with outliers.',
        ],
    },
    'state': {
        'notes': [
            'Has an effect on price.',
            'Not really something dealer can control but can extract some useful information from this for other business decision making.',
        ],
    },
    'region': {
        'notes': [
            'Has an effect on price.',
            'Not really something dealer can control but can extract some useful information from this for other business decision making.',
        ],
    },
}


# Short narrative introducing the per-field table that follows.
content += ''.join((
    '### Analysis\n\n',
    " There are a lot of categorical columns that will need to be encoded.",
    " There are also a lot of missing values for fields that will likely be important to the model.",
    " We'll have to impute where we can and drop where it won't affect the size of the dataset too much.\n\n",
))


# Field summary table: one row per entry in fieldNotes, combining the column
# statistics stored in the data report workbook with the analyst notes.
dataReportDf = pd.read_excel(RESULT_FILE_PREFIX + STEP02_DATA_PREPARATION + 'data.frame.xlsx')
headers = ['Field', 'Type', 'Missing Value #', 'Missing Value %', 'Unique Value #', 'Notes']
content += '<table>\n'
content += '<tr>\n'
content += '<th></th>'  # unlabeled column for the row number
content += ''.join(f'<th>{header}</th>' for header in headers)
content += '</tr>\n'

# `stage` and `f` are resolved by name from inside the query string
# (pandas' @variable syntax), so these names must stay as they are.
stage = 'raw'
for i, (f, fieldNode) in enumerate(fieldNotes.items(), start=1):
    matches = dataReportDf.query('stage == @stage and column == @f')
    if matches.empty:
        # Name the offending field instead of failing with a bare IndexError.
        raise ValueError(f'No "{stage}" stage row for column "{f}" in the data report')
    fieldDf = matches.iloc[0]
    dataType = fieldDf['data_type']
    missingValueCount = fieldDf['na_value_count']
    missingValuePct = fieldDf['na_value_pct']
    uniqueValueCount = fieldDf['unq_value_count']
    processing = '</li>\n<li>'.join(fieldNode['notes'])
    content += '<tr>\n'
    content += f'<td>{i}</td>'
    content += f'<td>{f}</td>'
    content += f'<td>{dataType}</td>'
    content += f'<td>{missingValueCount}</td>'
    content += f'<td>{missingValuePct}</td>'
    content += f'<td>{uniqueValueCount}</td>'
    # Close the final <li> too, so every list item is balanced.
    content += f'<td><ul><li>{processing}</li></ul></td>'
    content += '</tr>\n'
content += '</table>\n\n'

### Data Preparation

In [None]:
# Data Preparation: turn the data-preparation request (a JSON list of
# operations) into human-readable descriptions, one list per operation.
content += '## Data Preparation\n\n'
content += '### Cleanup Approach\n\n'
# NOTE(review): STEP02_DATA_PREPARATION already ends with '.', so this path
# contains "..request.json" — confirm that is the real file name.
dataPrepPipeline = readJson(RESULT_FILE_PREFIX + STEP02_DATA_PREPARATION + '.request.json')
operations = dataPrepPipeline['operations']
fieldProcessingPipeline = []
for node in operations:
    # The list is registered first and filled below, so each operation keeps
    # its position even when its config is empty.
    actions = []
    fieldProcessingPipeline.append(actions)
    operationName = node['operation']
    configNode = node['config']
    for f in configNode:
        fieldConfigNode = configNode[f]
        if operationName == 'drop':
            displayAction = f'Drop the {f} feature column'
        elif operationName == 'dropna':
            displayAction = f'Drop rows where {f} is empty'
        elif operationName == 'fillna':
            replaceValue = fieldConfigNode['value']
            displayAction = f'Fill rows where {f} is empty with "{replaceValue}"'
        elif operationName == 'queryFilter':
            query = fieldConfigNode['query']
            displayAction = f'Drop rows meeting the critera "{query}"'
        elif operationName == 'iqr':
            # Optional per-field overrides are given as percentages.
            q3Threshold = 0.75
            q1Threshold = 0.25
            if 'q3%' in fieldConfigNode:
                q3Threshold = float(fieldConfigNode['q3%']) / 100
            if 'q1%' in fieldConfigNode:
                q1Threshold = float(fieldConfigNode['q1%']) / 100
            displayAction = f'Drop rows not meeting criteria Q1[{q1Threshold}] <= {f} <= Q3[{q3Threshold}]'
        elif operationName == 'toLowerCase':
            displayAction = f'Convert {f} to lower case values'
            targetField = fieldConfigNode.get('field', f)
            if targetField != f:
                displayAction += f' and store value in new field {targetField}'
            # The flag is read from the field's own config (the same mapping the
            # value comes from), not from the operation node.
            if fieldConfigNode.get('removeSpaces') is True:
                displayAction += ' and remove all spaces'
        elif operationName == 'fillUsingMode':
            lookupFields = ', '.join(fieldConfigNode['lookupFields'])
            # NOTE(review): fillFields is built but not shown in the message,
            # which names {f} as the target — confirm which one is intended.
            fillFields = ', '.join(fieldConfigNode['fillFields'])
            displayAction = f'Using the fields {lookupFields} find teh mode() for those fields in the dataset and assign to {f}'
        else:
            # Report the real operation name (an undefined variable was used here).
            raise ValueError(f"Unsupported operation: {operationName}")

        actions.append(displayAction)

# Processing steps table: one row per pipeline operation, listing its actions
# as bullets.
content += '<table>\n'
content += '<tr>\n'
content += '<th>Step</th>'
content += '<th>Processing</th>'
content += '</tr>\n'
for i, value in enumerate(fieldProcessingPipeline, start=1):
    processing = '</li>\n<li>'.join(value)
    content += '<tr>\n'
    content += f'<td>Step{i}</td>'
    # Close the final <li> too, so every list item is balanced.
    content += f'<td><ul><li>{processing}</li></ul></td>'
    content += '</tr>\n'
content += '</table>\n\n'

# Charts showing how the data changes across the processing steps, as
# (image name, file suffix) pairs. The last entry was labelled
# 'Missing Value %' although it points at the unique-values chart.
shapeCharts = [
    ('Row Counts', 'row_count.png'),
    ('Missing Value Count', 'missing_count.png'),
    ('Missing Value %', 'missing_percentage.png'),
    ('Unique Value Count', 'unique_values.png'),
]

content += '### Data Shape vs Processing Steps\n\n'
content += '<table>\n'
for chartName, chartFile in shapeCharts:
    img = addMarkdownImage(chartName, RESULT_FILE_PREFIX + STEP02_DATA_PREPARATION + chartFile)
    content += f'<tr><td>{img}</td></tr>\n'
content += '</table>\n\n'


# Statistics of the dataset after preparation, in the same layout as the raw data.
content += printDataFrameInfo('Prepared Data Statistics', STEP02_DATA_PREPARATION)

### Modeling

In [None]:
# Modeling: pick the best row of the model report and describe the comparison.
content += '## Modeling\n\n'
modelReportDf = pd.read_excel(RESULT_FILE_PREFIX + STEP03_MODELING + 'model_report.data.frame.xlsx')

# Models are ranked by these metrics; True means ascending (lower is better).
useMetrics = ['Test MSE']
useMetricsAscending = [True]
rankedModels = modelReportDf.sort_values(useMetrics, ascending=useMetricsAscending)
bestModelDf = rankedModels.iloc[0]
bestModel = bestModelDf['Model']
bestModelMetrics = [f'**{k}={bestModelDf[k]}**' for k in useMetrics]
bestModelStats = ' and '.join(bestModelMetrics)
categoricalFeatures = bestModelDf['Categorical Features']
numericalFeatures = bestModelDf['Numerical Features']
modelsTried = ', '.join(modelReportDf['Model'].values)
modelPerformanceImage = addMarkdownImage('Model Performance', RESULT_FILE_PREFIX + STEP03_MODELING + 'performance.png')

content += ''.join([
    '### Model Analysis\n\n',
    'Using the following features \n\n',
    f'- Categorical={categoricalFeatures} \n\n',
    f'- Numerical={numericalFeatures} \n\n',
    f'we tried several regression models including **{modelsTried}** \n\n',
    f'{modelPerformanceImage} \n\n',
    f'We have determined that the **best model** is **{bestModel}**',
    f' based on {bestModelStats}. We chose Test MSE because while it is more sensitive to outliers we\'ve removed the outliers using IQR filtering. Had we not done this we would have used R2\n\n',
])



### Evaluation

In [None]:
# Evaluation: coefficient chart followed by the written interpretation.
img = addMarkdownImage('Feature Importance', RESULT_FILE_PREFIX + STEP04_EVALUATION + 'coefficient.png')
content += ''.join([
    '## Evaluation\n\n',
    '### Feature Results\n\n',
    'We will now show the importances of all the features across models\n\n',
    '<table>\n',
    f'<tr><td>{img}</td></tr>\n',
    '</table>\n\n',
])

content += (
    '### Feature Analysis\n\n'
    'We can see from this that the models generally agree on what one would expect:\n'
    '- Model also contributes positively. This makes sense as people prefer some cars over others, but it was hard to get exact models since the cardinality is so high. This can be inferred from combination of other features however\n'
    '- As year goes up price goes up. This makes sense as you are getting a newer car\n'
    '- As odometer goes up price goes down. This makes sense as you are getting a car with a lot more miles, wear and tear\n'
    '- Interestingly fuel, cylinders, region and drive also feature prominently. Cylinders made sense because you pay for more horsepower. But region has heavy influence\n'
    '- Looking at other features we can see prefernces for types (truck+pickups > sedan) and drive (4wd > fwd)\n'
)


### Recommendations

In [None]:
# Recommendations, part 1: features with positive coefficients.
content += '## Recommendations\n\n'

content += '### Specific Features People Value\n\n'
content += '<table>\n'
img = addMarkdownImage('Feature Importance', RESULT_FILE_PREFIX + STEP04_EVALUATION + 'pos.coeff.png')
content += f'<tr><td>{img}</td></tr>\n'
content += '</table>\n\n'

content += 'Do Prioritize:\n\n'
content += '- Manufacturers=Toyota, Honda, Lexus, Tesla for the commonn brands\n'
content += '- Type=Pickups, Convertibles, Coupes and Trucks\n'
content += '- Size=Full Size, Mid Size\n'
content += '- Drive=4WD\n'
content += '- Cylinders=8\n'
content += '- Transmission=Manual\n'
content += '- Fuel=Diesel\n'
content += '- Title=Clean\n\n'
content += 'This means move these to the front of your lot and they may benefit from a markup or marketing\n\n'


content += 'If you are thinking of moving inventory the cars earn more in:\n\n'
# A blank line must follow the bullet; with a single newline Markdown treats
# the next sentence as a continuation of the list item.
content += '- States=ak, mt, wa, co, ca...\n\n'
content += 'Perhaps you can move cars within these positive feature regions to maximize sale price\n\n'

# Recommendations, part 2: features with negative coefficients.
content += '### Specific Features People DO NOT Value\n\n'
content += '<table>\n'
img = addMarkdownImage('Feature Importance', RESULT_FILE_PREFIX + STEP04_EVALUATION + 'neg.coeff.png')
content += f'<tr><td>{img}</td></tr>\n'
content += '</table>\n\n'

content += 'Do NOT Prioritize:\n\n'
content += '- Manufacturers=Dodge, Kia, Nissan, Mitsubishi,  for the commonn brands\n'
content += '- Type=Sedan, Hatchback, SUV, Wagon\n'
content += '- Size=Compact, Sub-Compact\n'
content += '- Drive=FWD\n'
content += '- Cylinders=4 and lower\n\n'
content += 'This means move these to the back of your lot it may be worthwhile getting rid of these quickly from your lot\n\n'

content += 'If you are thinking of moving inventory the cars earn less in:\n\n'
# A blank line must follow the bullet; with a single newline Markdown treats
# the next sentence as a continuation of the list item.
content += '- States=me, fl, ny, nh, il, ....\n\n'
content += 'Perhaps you can move cars with negative features to better performing regions to benefit from the markup\n\n'


content += 'Using the charts above to make decisions about what types of features about the vehicle to prioritize in your inventory\n\n'


### Output

In [None]:
# Render the assembled report in the notebook, then save the same text as the
# README without echoing the Markdown source to stdout.
reportMarkdown = Markdown(content)
display(reportMarkdown)
writeString2File(content, './README.md', print2Screen=False)