## Predictive Modeling: Using regression models to predict the price of used cars.
This project aims to show the process of creating Machine Learning models based on Simple, Multiple and Polynomial Regressions.
In this project, R2 is used to evaluate the performance of the models. A predictor is kept only when the p-value of its Pearson coefficient is <= 0.05, i.e. when the null hypothesis of no linear correlation with the price is rejected.

In [1]:
# HIDDEN
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import IPython.display

In [2]:
# HIDDEN
# ETL and data cleaning
def ETL(pathfile):
    """Load the used-car CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Read ``pathfile`` with ``pandas.read_csv``.
      2. Add ``car_name`` as ``manufacturer_name + '-' + model_name``.
      3. Keep only rows with ``price_usd > 500`` and ``odometer_value > 1000``.
      4. Keep only cars whose ``car_name`` occurs more than 100 times among
         the rows that survived step 3.
      5. Drop the columns the analysis does not use.

    Parameters
    ----------
    pathfile
        Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns
    -------
    pandas.DataFrame
        The filtered rows without the dropped columns.

    Raises
    ------
    KeyError
        If one of the columns used or dropped here is missing from the CSV.
    """
    unused_columns = [
        'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
        'feature_6', 'feature_7', 'feature_8', 'feature_9',
        'duration_listed', 'engine_has_gas', 'engine_type', 'has_warranty',
        'is_exchangeable', 'location_region', 'number_of_photos',
        'body_type', 'up_counter', 'feature_0',
        'manufacturer_name', 'model_name',
    ]

    df = pd.read_csv(pathfile)
    df['car_name'] = df['manufacturer_name'] + '-' + df['model_name']

    # Discard listings whose price or odometer reading is too low.
    df = df[(df['price_usd'] > 500) & (df['odometer_value'] > 1000)]

    # One pass over the column instead of re-filtering the whole frame once
    # per distinct car. Counting happens after the price/odometer filter.
    occurrences = df['car_name'].value_counts()
    frequent_cars = occurrences.index[occurrences > 100]
    df = df[df['car_name'].isin(frequent_cars)]

    return df.drop(columns=unused_columns)

# Preparing data to be analysed and plotted
def PrepData(df,carname):
    """Build the modelling frame for a single car.

    Rows are restricted to ``car_name == carname``, ``price_usd`` is moved to
    the first column, columns holding one constant value are removed (e.g. a
    car only sold with a 2.0L engine), and the remaining object-dtype columns
    are one-hot encoded with ``pandas.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``car_name`` and ``price_usd``.
    carname
        Value of ``car_name`` to select.

    Returns
    -------
    tuple
        ``(mergeddf, targetcol, dfobject)`` where ``mergeddf`` holds the
        non-object columns joined by index with the dummy columns,
        ``targetcol`` is ``mergeddf['price_usd']`` and ``dfobject`` holds the
        object-dtype columns that survived the constant-column pruning.
    """
    df = df[df['car_name'] == carname]

    # Move the target to position 0.
    target = df['price_usd']
    df = df.drop(columns=['price_usd'])
    df.insert(0, 'price_usd', target)

    for column in df.columns:
        # The target is returned to the caller, so it must never be pruned,
        # even when every listing of this car has the same price.
        if column == 'price_usd':
            continue
        values = df[column].values
        if (values == values[0]).all():
            df = df.drop(columns=column)

    dfobject = df.select_dtypes(include='object')
    numeric = df.select_dtypes(exclude='object')
    dummiesdf = pd.get_dummies(dfobject)
    mergeddf = pd.merge(numeric, dummiesdf, left_index=True, right_index=True)
    targetcol = mergeddf['price_usd']
    return mergeddf, targetcol, dfobject

# Boxplot of the categorical variables within the car data
# TODO: show counts on the plot; show car colors in a better way.
def PlotCategoricalVariables(BPdfobj,targetcol):
    """Draw one price boxplot per categorical column, side by side.

    Parameters
    ----------
    BPdfobj : pandas.DataFrame
        Categorical columns; each one becomes the x axis of one subplot.
    targetcol
        Values plotted on the shared y axis.

    Returns
    -------
    None
        The figure is shown and then closed. Nothing is drawn when
        ``BPdfobj`` has no columns.
    """
    columns = list(BPdfobj)
    if not columns:
        # plt.subplots rejects ncols=0, and there is nothing to plot anyway.
        return

    # squeeze=False always yields a 2-D array of axes, so a single column
    # does not come back as a bare (non-indexable) Axes object.
    _, axs = plt.subplots(ncols=len(columns), figsize=(20,4), sharey=True, squeeze=False)
    for ax, column in zip(axs[0], columns):
        box = sns.boxplot(x=column, y=targetcol, data=BPdfobj, ax=ax)
        box.tick_params(axis='x', rotation=45)
    plt.show()
    plt.close()

# Simple Linear Regression model and plot
def SLRModelandPlot(df,targetcol):
    """Fit and plot a one-variable linear regression of the target.

    The predictor is the single variable returned by ``ListCoef(df, 'n')``.
    A regression plot and a residual plot are drawn side by side and the
    fitted equation is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the predictor columns.
    targetcol
        Target values to regress on.

    Returns
    -------
    tuple
        ``(x, m)``: the 2-D predictor array used for fitting and the fitted
        ``LinearRegression`` model.

    Raises
    ------
    IndexError
        If ``ListCoef`` returns no variable.
    """
    coefx = ListCoef(df,'n')
    x = df[[coefx[0]]].values
    m = LinearRegression().fit(x,targetcol)

    _, axs = plt.subplots(ncols=2, figsize=(20,4))
    ax0 = sns.regplot(x=x,y=targetcol,data=df,ax=axs[0])
    sns.residplot(x=x,y=targetcol,data=df,ax=axs[1])
    ax0.set_ylim(0,)
    ax0.set_xlim(x.min(),)
    plt.show()
    plt.close()

    # coef_ is a one-element array; float() on an array with ndim > 0 is
    # deprecated in recent NumPy, so extract the scalar explicitly.
    slope = m.coef_.item()
    print('Simple Linear Regression Model Equation:\n',slope,'*',coefx,'+','(',m.intercept_,')')
    return x,m

# Multiple Linear Regression model and plot
# TODO: adapt to better show the model equation.
def MLRModelandPlot(df,targetcol):
    """Fit a linear regression on every variable returned by ``ListCoef(df)``.

    Draws the density of the actual target against the density of the
    model's predictions, prints one ``( name * coefficient ) +`` line per
    predictor followed by the intercept, and returns ``(x, m)``: the
    predictor array used for fitting and the fitted model.
    """
    predictors = ListCoef(df)
    features = df[predictors].values

    model = LinearRegression()
    model.fit(features, targetcol)
    predicted = model.predict(features)

    plt.figure(figsize=(20,4))
    density_ax = sns.kdeplot(targetcol, color="r", label="Actual Value")
    sns.kdeplot(predicted, color="b", label="Predicted Values", ax=density_ax)
    plt.title('Actual (red) vs Predicted (blue) Price')
    plt.xlabel('Price')
    plt.ylabel('Proportion of Cars')
    plt.xlim(0,)
    plt.show()
    plt.close()

    print("Multiple Linear Regression Equation:")
    for name, weight in zip(predictors, model.coef_):
        print("(",name,'*',weight,')','+')
    print('(',model.intercept_,')')

    return features, model

# Polynomial Regression model and plot
# TODO: adapt to get the best degree.
def PRModelandPlot(df,targetcol):
    """Fit degree 2, 3 and 4 polynomials of the target on one predictor.

    The predictor is the single variable returned by ``ListCoef(df, 'n')``.
    For each degree the data points and the fitted curve are plotted and the
    polynomial is printed.

    Returns ``(x, p2, p3, p4)``: the predictor values and the three fitted
    ``numpy.poly1d`` objects in increasing degree.
    """
    best = ListCoef(df,'n')
    x = df[best[0]].values

    # The evaluation grid depends only on x, so it is shared by all degrees.
    grid = np.linspace(x.min(), x.max())

    fitted = []
    for degree in (2, 3, 4):
        poly = np.poly1d(np.polyfit(x, targetcol, degree))
        fitted.append(poly)

        plt.figure(figsize=(20,4))
        plt.plot(x, targetcol, '.', grid, poly(grid), '-')
        plt.gca().set_facecolor((0.898, 0.898, 0.898))
        plt.show()
        plt.close()
        print('Polynomial Regression Model Equation:\n',poly,'\nWhere x is:',best)

    quadratic, cubic, quartic = fitted
    return x, quadratic, cubic, quartic

# R2 calculation
def R2(realY,predictY):
    """Return the coefficient of determination of ``predictY`` against ``realY``."""
    score = r2_score(y_true=realY, y_pred=predictY)
    return score

# RMSE calculation
def RMSE(realY,predictY):
    """Return the root mean squared error of ``predictY`` against ``realY``.

    The square root is taken explicitly rather than through
    ``mean_squared_error(..., squared=False)``, because the ``squared``
    argument is deprecated and later removed in newer scikit-learn releases.
    The per-output errors are rooted first and then averaged, which matches
    what ``squared=False`` computed.
    """
    per_output_mse = mean_squared_error(realY, predictY, multioutput='raw_values')
    return np.average(np.sqrt(per_output_mse))

# Comparison of the models: higher R2 is better, lower RMSE is better.
def ModelComparison(x1,m1,x2,m2,x3,m31,m32,m33,targetcol):
    """Print R2 and RMSE for the five fitted models and name the best ones.

    Parameters
    ----------
    x1, m1
        Inputs and model of the simple linear regression (``m1.predict``).
    x2, m2
        Inputs and model of the multiple linear regression (``m2.predict``).
    x3, m31, m32, m33
        Inputs and the degree 2, 3 and 4 polynomial callables.
    targetcol
        Actual target values the predictions are scored against.

    Notes
    -----
    The best model is chosen by position rather than through a dict keyed by
    score. With a score-keyed dict, models with equal scores overwrite each
    other and the last one is reported. Here the first model in the list
    (the simplest) wins a tie.
    """
    print('\n-------------------------------------------------------------------\n')
    md = ['Simple Linear Regression','Multiple Linear Regression','Polynomial Regression 2 degrees','Polynomial Regression 3 degrees','Polynomial Regression 4 degrees']
    preds = [m1.predict(x1),m2.predict(x2),m31(x3),m32(x3),m33(x3)]

    r2s = []
    rmses = []
    for name, pred in zip(md, preds):
        r2 = R2(targetcol,pred)
        rmse = RMSE(targetcol,pred)
        r2s.append(r2)
        rmses.append(rmse)
        print(name,'\nR2:',r2,'\nRMSE:',rmse,'\n')

    positions = range(len(md))
    best_by_r2 = max(positions, key=r2s.__getitem__)
    best_by_rmse = min(positions, key=rmses.__getitem__)
    print('Best model by R2 is:',md[best_by_r2],'with R2:',r2s[best_by_r2])
    print('Best model by RMSE is:',md[best_by_rmse],'with RMSE:',rmses[best_by_rmse])

# Analysis function
def PlotandAnalysis(df,targetcol,BPdfobj):
    """Run the full analysis for one car.

    Plots the categorical variables, fits and plots the simple, multiple and
    polynomial regressions in that order, then prints their comparison.
    """
    PlotCategoricalVariables(BPdfobj,targetcol)

    simple_x, simple_model = SLRModelandPlot(df,targetcol)
    multi_x, multi_model = MLRModelandPlot(df,targetcol)
    poly_x, *poly_models = PRModelandPlot(df,targetcol)

    ModelComparison(
        simple_x, simple_model,
        multi_x, multi_model,
        poly_x, *poly_models,
        targetcol,
    )

# Pearson coefficient calculation
def AbsCoefPearson(df):
    """Rank the predictors of ``price_usd`` by absolute Pearson correlation.

    For every column other than ``price_usd`` the Pearson coefficient with
    ``price_usd`` and its p-value are computed (rounded to 6 and 3 decimals).
    Coefficients are made absolute, variables with an already-seen absolute
    coefficient are dropped (first occurrence kept), and only variables with
    a p-value <= 0.05 are returned.

    The target is excluded by name, so the result does not depend on
    ``price_usd`` being the first column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``price_usd`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``IndependentVar``, ``PearsonCoef`` (absolute value) and
        ``P-Value``, in the column order of ``df``, with a fresh 0..n-1 index.
    """
    target = df['price_usd']

    names = []
    coeflist = []
    pvaluelist = []
    for col in df.columns:
        if col == 'price_usd':
            continue
        coef, p = stats.pearsonr(df[col], target)
        names.append(col)
        coeflist.append(round(coef,6))
        pvaluelist.append(round(p,3))

    # Explicit float dtype keeps .abs() and the comparison well-defined even
    # when there are no predictors at all.
    table = pd.DataFrame({
        'IndependentVar': names,
        'PearsonCoef': pd.Series(coeflist, dtype=float).abs(),
        'P-Value': pd.Series(pvaluelist, dtype=float),
    })
    table = table.drop_duplicates(subset=['PearsonCoef'],keep='first')
    return table[table['P-Value'] <= 0.05].reset_index(drop=True)  # only variables with p <= 0.05

def ListCoef(df,todos='s'):
    """Return the names of the significant predictors of ``price_usd``.

    With ``todos == 's'`` (the default) every variable reported by
    ``AbsCoefPearson`` is returned. With any other value only the variable
    with the largest absolute Pearson coefficient is returned, still as a
    list.
    """
    ranked = AbsCoefPearson(df)
    if todos == 's':
        return list(ranked['IndependentVar'])

    strongest = ranked['PearsonCoef'].nlargest(1,keep='first').index
    return list(ranked.loc[strongest]['IndependentVar'])

def UsedCarSalesPriceAnalysis(pathfile):
    """Show an interactive per-car price analysis for the CSV at ``pathfile``.

    The data is loaded once through ``ETL``. A dropdown lists the available
    ``car_name`` values in sorted order; whenever its value changes, the
    output area is cleared and ``PlotandAnalysis`` is run for the newly
    selected car.
    """
    data = ETL(pathfile)
    dropdown_manufacturer = widgets.Dropdown(description='Select a Car:',options = sorted(set(data['car_name'])))
    output = widgets.Output()

    def output_by_manufacturer(df,carname):
        with output:
            IPython.display.clear_output(wait=True)
            # PlotandAnalysis renders its own plots and prints; it returns
            # None, so passing its result to display() would only add a
            # stray "None" to the output.
            PlotandAnalysis(*PrepData(df,carname))

    def dropdown_state_eventhandler(change):
        output_by_manufacturer(data,change.new)

    dropdown_manufacturer.observe(dropdown_state_eventhandler, names='value')
    input_widgets = widgets.HBox([dropdown_manufacturer])
    IPython.display.display(input_widgets,output)

In [3]:
# HIDDEN
# Build and display the interactive analysis for the data in 'cars.csv'.
UsedCarSalesPriceAnalysis('cars.csv')

HBox(children=(Dropdown(description='Select a Car:', options=('Audi-100', 'Audi-80', 'Audi-A4', 'Audi-A6', 'Au…

Output()