In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import pearsonr
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import chi2_contingency

In [58]:
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",header=None,names=['symboling','normalizedLosses','make','fuelType','aspiration','numofdoors','bodyStyle','driveWheels','engineLocation','wheelBase','length','width','height','curbWeight','engineType','cylinders','engineSize','fuelSystem','bore','stroke','compRatio','horsepower','peak_rpm','city_mpg','highway_mpg','price'],na_values='?')

In [59]:
df.head()

Unnamed: 0,symboling,normalizedLosses,make,fuelType,aspiration,numofdoors,bodyStyle,driveWheels,engineLocation,wheelBase,...,engineSize,fuelSystem,bore,stroke,compRatio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [60]:
df.isnull().sum()

symboling            0
normalizedLosses    41
make                 0
fuelType             0
aspiration           0
numofdoors           2
bodyStyle            0
driveWheels          0
engineLocation       0
wheelBase            0
length               0
width                0
height               0
curbWeight           0
engineType           0
cylinders            0
engineSize           0
fuelSystem           0
bore                 4
stroke               4
compRatio            0
horsepower           2
peak_rpm             2
city_mpg             0
highway_mpg          0
price                4
dtype: int64

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   symboling         205 non-null    int64  
 1   normalizedLosses  164 non-null    float64
 2   make              205 non-null    object 
 3   fuelType          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   numofdoors        203 non-null    object 
 6   bodyStyle         205 non-null    object 
 7   driveWheels       205 non-null    object 
 8   engineLocation    205 non-null    object 
 9   wheelBase         205 non-null    float64
 10  length            205 non-null    float64
 11  width             205 non-null    float64
 12  height            205 non-null    float64
 13  curbWeight        205 non-null    int64  
 14  engineType        205 non-null    object 
 15  cylinders         205 non-null    object 
 16  engineSize        205 non-null    int64  
 1

In [62]:
df.normalizedLosses.fillna(df.normalizedLosses.mean(),inplace=True)

In [63]:
df.numofdoors.fillna(df.numofdoors.mode().max(),inplace=True)

In [64]:
df.bore.fillna(df.bore.mean(),inplace=True)

In [65]:
df.stroke.fillna(df.stroke.mean(),inplace=True)

In [66]:
df.horsepower.fillna(df.horsepower.mean(),inplace=True)

In [67]:
df.peak_rpm.fillna(df.peak_rpm.mean(),inplace=True)

In [68]:
df.price.fillna(df.price.mean(),inplace=True)

In [69]:
##### Let's check once again the dataset contains any null or not:

In [70]:
df.isnull().sum()

symboling           0
normalizedLosses    0
make                0
fuelType            0
aspiration          0
numofdoors          0
bodyStyle           0
driveWheels         0
engineLocation      0
wheelBase           0
length              0
width               0
height              0
curbWeight          0
engineType          0
cylinders           0
engineSize          0
fuelSystem          0
bore                0
stroke              0
compRatio           0
horsepower          0
peak_rpm            0
city_mpg            0
highway_mpg         0
price               0
dtype: int64

In [71]:
pearsonr(df.symboling,df.price)

(-0.08220143736318973, 0.24130433124264047)

In [72]:
pearsonr(df.normalizedLosses,df.price)

(0.13399873429274337, 0.05543057826318259)

In [73]:
pearsonr(df.wheelBase,df.price)

(0.5831681499789547, 4.527625545686765e-20)

In [74]:
pearsonr(df.length,df.price)

(0.6829862954386219, 1.6498873291218535e-29)

In [75]:
pearsonr(df.width,df.price)

(0.7286988175931839, 3.214520483804664e-35)

In [76]:
pearsonr(df.height,df.price)

(0.13438751871051807, 0.05471982048424776)

In [77]:
pearsonr(df.curbWeight,df.price)

(0.820824733354729, 2.8663321070270534e-51)

In [78]:
pearsonr(df.engineSize,df.price)

(0.8617522436859721, 9.669661923634776e-62)

In [79]:
pearsonr(df.bore,df.price)

(0.5323000195387407, 2.1398306512855486e-16)

In [80]:
pearsonr(df.stroke,df.price)

(0.08209529554134327, 0.24191456857746316)

In [81]:
pearsonr(df.compRatio,df.price)

(0.07099036277901556, 0.31178249194905244)

In [82]:
pearsonr(df.horsepower,df.price)

(0.7579169537498177, 1.6076703978130332e-39)

In [83]:
pearsonr(df.peak_rpm,df.price)

(-0.10085406591169085, 0.15019246954232154)

In [84]:
pearsonr(df.city_mpg,df.price)

(-0.6674492651600004, 8.463700497727807e-28)

In [85]:
pearsonr(df.highway_mpg,df.price)

(-0.6905257341183488, 2.23246740583743e-30)

In [86]:
stats.f_oneway(df.price[df.fuelType=='gas'],df.price[df.fuelType=='diesel'])

F_onewayResult(statistic=2.4958589760681935, pvalue=0.11570297526361666)

In [87]:
stats.f_oneway(df.price[df.aspiration=='std'],df.price[df.aspiration=='turbo'])

F_onewayResult(statistic=6.587289855140993, pvalue=0.010991046478632666)

In [88]:
stats.f_oneway(df.price[df.numofdoors=='four'],df.price[df.numofdoors=='two'])

F_onewayResult(statistic=0.35778516571700403, pvalue=0.5504056367624801)

In [89]:
stats.f_oneway(df.price[df.driveWheels=='fwd'],df.price[df.driveWheels=='rwd'],df.price[df.driveWheels=='4wd'])

F_onewayResult(statistic=67.5036668266912, pvalue=3.5392707916047903e-23)

In [90]:
stats.f_oneway(df.price[df.bodyStyle=='convertible'],df.price[df.bodyStyle=='hatchback'],df.price[df.bodyStyle=='sedan'],df.price[df.bodyStyle=='wagon'],df.price[df.bodyStyle=='hardtop'])

F_onewayResult(statistic=9.183927252573389, pvalue=7.844575720929884e-07)

In [91]:
stats.f_oneway(df.price[df.engineLocation=='front'],df.price[df.engineLocation=='rear'])

F_onewayResult(statistic=24.979629190446918, pvalue=1.2486063656822578e-06)

In [92]:
stats.f_oneway(df.price[df.engineType=='ohc'],df.price[df.engineType=='ohcf'],df.price[df.engineType=='ohcv'],df.price[df.engineType=='l'],df.price[df.engineType=='rotor'],df.price[df.engineType=='dohcv'])

F_onewayResult(statistic=8.880041876905345, pvalue=1.3919774978921531e-07)

In [93]:
stats.f_oneway(df.price[df.cylinders=='four'],df.price[df.cylinders=='six'],df.price[df.cylinders=='five'],df.price[df.cylinders=='eight'],df.price[df.cylinders=='two'],df.price[df.cylinders=='three'],df.price[df.cylinders=='twelve'])

F_onewayResult(statistic=45.72705376201727, pvalue=7.149269882584999e-35)

In [94]:
stats.f_oneway(df.price[df.fuelSystem=='mpfi'],df.price[df.fuelSystem=='2bbl'],df.price[df.fuelSystem=='idi'],df.price[df.fuelSystem=='1bbl'],df.price[df.fuelSystem=='spdi'],df.price[df.fuelSystem=='4bbl'],df.price[df.fuelSystem=='mfi'],df.price[df.fuelSystem=='spfi'])

F_onewayResult(statistic=14.797402090222409, pvalue=1.8657347748825717e-15)

In [95]:
df.drop(['symboling','normalizedLosses','make','height','numofdoors','stroke','compRatio','peak_rpm','fuelType'],axis=1,inplace=True)

In [96]:
df.head()

Unnamed: 0,aspiration,bodyStyle,driveWheels,engineLocation,wheelBase,length,width,curbWeight,engineType,cylinders,engineSize,fuelSystem,bore,horsepower,city_mpg,highway_mpg,price
0,std,convertible,rwd,front,88.6,168.8,64.1,2548,dohc,four,130,mpfi,3.47,111.0,21,27,13495.0
1,std,convertible,rwd,front,88.6,168.8,64.1,2548,dohc,four,130,mpfi,3.47,111.0,21,27,16500.0
2,std,hatchback,rwd,front,94.5,171.2,65.5,2823,ohcv,six,152,mpfi,2.68,154.0,19,26,16500.0
3,std,sedan,fwd,front,99.8,176.6,66.2,2337,ohc,four,109,mpfi,3.19,102.0,24,30,13950.0
4,std,sedan,4wd,front,99.4,176.6,66.4,2824,ohc,five,136,mpfi,3.19,115.0,18,22,17450.0


In [97]:
df.shape

(205, 17)

In [98]:
train = df.iloc[:165]

In [99]:
test = df.iloc[165:]

In [100]:
test.head()

Unnamed: 0,aspiration,bodyStyle,driveWheels,engineLocation,wheelBase,length,width,curbWeight,engineType,cylinders,engineSize,fuelSystem,bore,horsepower,city_mpg,highway_mpg,price
165,std,sedan,rwd,front,94.5,168.7,64.0,2265,dohc,four,98,mpfi,3.24,112.0,26,29,9298.0
166,std,hatchback,rwd,front,94.5,168.7,64.0,2300,dohc,four,98,mpfi,3.24,112.0,26,29,9538.0
167,std,hardtop,rwd,front,98.4,176.2,65.6,2540,ohc,four,146,mpfi,3.62,116.0,24,30,8449.0
168,std,hardtop,rwd,front,98.4,176.2,65.6,2536,ohc,four,146,mpfi,3.62,116.0,24,30,9639.0
169,std,hatchback,rwd,front,98.4,176.2,65.6,2551,ohc,four,146,mpfi,3.62,116.0,24,30,9989.0
