In [0]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [0]:
url="https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
autoMobileDataset=pd.read_csv(url)

In [0]:
autoMobileDataset.columns=["symboling","normalized_losses","make","fuel_type","aspiration","num_of_doors","body_style","drive_wheels","engine_location","wheel_base","length","width","height","curb_weight","engine_type","num_of_cylinders","engine_size","fuel_system","bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"]

In [0]:
autoMobileDataset.replace('?',np.nan, inplace=True)

In [5]:
autoMobileDataset.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,,audi,gas,std,two,sedan,fwd,front,99.8,177.3,66.3,53.1,2507,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


In [6]:
autoMobileDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 26 columns):
symboling            204 non-null int64
normalized_losses    164 non-null object
make                 204 non-null object
fuel_type            204 non-null object
aspiration           204 non-null object
num_of_doors         202 non-null object
body_style           204 non-null object
drive_wheels         204 non-null object
engine_location      204 non-null object
wheel_base           204 non-null float64
length               204 non-null float64
width                204 non-null float64
height               204 non-null float64
curb_weight          204 non-null int64
engine_type          204 non-null object
num_of_cylinders     204 non-null object
engine_size          204 non-null int64
fuel_system          204 non-null object
bore                 200 non-null object
stroke               200 non-null object
compression_ratio    204 non-null float64
horsepower           202 non-nul

In [0]:
autoMobileDataset["normalized_losses"]=autoMobileDataset["normalized_losses"].astype(float)
autoMobileDataset["normalized_losses"].fillna(autoMobileDataset["normalized_losses"].mean(),inplace=True)


In [0]:
autoMobileDataset["peak_rpm"]=autoMobileDataset["peak_rpm"].astype(float)
autoMobileDataset["peak_rpm"].fillna(int(autoMobileDataset["peak_rpm"].mean()),inplace=True)


In [0]:
autoMobileDataset["horsepower"]=autoMobileDataset["horsepower"].astype(float)
autoMobileDataset["horsepower"].fillna(int(autoMobileDataset["horsepower"].mean()),inplace=True)
autoMobileDataset["stroke"]=autoMobileDataset["stroke"].astype(float)
autoMobileDataset["stroke"].fillna(int(autoMobileDataset["stroke"].mean()),inplace=True)
autoMobileDataset["bore"]=autoMobileDataset["bore"].astype(float)
autoMobileDataset["bore"].fillna(int(autoMobileDataset["bore"].mean()),inplace=True)
autoMobileDataset["price"]=autoMobileDataset["price"].astype(float)
autoMobileDataset["price"].fillna(int(autoMobileDataset["price"].mean()),inplace=True)

In [10]:
autoMobileDataset.head(2)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
1,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0


In [0]:
autoMobileDataset.dropna(subset=["num_of_doors"],inplace=True)

My target variable is city_mpg


In [0]:
X=autoMobileDataset.dropna(subset=["city_mpg"])
y=autoMobileDataset.iloc[:,-3:-2]

In [0]:
X2=pd.get_dummies(X,columns=["make","fuel_type","engine_location","aspiration","num_of_doors","body_style","drive_wheels","engine_type","num_of_cylinders","fuel_system"])

In [14]:
len(X2.columns)

76

In [0]:
pca = PCA(n_components=24)
pcaX2 = pca.fit_transform(X2)

In [16]:
pca.explained_variance_ratio_

array([9.94963261e-01, 3.80915893e-03, 1.19903730e-03, 1.48305423e-05,
       9.64964188e-06, 2.77821275e-06, 6.08971633e-07, 3.62170655e-07,
       1.04015663e-07, 8.48081812e-08, 4.07299537e-08, 1.64281653e-08,
       1.47328096e-08, 1.12369727e-08, 5.67283501e-09, 4.95366562e-09,
       3.79766911e-09, 3.06656244e-09, 2.56861519e-09, 2.21550492e-09,
       1.80699662e-09, 1.63680270e-09, 1.45805295e-09, 1.37628240e-09])

In [0]:
X_trainPCA,X_testPCA,y_trainPCA,y_testPCA=train_test_split(pcaX2,y,random_state=69)
X_train, X_test, y_train, y_test = train_test_split(X2, y,random_state=69)

In [0]:
regPCA = linear_model.LinearRegression() 
regPCA.fit(X_trainPCA,y_trainPCA)
predictPCA=regPCA.predict(X_testPCA)
reg = linear_model.LinearRegression() 
reg.fit(X_train,y_train)
predict=reg.predict(X_test)


In [19]:
print(f'Mean Squared Error with PCA: {mean_squared_error(y_testPCA,predictPCA)}')
print(f'Simple Mean Squared Error: {mean_squared_error(y_test,predict)}')

Mean Squared Error with PCA: 0.0008342788056003624
Simple Mean Squared Error: 1.571956753075225e-24
