# Problem Statement

### Our goal is to predict the co2 emission on the basis of various parameter of vehicles

# Importing Libraries

In [12]:
import pandas as pd  
import numpy as np 

#Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer

#splitting
from sklearn.model_selection import train_test_split

#Feature Selection
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

#Model Evaluation
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score



# Data Gathering

In [2]:
def Data_loader(path):
    data=pd.read_csv(path)
    return data

In [3]:
df=Data_loader('data/co2.csv')
df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


# Define X and Y

In [5]:
x=df.drop(['CO2 Emissions(g/km)'],axis=1)
y=df['CO2 Emissions(g/km)']

# Preprocessing 

In [14]:
cat=x.select_dtypes(include='object').columns
con=x.select_dtypes(exclude='object').columns

num_pipe=Pipeline(steps=[('Imputer',SimpleImputer(strategy='median')),('scalar',MinMaxScaler())])
cat_pipe=Pipeline(steps=[('Imputer',SimpleImputer(strategy='most_frequent')),('encoder',OrdinalEncoder())])

preprocessor=ColumnTransformer([('num_pipe',num_pipe,con),('cat_pipe',cat_pipe,cat)])

x1=pd.DataFrame(preprocessor.fit_transform(x),columns=preprocessor.get_feature_names_out())

x1

Unnamed: 0,num_pipe__Engine Size(L),num_pipe__Cylinders,num_pipe__Fuel Consumption City (L/100 km),num_pipe__Fuel Consumption Hwy (L/100 km),num_pipe__Fuel Consumption Comb (L/100 km),num_pipe__Fuel Consumption Comb (mpg),cat_pipe__Make,cat_pipe__Model,cat_pipe__Vehicle Class,cat_pipe__Transmission,cat_pipe__Fuel Type
0,0.146667,0.076923,0.215909,0.162651,0.200000,0.379310,0.0,1057.0,0.0,14.0,4.0
1,0.200000,0.076923,0.265152,0.222892,0.250000,0.310345,0.0,1057.0,0.0,25.0,4.0
2,0.080000,0.076923,0.068182,0.108434,0.081818,0.637931,0.0,1058.0,0.0,22.0,4.0
3,0.346667,0.230769,0.321970,0.307229,0.318182,0.241379,0.0,1233.0,11.0,15.0,4.0
4,0.346667,0.230769,0.299242,0.283133,0.295455,0.275862,0.0,1499.0,11.0,15.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
7380,0.146667,0.076923,0.246212,0.222892,0.240909,0.327586,41.0,1951.0,11.0,17.0,4.0
7381,0.146667,0.076923,0.265152,0.259036,0.263636,0.310345,41.0,1957.0,11.0,17.0,4.0
7382,0.146667,0.076923,0.284091,0.277108,0.281818,0.275862,41.0,1960.0,11.0,17.0,4.0
7383,0.146667,0.076923,0.265152,0.259036,0.263636,0.310345,41.0,1968.0,12.0,17.0,4.0


# Feature Selection 

In [15]:
lr=LinearRegression()
sfs=SequentialFeatureSelector(lr)
x2=pd.DataFrame(sfs.fit_transform(x1,y),columns=sfs.get_feature_names_out())
x2

Unnamed: 0,num_pipe__Engine Size(L),num_pipe__Fuel Consumption City (L/100 km),num_pipe__Fuel Consumption Hwy (L/100 km),num_pipe__Fuel Consumption Comb (mpg),cat_pipe__Fuel Type
0,0.146667,0.215909,0.162651,0.379310,4.0
1,0.200000,0.265152,0.222892,0.310345,4.0
2,0.080000,0.068182,0.108434,0.637931,4.0
3,0.346667,0.321970,0.307229,0.241379,4.0
4,0.346667,0.299242,0.283133,0.275862,4.0
...,...,...,...,...,...
7380,0.146667,0.246212,0.222892,0.327586,4.0
7381,0.146667,0.265152,0.259036,0.310345,4.0
7382,0.146667,0.284091,0.277108,0.275862,4.0
7383,0.146667,0.265152,0.259036,0.310345,4.0


# Train Test Split

In [16]:
x_train,x_test,y_train,y_test=train_test_split(x2,y,test_size=0.2,random_state=23)

# Model 

In [17]:
linear_reg=LinearRegression()
linear_reg.fit(x_train,y_train)

## Training Data Evaluation 

In [18]:
y_pred_train=linear_reg.predict(x_train)

In [19]:
def Regressions_evaluate(actual_value,predicted_value):
    mse=round(mean_squared_error(actual_value,predicted_value),3)
    rmse=round(np.sqrt(mse),3)
    mae=round(mean_absolute_error(actual_value,predicted_value),3)
    r2=round(r2_score(actual_value,predicted_value),3)

    print('Regression Error Analysis')
    print('*'*50)
    print('MSE:',mse)
    print('MAE:',mae)
    print('RMSE:',rmse)
    print('R2 Score:',r2)
    print('*'*50)

In [20]:
Regressions_evaluate(y_train,y_pred_train)

Regression Error Analysis
**************************************************
MSE: 314.359
MAE: 11.381
RMSE: 17.73
R2 Score: 0.908
**************************************************


## Testing Data Evaluation

In [21]:
y_pred_test=linear_reg.predict(x_test)

In [22]:
Regressions_evaluate(y_test,y_pred_test)

Regression Error Analysis
**************************************************
MSE: 289.658
MAE: 11.149
RMSE: 17.019
R2 Score: 0.916
**************************************************


# Testing Data Prediction 

In [26]:
def Predict_emission(path,pipe,fs,model):
    data=pd.read_csv(path)
    x=pd.DataFrame(pipe.transform(data),columns=pipe.get_feature_names_out())
    x1=pd.DataFrame(fs.transform(x),columns=fs.get_feature_names_out())
    pred=model.predict(x1)
    return data,pred

In [27]:
df2,predictions=Predict_emission('data/co2.csv',preprocessor,sfs,linear_reg)

In [28]:
predictions

array([206.41322889, 231.4960662 , 143.45075452, ..., 238.79239356,
       230.22239373, 243.62476644], shape=(7385,))

In [29]:
df2

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...
7380,VOLVO,XC40 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,10.7,7.7,9.4,30,219
7381,VOLVO,XC60 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,11.2,8.3,9.9,29,232
7382,VOLVO,XC60 T6 AWD,SUV - SMALL,2.0,4,AS8,Z,11.7,8.6,10.3,27,240
7383,VOLVO,XC90 T5 AWD,SUV - STANDARD,2.0,4,AS8,Z,11.2,8.3,9.9,29,232


# Saving Results to csv

In [30]:
Result=df2[['Model']]
Result

Unnamed: 0,Model
0,ILX
1,ILX
2,ILX HYBRID
3,MDX 4WD
4,RDX AWD
...,...
7380,XC40 T5 AWD
7381,XC60 T5 AWD
7382,XC60 T6 AWD
7383,XC90 T5 AWD


In [31]:
Result['Prediction']=predictions
Result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Result['Prediction']=predictions


Unnamed: 0,Model,Prediction
0,ILX,206.413229
1,ILX,231.496066
2,ILX HYBRID,143.450755
3,MDX 4WD,267.697005
4,RDX AWD,258.227417
...,...,...
7380,XC40 T5 AWD,222.554022
7381,XC60 T5 AWD,230.222394
7382,XC60 T6 AWD,238.792394
7383,XC90 T5 AWD,230.222394


In [32]:
Result.to_csv('Prediction.csv')