In [54]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
%matplotlib inline

In [55]:
# Column labels applied to the CSV's fields, listed in the order they are assigned.
column_names = [
    'Index',
    'petrol_tax',
    'Average_income',
    'Paved_Highways',
    'Population_Driver_license',
    'Petrol_Consumption',
]

# Load the petrol dataset and label its columns with the names above.
data = pd.read_csv("petrol.csv", names=column_names)

In [56]:
# Confirm the column labels that were assigned when the CSV was loaded.
data.columns

Index(['Index', 'petrol_tax', 'Average_income', 'Paved_Highways',
       'Population_Driver_license', 'Petrol_Consumption'],
      dtype='object')

In [57]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

Unnamed: 0,Index,petrol_tax,Average_income,Paved_Highways,Population_Driver_license,Petrol_Consumption
1,1,9.0,3571,1976,0.525,541
2,1,9.0,4092,1250,0.572,524
3,1,9.0,3865,1586,0.58,561
4,1,7.5,4870,2351,0.529,414
5,1,8.0,4399,431,0.544,410
6,1,10.0,5342,1333,0.571,457
7,1,8.0,5319,11868,0.451,344
8,1,8.0,5126,2138,0.553,467
9,1,8.0,4447,8577,0.529,464
10,1,7.0,4512,8507,0.552,498


In [60]:
# Remove the 'Index' column (the first label assigned at load time); it is not
# used as a feature or as the target.
# Dropping by label with errors='ignore' keeps this cell idempotent: a positional
# drop of data.columns[0] would remove 'petrol_tax' too if the cell were re-run.
data = data.drop(columns=['Index'], errors='ignore')

In [61]:
# Preview the first 5 rows after the first column has been dropped.
data.head(5)

Unnamed: 0,petrol_tax,Average_income,Paved_Highways,Population_Driver_license,Petrol_Consumption
1,9.0,3571,1976,0.525,541
2,9.0,4092,1250,0.572,524
3,9.0,3865,1586,0.58,561
4,7.5,4870,2351,0.529,414
5,8.0,4399,431,0.544,410


In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
data.describe()  

Unnamed: 0,petrol_tax,Average_income,Paved_Highways,Population_Driver_license,Petrol_Consumption
count,48.0,48.0,48.0,48.0,48.0
mean,7.668333,4241.833333,5565.416667,0.570333,576.770833
std,0.95077,573.623768,3491.507166,0.05547,111.885816
min,5.0,3063.0,431.0,0.451,344.0
25%,7.0,3739.0,3110.25,0.52975,509.5
50%,7.5,4298.0,4735.5,0.5645,568.5
75%,8.125,4578.75,7156.0,0.59525,632.75
max,10.0,5342.0,17782.0,0.724,968.0


In [41]:
# Feature matrix: the four explanatory columns, selected by label.
X = data.loc[
    :,
    ['petrol_tax', 'Average_income', 'Paved_Highways', 'Population_Driver_license'],
]

# Target vector: the column the regression will predict.
y = data.loc[:, 'Petrol_Consumption']

In [42]:
from sklearn.model_selection import train_test_split  
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) 

In [43]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so the model can be built and trained
# on the training split in a single expression.
regressor = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell's output.
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [44]:
# One row per feature, holding the coefficient the fitted model assigned to it.
coeff_df = pd.Series(
    regressor.coef_,
    index=X.columns,
    name='Coefficient',
).to_frame()
coeff_df

Unnamed: 0,Coefficient
petrol_tax,-40.01666
Average_income,-0.065413
Paved_Highways,-0.004741
Population_Driver_license,1341.862121


In [45]:
# This means that for a unit increase in "petrol_tax", there is a decrease of about 40.02 million gallons in gas consumption.
# Similarly, a unit increase in the proportion of the population with a driver's license results in an increase of about 1.342 billion gallons of gas consumption.
# "Average_income" and "Paved_Highways" have very small coefficients, i.e. very little effect on gas consumption per unit change (note that coefficient size depends on each feature's units).

In [46]:
# Predict petrol consumption for the held-out test rows.
y_pred = regressor.predict(X_test)  

In [47]:
# Side-by-side comparison of true test values and model predictions,
# indexed by the original row labels of the test set.
df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
df

Unnamed: 0,Actual,Predicted
30,534,469.391989
5,410,545.645464
27,577,589.668394
31,571,569.730413
33,577,649.774809
38,704,646.631164
35,487,511.608148
41,587,672.475177
8,467,502.074782
11,580,501.270734


In [48]:
from sklearn import metrics

# Compute each error metric once on the test split, then report them.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

Mean Absolute Error: 56.822247478964684
Mean Squared Error: 4666.3447875883585
Root Mean Squared Error: 68.31064915215165


In [49]:
# Coefficient of determination (R^2) of the predictions on the test set.
metrics.r2_score(y_test, y_pred) 

0.2036193241012182

You can see that the value of root mean squared error is 68.31, which is roughly 12% of the mean value of the gas consumption in all states (576.77). This means that our algorithm was not very accurate (the R² of about 0.20 on the test set confirms this) but can still make reasonably good predictions.

There are many factors that may have contributed to this inaccuracy, a few of which are listed here:

    Need more data: Only one year's worth of data isn't that much, whereas having multiple years' worth could have helped us improve the accuracy quite a bit.
    Bad assumptions: We made the assumption that this data has a linear relationship, but that might not be the case. Visualizing the data may help you determine that.
    Poor features: The features we used may not have had a high enough correlation to the values we were trying to predict.
