# Auto MPG Linear Regression

In [None]:
#importing the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

## 1. Load the data from the CSV file into a DataFrame

In [None]:
# Read the Auto MPG CSV into a DataFrame and preview its first five rows.
# NOTE(review): the path is absolute (it looks like a Colab "/content" location);
# adjust it when running the notebook elsewhere.
data = pd.read_csv(filepath_or_buffer="/content/auto-mpg.csv")
data.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


## 2. Display shape


In [None]:
# Display the frame's dimensions as a (rows, columns) tuple.
data.shape

(398, 9)

In [None]:
# Summarise each column's dtype and non-null count. Columns reported as `object`
# hold strings rather than numbers and need converting before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


## 3. Display the car names and their counts

In [None]:
# Count how many rows share each car name, most frequent first.
data["car name"].value_counts()

ford pinto                    6
amc matador                   5
ford maverick                 5
toyota corolla                5
amc hornet                    4
                             ..
amc hornet sportabout (sw)    1
mercury marquis               1
mazda glc deluxe              1
vw rabbit c (diesel)          1
mercury monarch ghia          1
Name: car name, Length: 305, dtype: int64

## 4. Identify the attributes with missing values

In [None]:
# List the distinct raw horsepower entries to spot any non-numeric placeholders.
data["horsepower"].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [None]:
# The raw column stores numbers as strings and marks missing readings with '?'.
# Turn the placeholder into NaN, then cast the column to float.
# `np.nan` and the builtin `float` are used because the `np.NaN` and `np.float`
# aliases have been removed from current NumPy releases.
data['horsepower'] = data['horsepower'].replace('?', np.nan).astype(float)

# Number of missing horsepower values that still need to be imputed.
data['horsepower'].isnull().sum()

6

In [None]:
# Replace missing horsepower values with the column mean.
# Assign the result back explicitly: calling fillna(..., inplace=True) on a column
# selection is chained assignment, which warns on pandas 2.x and leaves `data`
# unchanged when copy-on-write is enabled.
# NOTE(review): the mean is computed on the full dataset before the train/test
# split, so test rows influence the imputed value -- confirm this is acceptable.
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].mean())

# Confirm that only numeric values remain.
data['horsepower'].unique()

array([130.        , 165.        , 150.        , 140.        ,
       198.        , 220.        , 215.        , 225.        ,
       190.        , 170.        , 160.        ,  95.        ,
        97.        ,  85.        ,  88.        ,  46.        ,
        87.        ,  90.        , 113.        , 200.        ,
       210.        , 193.        , 104.46938776, 100.        ,
       105.        , 175.        , 153.        , 180.        ,
       110.        ,  72.        ,  86.        ,  70.        ,
        76.        ,  65.        ,  69.        ,  60.        ,
        80.        ,  54.        , 208.        , 155.        ,
       112.        ,  92.        , 145.        , 137.        ,
       158.        , 167.        ,  94.        , 107.        ,
       230.        ,  49.        ,  75.        ,  91.        ,
       122.        ,  67.        ,  83.        ,  78.        ,
        52.        ,  61.        ,  93.        , 148.        ,
       129.        ,  96.        ,  71.        ,  98.  

In [None]:
# Per-column count of missing values; every column should report 0 at this point.
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each distinct car-name string with an integer code so the column is numeric.
# NOTE(review): the codes carry no meaningful order, yet the linear model will treat
# them as an ordinal feature -- consider dropping or one-hot encoding this column.
label = LabelEncoder().fit(data["car name"])
data["car name"] = label.transform(data["car name"])

## 5. Split the dataset 80:20 into training and test sets

In [None]:
# Separate the target (mpg) from the feature columns without mutating `data`.
# The previous data.pop("mpg") removed the column in place, so re-running the cell
# raised KeyError and `x` was merely an alias of the mutated frame.
y = data["mpg"]
x = data.drop(columns=["mpg"])

In [None]:
# Hold out 20% of the rows for testing. Fixing random_state makes the split -- and
# every metric computed from it -- reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## 6. Build a linear regression model, predict the output for the test set, and calculate the error

In [None]:
# Ordinary least-squares regressor with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit the regressor on the training features and mpg targets.
model.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Predict mpg for the held-out rows and score the predictions.
y_pred = model.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
mae = mean_absolute_error(y_test, y_pred)

# mean_squared_error returns the MSE; take the square root so the reported value
# really is the RMSE (same units as mpg) that the label below promises.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Mean Absolute Error=", mae)
print("Root Mean Squared Error=", rmse)

Mean Absolute Error= 2.4825338841082907
Root Mean Squared Error= 10.130646920348129
