In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
audi = pd.read_csv("audi_a1_dataset.csv")

In [3]:
# Work on an independent (deep) copy so the raw frame `audi` stays untouched.
df = audi.copy(deep=True)

In [4]:
# Preview the first five rows of the working copy.
df.head()

Unnamed: 0,index,Year,Type,Mileage(miles),Engine,PS,Transmission,Fuel,Number_of_Owners,Price(£),href,PPY,MileageRank,PriceRank,PPYRank,Score
0,0,2018.0,Hatchback,44000.0,1.6L,114.398422,Manual,Diesel,1,14995.0,https://www.autotrader.co.uk/car-details/20221...,2499.166667,215,163,340,718
1,4,2016.0,Hatchback,42596.0,1.0L,93.688363,Manual,Petrol,3,10755.0,https://www.autotrader.co.uk/car-details/20221...,2688.75,222,330,276,828
2,7,2015.0,Hatchback,42700.0,1.4L,123.274162,Manual,Petrol,2,10799.0,https://www.autotrader.co.uk/car-details/20221...,3599.666667,221,327,94,642
3,11,2014.0,Hatchback,86000.0,1.6L,103.550296,Manual,Diesel,3,7490.0,https://www.autotrader.co.uk/car-details/20221...,3745.0,41,449,83,573
4,12,2014.0,Hatchback,104310.0,1.6L,103.550296,Manual,Diesel,3,7400.0,https://www.autotrader.co.uk/car-details/20220...,3700.0,12,452,85,549


In [5]:
# Keep only the raw car attributes and the price.
# "index" and "href" are row identifiers; the rank/score/PPY columns appear to be
# derived from price or mileage and would leak the target - TODO confirm.
# Derive from the raw frame `audi` rather than from `df` itself so this cell can
# be re-run without raising KeyError on already-removed columns.
df = audi.drop(
    columns=["index", "href", "MileageRank", "PriceRank", "PPYRank", "Score", "PPY"]
)

In [6]:
# Confirm the dropped columns are gone from the working frame.
df.head()

Unnamed: 0,Year,Type,Mileage(miles),Engine,PS,Transmission,Fuel,Number_of_Owners,Price(£)
0,2018.0,Hatchback,44000.0,1.6L,114.398422,Manual,Diesel,1,14995.0
1,2016.0,Hatchback,42596.0,1.0L,93.688363,Manual,Petrol,3,10755.0
2,2015.0,Hatchback,42700.0,1.4L,123.274162,Manual,Petrol,2,10799.0
3,2014.0,Hatchback,86000.0,1.6L,103.550296,Manual,Diesel,3,7490.0
4,2014.0,Hatchback,104310.0,1.6L,103.550296,Manual,Diesel,3,7400.0


In [7]:
# Inspect dtypes and non-null counts; Engine is still an object (string) column here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              471 non-null    float64
 1   Type              471 non-null    object 
 2   Mileage(miles)    471 non-null    float64
 3   Engine            471 non-null    object 
 4   PS                471 non-null    float64
 5   Transmission      471 non-null    object 
 6   Fuel              471 non-null    object 
 7   Number_of_Owners  471 non-null    int64  
 8   Price(£)          471 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 33.2+ KB


In [8]:
# Strip the litre suffix ("1.6L" -> "1.6") so the column can be parsed as a number.
# regex=False pins a literal replacement; the default for `regex` differs between
# pandas 1.x and 2.x, so stating it keeps the behavior version-independent.
df["Engine"] = df["Engine"].str.replace("L", "", regex=False)

In [9]:
# Check the suffix is gone; the values are still strings (dtype object) at this point.
df["Engine"]

0      1.6
1      1.0
2      1.4
3      1.6
4      1.6
      ... 
466    1.4
467    1.0
468    1.4
469    1.0
470    1.0
Name: Engine, Length: 471, dtype: object

In [10]:
# Column dtypes before the numeric conversion of Engine.
df.dtypes

Year                float64
Type                 object
Mileage(miles)      float64
Engine               object
PS                  float64
Transmission         object
Fuel                 object
Number_of_Owners      int64
Price(£)            float64
dtype: object

In [11]:
# Parse the cleaned engine-size strings into numbers, replacing the column in place
# within a new frame.
df = df.assign(Engine=pd.to_numeric(df["Engine"]))

In [12]:
# Engine should now report float64 instead of object.
df.dtypes

Year                float64
Type                 object
Mileage(miles)      float64
Engine              float64
PS                  float64
Transmission         object
Fuel                 object
Number_of_Owners      int64
Price(£)            float64
dtype: object

In [13]:
# Preview the frame with Engine as a numeric column.
df.head()

Unnamed: 0,Year,Type,Mileage(miles),Engine,PS,Transmission,Fuel,Number_of_Owners,Price(£)
0,2018.0,Hatchback,44000.0,1.6,114.398422,Manual,Diesel,1,14995.0
1,2016.0,Hatchback,42596.0,1.0,93.688363,Manual,Petrol,3,10755.0
2,2015.0,Hatchback,42700.0,1.4,123.274162,Manual,Petrol,2,10799.0
3,2014.0,Hatchback,86000.0,1.6,103.550296,Manual,Diesel,3,7490.0
4,2014.0,Hatchback,104310.0,1.6,103.550296,Manual,Diesel,3,7400.0


In [14]:
# List the distinct body types; a single level means the column carries no
# information for the model.
df["Type"].unique()

array(['Hatchback'], dtype=object)

In [15]:
# List the distinct transmission categories before encoding.
df["Transmission"].unique()

array(['Manual', 'Automatic'], dtype=object)

In [16]:
# List the distinct fuel categories before encoding.
df["Fuel"].unique()

array(['Diesel', 'Petrol'], dtype=object)

In [17]:
# One-hot encode the categorical columns. drop_first=True removes the first level
# of each, so a two-level column becomes one indicator and a single-level column
# (Type) contributes no column at all.
df = pd.get_dummies(
    data=df,
    columns=["Type", "Transmission", "Fuel"],
    drop_first=True,
)

In [18]:
# Verify the encoding: Transmission_Manual and Fuel_Petrol are the indicator columns.
df.head()

Unnamed: 0,Year,Mileage(miles),Engine,PS,Number_of_Owners,Price(£),Transmission_Manual,Fuel_Petrol
0,2018.0,44000.0,1.6,114.398422,1,14995.0,True,False
1,2016.0,42596.0,1.0,93.688363,3,10755.0,True,True
2,2015.0,42700.0,1.4,123.274162,2,10799.0,True,True
3,2014.0,86000.0,1.6,103.550296,3,7490.0,True,False
4,2014.0,104310.0,1.6,103.550296,3,7400.0,True,False


In [19]:
# Features are every column except the price.
x = df.drop(columns="Price(£)")
# The target is kept as a one-column DataFrame (2-D), so predictions come back 2-D too.
y = df.loc[:, ["Price(£)"]]

In [20]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.70,
    random_state=10,
)

In [21]:
# Fit ordinary least squares on the training split. fit() returns the estimator
# itself, so `model` and `lm` refer to the same fitted object.
lm = LinearRegression()
lm.fit(x_train, y_train)
model = lm

In [22]:
# R^2 of the fitted model on the held-out test split.
model.score(x_test, y_test)

0.9095480135237141

In [23]:
# Re-display the frame - presumably to check column order before hand-building a
# prediction row. NOTE: this view still includes Price(£), which is not a model input.
df.head()

Unnamed: 0,Year,Mileage(miles),Engine,PS,Number_of_Owners,Price(£),Transmission_Manual,Fuel_Petrol
0,2018.0,44000.0,1.6,114.398422,1,14995.0,True,False
1,2016.0,42596.0,1.0,93.688363,3,10755.0,True,True
2,2015.0,42700.0,1.4,123.274162,2,10799.0,True,True
3,2014.0,86000.0,1.6,103.550296,3,7490.0,True,False
4,2014.0,104310.0,1.6,103.550296,3,7400.0,True,False


In [24]:
# Predict the price of one hypothetical car.
# The model was fitted on a DataFrame, so pass a DataFrame with the same column
# names: this avoids scikit-learn's "X does not have valid feature names" warning
# and removes the silent dependence on positional order of a bare list.
new_car = pd.DataFrame(
    [
        {
            "Year": 2017.0,
            "Mileage(miles)": 40000.0,
            "Engine": 1.6,
            "PS": 120,
            "Number_of_Owners": 1,
            "Transmission_Manual": 1,  # 1 = manual, 0 = automatic
            "Fuel_Petrol": 0,  # 1 = petrol, 0 = diesel
        }
    ]
)
# Reindex by the training columns: guarantees matching order and raises KeyError
# if a feature name above is misspelled or missing.
model.predict(new_car[x_train.columns])



array([[14404.1719514]])