In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
cars = pd.read_csv('car_data.csv')
cars.head()

Unnamed: 0,speed,car_age,experience,risk
0,200,15,5.0,85
1,90,17,13.0,20
2,165,12,4.0,93
3,110,20,,60
4,140,5,3.0,82


In [3]:
cars.isnull().sum()

speed         0
car_age       0
experience    1
risk          0
dtype: int64

In [4]:
cars.experience

0     5.0
1    13.0
2     4.0
3     NaN
4     3.0
5     8.0
Name: experience, dtype: float64

In [5]:
fill_exp = cars.experience.median()
fill_exp

5.0

In [6]:
cars.experience = cars.experience.fillna(fill_exp)

In [7]:
cars.experience

0     5.0
1    13.0
2     4.0
3     5.0
4     3.0
5     8.0
Name: experience, dtype: float64

In [8]:
cars

Unnamed: 0,speed,car_age,experience,risk
0,200,15,5.0,85
1,90,17,13.0,20
2,165,12,4.0,93
3,110,20,5.0,60
4,140,5,3.0,82
5,115,2,8.0,10


In [9]:
reg = LinearRegression()

In [10]:
reg.fit(cars[['speed', 'car_age', 'experience']], cars.risk)

In [11]:
reg.predict([[160,10,5]])



array([71.37146872])

In [12]:
reg.coef_

array([ 0.33059217,  1.61053246, -6.20772074])

In [13]:
reg.intercept_

33.41000091043592

In [14]:
0.33059217 * 160 + 1.61053246*10 + (-6.20772074 * 5) + 33.41000091043592

71.37146901043593

In [15]:
online_df = pd.read_csv("online.csv")
online_df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [16]:
online_df.shape

(50, 5)

In [17]:
online_df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [18]:
# handle null value, if has
# mean = online_df.Administration.mean()
# online_df.Administration = online_df.Administration.fillna(mean)

In [19]:
# seprate x and y
x = online_df.drop(['Profit'], axis=1)
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,Dhaka
1,162597.7,151377.59,443898.53,Ctg
2,153441.51,101145.55,407934.54,Rangpur
3,144372.41,118671.85,383199.62,Dhaka
4,142107.34,91391.77,366168.42,Rangpur


In [20]:
y = online_df['Profit']
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [21]:
# one hot encoding
cities = pd.get_dummies(x['Area'], drop_first=True)
cities

Unnamed: 0,Dhaka,Rangpur
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1
5,1,0
6,0,0
7,0,1
8,1,0
9,0,0


In [22]:
x = x.drop('Area', axis=1)

In [23]:
#Concat two data frame
x = pd.concat([x,cities], axis=1)
x

Unnamed: 0,Marketing Spend,Administration,Transport,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,1,0
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,0,1
3,144372.41,118671.85,383199.62,1,0
4,142107.34,91391.77,366168.42,0,1
5,131876.9,99814.71,362861.36,1,0
6,134615.46,147198.87,127716.82,0,0
7,130298.13,145530.06,323876.68,0,1
8,120542.52,148718.95,311613.29,1,0
9,123334.88,108679.17,304981.62,0,0


In [24]:
# split dataset into train and test
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=.25,random_state=0)

# create linear reg object
reg  = LinearRegression()

In [25]:
reg.fit(xtrain,ytrain)

In [26]:
ytest

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
33     96778.92
35     96479.51
26    105733.54
Name: Profit, dtype: float64

In [27]:
# predict
pred = reg.predict(xtest)
pred

array([103501.0825284 , 128011.28068627, 126695.43891127,  70573.91718775,
       173381.96874259, 124238.07860872,  69298.09250304,  98399.41936876,
       116419.1480864 , 161430.98134847,  94740.73303076,  89920.22800514,
       105956.86065332])

In [29]:
reg.score(xtest, ytest)

0.8840978623923472

In [28]:
r2_score(ytest,pred)

0.8840978623923472