In [50]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [51]:
# Load the startup records and split the columns by position:
# every column except the last is a feature, the last column is the target.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.loc[:, dataset.columns[:-1]]
y = dataset.loc[:, dataset.columns[-1:]]  # kept as a one-column DataFrame, not a Series

In [52]:
# Preview the first five feature rows (State is still a raw text column here).
X.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [53]:
# One-hot encode the categorical State column; resulting columns are named "State_<value>".
# NOTE(review): every category gets its own column, so together with a fitted
# intercept the dummies are perfectly collinear -- consider drop_first=True.
state_dummies = pd.get_dummies(
    X.loc[:, "State"],
    prefix="State",
)

In [54]:
# Append the one-hot State columns to the right of the existing features.
X = pd.concat(
    [X, state_dummies],
    axis="columns",
)

In [55]:
# The raw State text column is now redundant with its one-hot encoding; remove it.
X = X.drop(columns=["State"])

In [56]:
# Preview the encoded feature matrix (head() shows five rows by default).
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


In [57]:
# Splitting the data into train and test sets

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [60]:
# Sanity-check the size of the training split.
for split_name, split in (("X_train", X_train), ("y_train", y_train)):
    print(f"{split_name} shape: {split.shape}")

X_train shape: (40, 6)
y_train shape: (40, 1)


In [61]:
# Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression

# to build the model
from sklearn.linear_model import Lasso
# to evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

In [62]:
# 1) Linear Regression
# Ordinary least squares with scikit-learn's default settings, fitted on the training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [63]:
# In-sample predictions of the linear model (used for the training RMSE).
lin_pred_1 = regressor.predict(X=X_train)

In [64]:
# Median training profit, used as a scale reference when reading the RMSE values.
profit_median = y_train.loc[:, "Profit"].median()
profit_median

107978.19

In [65]:
# profit_median is the median of the training-set Profit column, so label it as
# such, and format it with an f-string (a bare `{profit_median}` argument builds
# a one-element set and prints it with braces).
print(f"Median profit: {profit_median}")
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in newer scikit-learn releases.
print(f"train rmse: {np.sqrt(mean_squared_error(y_train, lin_pred_1))}")

Average house price:  {107978.19}
train rmse: 9031.666612578969


In [66]:
# Out-of-sample predictions of the linear model on the held-out test split.
lin_pred_2 = regressor.predict(X=X_test)

In [67]:
# profit_median is the median of the training-set Profit column, so label it as
# such, and format it with an f-string (a bare `{profit_median}` argument builds
# a one-element set and prints it with braces).
print(f"Median profit: {profit_median}")
# This cell scores the held-out test split (y_test vs. lin_pred_2), so the label
# says "test". RMSE is computed as sqrt(MSE) explicitly because the
# `squared=False` flag is deprecated in newer scikit-learn releases.
print(f"test rmse: {np.sqrt(mean_squared_error(y_test, lin_pred_2))}")

Average house price:  {107978.19}
train rmse: 9137.990152794948


In [68]:
# 2) Lasso Regression
# L1-penalised linear model; alpha sets the penalty strength and the seed is fixed.
lasso_model = Lasso(
    alpha=0.001,
    random_state=0,
)

In [69]:
# Fit the Lasso model on the training split.
lasso_model.fit(X=X_train, y=y_train)

In [70]:
# In-sample Lasso predictions (used for the training RMSE).
lasso_model_prediction_1 = lasso_model.predict(X=X_train)

In [71]:
# profit_median is the median of the training-set Profit column, so label it as
# such, and format it with an f-string (a bare `{profit_median}` argument builds
# a one-element set and prints it with braces).
print(f"Median profit: {profit_median}")
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in newer scikit-learn releases.
print(f"train rmse: {np.sqrt(mean_squared_error(y_train, lasso_model_prediction_1))}")

Average house price:  {107978.19}
train rmse: 9031.666612579278


In [72]:
# Out-of-sample Lasso predictions on the held-out test split.
lasso_model_prediction_2 = lasso_model.predict(X=X_test)

In [73]:
# profit_median is the median of the training-set Profit column, so label it as
# such, and format it with an f-string (a bare `{profit_median}` argument builds
# a one-element set and prints it with braces).
print(f"Median profit: {profit_median}")
# This cell scores the held-out test split (y_test vs. lasso_model_prediction_2),
# so the label says "test". RMSE is computed as sqrt(MSE) explicitly because the
# `squared=False` flag is deprecated in newer scikit-learn releases.
print(f"test rmse: {np.sqrt(mean_squared_error(y_test, lasso_model_prediction_2))}")

Average house price:  {107978.19}
train rmse: 9137.98878292775


In [74]:
# Display the fitted Lasso model
lasso_model

In [75]:
# Keep a handle on the fitted Lasso coefficients and show them.
coefs = lasso_model.coef_
coefs

array([ 7.73e-01,  3.29e-02,  3.66e-02, -1.93e+02, -1.15e+03,  5.07e+02])

In [76]:
# Tabulate the Lasso coefficients, one row per feature column of X.
coef_df = pd.DataFrame(
    coefs,
    index=X.columns,
    columns=["coefficients"],
)
coef_df

Unnamed: 0,coefficients
R&D Spend,0.773467
Administration,0.032885
Marketing Spend,0.03661
State_California,-192.767747
State_Florida,-1152.049426
State_New York,506.59755


In [77]:
# Print the fitted Lasso parameters: coefficients first, intercept on the next line.
print(lasso_model.coef_, lasso_model.intercept_, sep="\n")

[ 7.73e-01  3.29e-02  3.66e-02 -1.93e+02 -1.15e+03  5.07e+02]
[42746.94]
