In [143]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [144]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [145]:
data_url = 'https://raw.githubusercontent.com/digipodium/Datasets/intro/50_Startups.csv'
df = pd.read_csv(data_url)

In [146]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [147]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [148]:
X = df.iloc[:,:4]
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [149]:
y = df['Profit']

In [150]:
X.State.nunique()

3

In [151]:
state_enc = OneHotEncoder(drop='first')
dummy_states = state_enc.fit_transform(X[['State']]).toarray()

In [152]:
X = X.drop(columns=['State'])

In [153]:
X = pd.concat([X,pd.DataFrame(dummy_states)],axis=1)

In [154]:
scaler = StandardScaler()
X =  scaler.fit_transform(X)

In [167]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=.2,random_state=1)

In [168]:
model = LinearRegression()
model.fit(xtrain, ytrain)

LinearRegression()

In [169]:
ypred =  model.predict(xtest)

In [173]:
import math
math.sqrt(mean_squared_error(ytest, ypred))

8916.021618643037

In [171]:
mean_absolute_error(ytest, ypred)

7698.119817484754

In [172]:
model.score(xtest,ytest) * 100

96.49618042060467

In [175]:
from joblib import dump

In [176]:
dump({
    'state_hot_encoder': state_enc,
    'scaler': scaler,
    'model': model
},'startup_model_v1.jb')

['startup_model_v1.jb']

In [177]:
import numpy as np

In [179]:
admin = 192302
rnd = 920310
mkt = 910322


# make it in 2d numpy array
inp = np.array([[admin,rnd,mkt]])
inp

array([[192302, 920310, 910322]])

In [185]:
state = 'Florida'
inp_d = state_enc.transform([[state]]).toarray()

In [188]:
inp_f = np.hstack([inp,inp_d])

In [189]:
inp_f = scaler.transform(inp_f)

In [191]:
model.predict(inp_f)

array([216673.82588915])