# Import libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Get dataset

In [None]:
# Load the startups CSV into a DataFrame, then print a summary of its
# columns (dtype and non-null count per column).
data = pd.read_csv('50_Startups.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [None]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


# Find the unique values of the categorical feature

In [None]:
# List the distinct values of the categorical `State` column.
data['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

# Use ColumnTransformer

In [None]:
# One-hot encode the column at position 3 and pass every other column
# through unchanged.
columnTransformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)

In [None]:
# Apply the one-hot encoding and keep the result numeric. The previous
# `dtype=np.str` cast fails on NumPy >= 1.24 (the alias was removed) and
# turned every feature and the target into strings.
state = np.array(columnTransformer.fit_transform(data), dtype=float)

# The encoded State columns come first (categories in sorted order), followed
# by the passthrough columns in their original order.
column = ['California', 'Florida', 'New York', 'R&D Spend', 'Administration', 'Marketing Spend', 'Profit']
df = pd.DataFrame(state, columns=column)
df.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0.0,0.0,1.0,165349.2,136897.8,471784.1,192261.83
1,1.0,0.0,0.0,162597.7,151377.59,443898.53,191792.06
2,0.0,1.0,0.0,153441.51,101145.55,407934.54,191050.39
3,0.0,0.0,1.0,144372.41,118671.85,383199.62,182901.99
4,0.0,1.0,0.0,142107.34,91391.77,366168.42,166187.94


In [None]:
# Separate the target (the last column, 'Profit') from the six feature columns.
y = df['Profit']
X = df.drop(columns=['Profit'])

# Evaluate the model with 5-fold cross-validation

In [None]:
# Shuffle before splitting: the rows appear to be ordered by Profit (see the
# data preview), so contiguous, unshuffled folds would hold out a Profit range
# the model never trained on and give misleading scores.
# A fixed seed keeps the folds reproducible across runs.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Linear regression estimator with scikit-learn's default settings.
linear = LinearRegression()

In [None]:
# Score with the `cv` splitter object defined earlier instead of a bare fold
# count, so the splitting strategy is controlled in a single place.
r2_scores = cross_val_score(linear, X, y, scoring='r2', cv=cv)

# scikit-learn reports the *negated* MSE (so that higher is always better);
# flip the sign so the printed values are actual mean squared errors.
mse_scores = -cross_val_score(linear, X, y, scoring='neg_mean_squared_error', cv=cv)

print('R2 score: ', r2_scores)
print('MSE: ', mse_scores)

R2 score:  [ 0.8890891  -1.00022611 -0.78958092 -0.76036588  0.38538399]
MSE:  [3.19757405e+07 1.31773822e+08 3.80623882e+07 7.47211906e+07
 2.39539562e+08]
