In [None]:
# basic import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# scikit-learn import
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # regression metrics

In [None]:
# Raw CSV of the 50-startups regression dataset, hosted on GitHub.
url = (
    'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/50_Startups.csv'
)
# Download and parse the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Count missing values per column (`isna` is the pandas alias of `isnull`).
df.isna().sum()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes) to confirm
# which columns are numeric and which are categorical before modelling.
df.info()

Check how Profit depends on the numerical columns:
1. correlation graph
    - R&D Spend vs Profit
    - Administration vs Profit
    - Marketing Spend vs Profit
2. correlation matrix
    - R&D Spend
    - Administration
    - Marketing Spend
    - Profit

In [None]:
import seaborn as sns

In [None]:
# One panel per numeric feature: scatter of the feature against Profit with a
# fitted regression line, each drawn in its own colour.
panels = [
    ('R&D Spend', 'b'),
    ('Administration', 'g'),
    ('Marketing Spend', 'r'),
]
fig, axs = plt.subplots(ncols=len(panels), figsize=(15, 5))
for panel_ax, (feature, colour) in zip(axs, panels):
    sns.regplot(data=df, x=feature, y='Profit', color=colour, ax=panel_ax)
plt.show()

Rules for interpreting the Pearson correlation coefficient (r):
1. If r is greater than 0, then there is a positive linear relationship.
2. If r is less than 0, then there is a negative linear relationship.
3. If r is equal to 0, then there is no linear relationship.
4. The stronger the linear relationship the closer r is to either +1 or -1.
5. The weaker the linear relationship the closer r is to 0.
6. The Pearson correlation coefficient only measures linear relationships. A value of r close to 0 therefore does not rule out a strong nonlinear relationship between two variables; it only means that the relationship is not captured by a straight line.

In [None]:
# Pearson correlation coefficient between every pair of numeric columns.
# Columns are selected by dtype rather than by dropping 'State' by name, so the
# cell keeps working if the frame contains other non-numeric columns.
corr = df.select_dtypes(include='number').corr()

# 'RdBu' is a diverging palette: pin the colour scale to the full range of r
# ([-1, 1]) so the neutral midpoint colour corresponds to r = 0. Without
# vmin/vmax the scale is stretched over the observed values only, and the
# midpoint colour would land on an arbitrary positive correlation.
sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1)
plt.show()

Relationship between a categorical column and Profit
- swarm plot of Profit for each State (a boxplot is a common alternative)

In [None]:
# Distribution of Profit within each State, one point per row of the dataset.
state_ax = plt.gca()
sns.swarmplot(x='State', y='Profit', data=df, ax=state_ax)

In [None]:
# Model inputs: two spend columns plus the categorical State column
# (Administration is not used as a feature here); the target is Profit.
feature_cols = ['Marketing Spend', 'R&D Spend', 'State']
target_col = 'Profit'

X = df[feature_cols]
y = df[target_col]
X.head()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# One-hot encode the categorical 'State' column. drop='first' omits the
# indicator of the first category, so the remaining indicators are not
# perfectly collinear with the intercept of the linear model.
# NOTE: this cell overwrites X; re-running it without first re-running the
# cell that defines X fails because 'State' has already been removed.
enc = OneHotEncoder(drop='first', sparse_output=False)
dummy_states = enc.fit_transform(X[['State']])

# Name the indicator columns '0', '1', ... from the encoder's actual output
# width instead of hard-coding two names, and reuse X's index so the frames
# line up row-for-row even if X does not carry a default RangeIndex.
dummy_cols = [str(i) for i in range(dummy_states.shape[1])]
dummy_df = pd.DataFrame(dummy_states, columns=dummy_cols, index=X.index)

# Replace 'State' by its indicator columns without mutating a frame in place.
X = pd.concat([X.drop(columns=['State']), dummy_df], axis=1)
X.head()

In [None]:
# Hold out part of the data for evaluation. The fixed random_state makes the
# shuffle reproducible; the test fraction is left at scikit-learn's default.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
# Fit an ordinary least-squares model on the training split; `fit` returns the
# estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(Xtrain, ytrain)
reg

In [None]:
# Score the fitted model on the held-out split with the three regression
# metrics imported at the top of the notebook.
ypred = reg.predict(Xtest)
mae, mse, r2 = (
    metric(ytest, ypred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
for label, score in (("MAE: ", mae), ("MSE: ", mse), ("R2: ", r2)):
    print(label, score)

In [None]:
# Show how a single state is encoded by the fitted encoder. The encoder was
# fitted on a DataFrame with a 'State' column, so the query is passed as a
# DataFrame with the same column name; a bare nested list carries no feature
# names and makes scikit-learn warn that X does not have valid feature names.
enc.transform(pd.DataFrame({'State': ['Florida']}))

In [None]:
# Hand-built rows to predict on, using the same column names and order as the
# encoded training features. '0' and '1' are the positional names given to the
# one-hot State indicators; which state each one stands for depends on the
# fitted encoder's category order (check enc.categories_).
# NOTE(review): 136897.8 looks like it may be an Administration figure rather
# than a Marketing Spend figure -- confirm against the dataset.
sample_rows = [
    {'Marketing Spend': 136897.8, 'R&D Spend': 165349.2, '0': 0, '1': 1},
    {'Marketing Spend': 2000000, 'R&D Spend': 2000000, '0': 0, '1': 0},
]
x = pd.DataFrame(sample_rows)
x

In [None]:
# Predict Profit for the hand-built sample rows in `x` with the fitted model.
# The columns of `x` must match the names and order of the training features.
reg.predict(x)