In [24]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [25]:
# Load the startups dataset from a CSV file; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv("50_Startups.csv")
# Show the (rows, columns) tuple as the cell output.
df.shape

(50, 5)

In [26]:
# Preview the first rows to confirm the columns were read as expected.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [27]:
# Count the null entries in every column and print the per-column totals.
null_counts = df.isna().sum()
print(null_counts)


R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64


In [28]:
# Preview 'State' as integer codes WITHOUT modifying `df`.
# The codes are shown for inspection only: the following cell one-hot encodes
# 'State', and OneHotEncoder accepts the raw string categories directly, so
# writing ordinal codes back into `df` is unnecessary. Leaving `df` untouched
# keeps this cell safe to re-run and keeps the earlier `df.head()` output
# consistent with the frame's actual contents.
label_encoder = LabelEncoder()
state_codes = label_encoder.fit_transform(df['State'])
# `assign` returns a new frame with 'State' replaced in its original position.
df.assign(State=state_codes).head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [29]:
# One-hot encode 'State'; every other column passes through unchanged.
# - 'State' is selected by name rather than by position [3], so the step does
#   not depend on column order. Name-based selection requires a DataFrame, so
#   accidentally re-running this cell after `df` has become an array raises an
#   error instead of silently one-hot encoding whatever sits at index 3.
# - drop='first' omits one dummy column: the full set of dummies is perfectly
#   collinear with the intercept of an unregularized linear model.
# The encoded columns come first in the result, followed by the passthrough
# columns in their original order, so 'Profit' remains the last column.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), ['State'])],
    remainder='passthrough',
)
# NOTE: `df` is rebound here from a DataFrame to the transformed array.
df = ct.fit_transform(df)


In [30]:
# Separate the encoded array into the feature matrix and the target vector:
# every column but the last goes to X, and the final column (Profit) to y.
X, y = df[:, :-1], df[:, -1]


In [31]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [32]:
# Fit an ordinary least-squares regression on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the last expression so the cell displays it.
model


In [33]:
# Predict the target for the held-out test rows using the fitted model.
y_pred = model.predict(X_test)

In [36]:
# Score the held-out predictions with the coefficient of determination (R²)
# and report it rounded to two decimal places.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R-squared (R²) value for the Linear Regression model: {r2:.2f}")


R-squared (R²) value for the Linear Regression model: 0.90
