In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
data = pd.read_csv('50_Startups.csv')

In [3]:
print(data.head())


   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [4]:
X = data.drop('Profit', axis=1)
y = data['Profit']


In [5]:
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), ['State'])
    ],
    remainder='passthrough'
)
X_encoded = ct.fit_transform(X)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [7]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor()
}

In [8]:
r2_values = {}

In [9]:
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    r2_values[model_name] = r2


In [10]:
table_data = {'Model': list(r2_values.keys()), 'R^2 Value': list(r2_values.values())}
r2_table = pd.DataFrame(table_data)


In [11]:
print("\nR-squared values for each model:")
print(r2_table)


R-squared values for each model:
               Model  R^2 Value
0  Linear Regression   0.898727
1      Random Forest   0.886782


In [12]:
predictions={}

In [13]:
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions[model_name] = y_pred

    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} R^2 Value: {r2}")


Linear Regression R^2 Value: 0.8987266414319495
Random Forest R^2 Value: 0.9170838424360246


In [14]:
for model_name, y_pred in predictions.items():
    print(f"\n{model_name} Predicted Values:")
    predicted_values = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    print(predicted_values)


Linear Regression Predicted Values:
       Actual      Predicted
13  134307.35  126362.879083
39   81005.76   84608.453836
30   99937.59   99677.494252
45   64926.08   46357.460686
17  125370.37  128750.482885
48   35673.41   50912.417419
26  105733.54  109741.350327
25  107404.34  100643.242816
32   97427.84   97599.275746
19  122776.86  113097.425244

Random Forest Predicted Values:
       Actual    Predicted
13  134307.35  133492.5495
39   81005.76   88025.1050
30   99937.59   99423.9868
45   64926.08   43673.9432
17  125370.37  130860.4255
48   35673.41   41303.4978
26  105733.54  113138.3775
25  107404.34  103859.1606
32   97427.84   99222.4320
19  122776.86  128870.8888
