<a href="https://colab.research.google.com/github/chxnmay17/SPPU-ML-Laboratory/blob/main/pract1subhe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn import metrics as me
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression as lr

# Build the working frame in one pipeline:
#   1. read the raw CSV and discard the two identifier columns,
#   2. parse the pickup timestamp (values that cannot be parsed become NaT),
#   3. drop every row that has a missing value in any column,
#   4. derive calendar features from the parsed timestamp.
data = (
    pd.read_csv("uber.csv")
    .drop(columns=['Unnamed: 0', 'key'])
    .assign(
        pickup_datetime=lambda df: pd.to_datetime(
            df['pickup_datetime'], errors='coerce'
        )
    )
    .dropna()
    .assign(
        hour=lambda df: df['pickup_datetime'].dt.hour,
        day=lambda df: df['pickup_datetime'].dt.day,
        month=lambda df: df['pickup_datetime'].dt.month,
        year=lambda df: df['pickup_datetime'].dt.year,
        dayofweek=lambda df: df['pickup_datetime'].dt.dayofweek,
    )
)

def distance_transform(long1, lat1, long2, lat2, radius_km=6371.0):
    """Return the haversine (great-circle) distance between two points.

    All coordinates are given in decimal degrees and may be scalars, NumPy
    arrays or pandas Series; array-like inputs are processed element-wise.

    Args:
        long1: Longitude of the first point.
        lat1: Latitude of the first point.
        long2: Longitude of the second point.
        lat2: Latitude of the second point.
        radius_km: Radius of the sphere the distance is measured on. The
            default is the mean Earth radius in kilometres, which makes the
            result a distance in kilometres.

    Returns:
        The distance along the sphere, in the same unit as ``radius_km`` and
        with the same shape as the (broadcast) inputs.

    Note:
        Inputs are not validated; latitudes outside [-90, 90] are passed
        through the formula unchanged.
    """
    longi1, lati1, longi2, lati2 = map(np.radians, [long1, lat1, long2, lat2])
    distance_long = longi2 - longi1
    distance_lati = lati2 - lati1
    a = (
        np.sin(distance_lati / 2) ** 2
        + np.cos(lati1) * np.cos(lati2) * np.sin(distance_long / 2) ** 2
    )
    # For (near-)antipodal points rounding can leave `a` a hair above 1, and
    # arcsin of such a value is NaN. Cap it at 1 so those points get the
    # half-circumference they should. NaN inputs still propagate as NaN.
    a = np.minimum(a, 1.0)
    return 2 * np.arcsin(np.sqrt(a)) * radius_km

# Haversine distance between each trip's pickup and dropoff coordinates.
data['Distance'] = distance_transform(
    long1=data['pickup_longitude'],
    lat1=data['pickup_latitude'],
    long2=data['dropoff_longitude'],
    lat2=data['dropoff_latitude'],
)

# Box plot over the frame's columns to make outliers visible.
plt.figure(figsize=(20, 12))
sns.boxplot(data=data)
# When running as a plain script, call plt.show() here to display the figure.

# Outlier removal.
# A row is discarded when any of these holds:
#   * Distance is 60 or more,
#   * fare_amount is zero or negative,
#   * fare_amount exceeds 100 although Distance is below 1.
# The earlier fourth rule (fare_amount < 100 with Distance > 100) could never
# match, because every row with Distance >= 60 is already removed by the first
# rule; it has been dropped as dead code.
# The rules are combined into one mask so the frame is filtered a single time.
# NaN values compare False in every rule, so such rows are kept, as before.
outlier_mask = (
    (data['Distance'] >= 60)
    | (data['fare_amount'] <= 0)
    | ((data['fare_amount'] > 100) & (data['Distance'] < 1))
)
data.drop(index=data.index[outlier_mask], inplace=True)

# Fare against trip distance, drawn from the frame after outlier removal.
plt.figure(figsize=(10, 6))
plt.scatter(x=data['Distance'], y=data['fare_amount'])
plt.xlabel("Distance")
plt.ylabel("Fare Amount")
# When running as a plain script, call plt.show() here to display the figure.

# Pairwise correlations of the numeric columns, shown as an annotated heatmap
# on its own, larger figure.
corr = data.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
heatmap_ax.set_title('Correlation Heatmap of Features')
# When running as a plain script, call plt.show() here to display the figure.

# === Define Features (X) and Target (Y) ===
X = data[['hour', 'day', 'month', 'year', 'dayofweek', 'Distance']]
Y = data[['fare_amount']]  # double brackets keep the target a 2-D DataFrame

# === Train-test split ===
# Split BEFORE scaling. Fitting the scalers on the full dataset would let the
# mean and standard deviation of the test rows leak into preprocessing.
# The same test_size and random_state as before select the same rows.
x_train_raw, x_test_raw, y_train_raw, y_test_raw = tts(
    X, Y, test_size=0.2, random_state=0
)

# === Scaling ===
# Separate scalers for features and target. Each one learns its statistics
# from the training rows only and is then applied unchanged to the test rows.
std_x = StandardScaler()
std_y = StandardScaler()
x_train = std_x.fit_transform(x_train_raw)
x_test = std_x.transform(x_test_raw)
y_train = std_y.fit_transform(y_train_raw)
y_test = std_y.transform(y_test_raw)

# Full standardized arrays, kept under their previous names for any later code
# that still refers to them. They now use the training-set statistics.
x_std = std_x.transform(X)
y_std = std_y.transform(Y)

# === 1. Linear Regression ===
# Fit on the training split; the test-split predictions are stored in y_pred
# so they can be reused by the model comparison further down.
l_regression = lr().fit(x_train, y_train)
y_pred = l_regression.predict(x_test)

# .score() reports the coefficient of determination (R^2) of the fitted model.
print("--- Linear Regression ---")
print(f"Training Set Score = {l_regression.score(x_train, y_train):.4f}")
print(f"Test Set Score = {l_regression.score(x_test, y_test):.4f}")

# === 2. Random Forest Regression ===
# n_jobs=-1 builds the 100 trees on all available CPU cores instead of one.
# random_state still fixes the seeds, so the fitted forest should not change.
rf_regression = rf(n_estimators=100, random_state=10, n_jobs=-1)

# The forest expects a 1-D target. Flatten the (n, 1) scaled column once and
# use the same 1-D view for fitting and scoring.
y_train_flat = y_train.ravel()
y_test_flat = y_test.ravel()
rf_regression.fit(x_train, y_train_flat)
y_pred_rf = rf_regression.predict(x_test)

print("\n--- Random Forest ---")
# .score() already returns R^2 for regressors; no manual computation needed.
rf_r2_train = rf_regression.score(x_train, y_train_flat)
rf_r2_test = rf_regression.score(x_test, y_test_flat)
print(f"Training Set Score = {rf_r2_train:.4f}")
print(f"Test Set Score = {rf_r2_test:.4f}")


# === 3. MODEL COMPARISON ===
# Both models were trained on a standardized target, so an RMSE computed
# directly on y_test is expressed in standard deviations of the target.
# StandardScaler maps y -> (y - mean_) / scale_, so multiplying that RMSE by
# scale_ converts it back to the units of fare_amount. R2 is scale-free and
# needs no conversion.
fare_scale = std_y.scale_[0]

# Linear Regression metrics
r2_lr = me.r2_score(y_test, y_pred)
rmse_lr = np.sqrt(me.mean_squared_error(y_test, y_pred))
rmse_lr_fare = rmse_lr * fare_scale

# Random Forest metrics
r2_rf = me.r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(me.mean_squared_error(y_test, y_pred_rf))
rmse_rf_fare = rmse_rf * fare_scale

print("\n=== FINAL MODEL COMPARISON ===")
print(
    f"Linear Regression   ->   R2 = {r2_lr:.4f}   "
    f"RMSE (scaled) = {rmse_lr:.4f}   RMSE (fare units) = {rmse_lr_fare:.4f}"
)
print(
    f"Random Forest       ->   R2 = {r2_rf:.4f}   "
    f"RMSE (scaled) = {rmse_rf:.4f}   RMSE (fare units) = {rmse_rf_fare:.4f}"
)

# The model with the higher test-set R2 is reported as the better one.
if r2_rf > r2_lr:
    print("\nRandom Forest performed better.")
else:
    print("\nLinear Regression performed better (or equal).")

--- Linear Regression ---
Training Set Score = 0.7461
Test Set Score = 0.7447
