In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline

### 1. Pre-process the dataset

In [3]:
# Read the fare records and discard every row containing a missing value,
# so `data` holds only complete rows from here on.
data = pd.read_csv('UberFares.csv').dropna()

In [7]:
# Feature matrix: pickup/dropoff coordinates plus the passenger count.
X = data.loc[:, [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]]
# Regression target: the `fare_amount` column.
y = data.loc[:, 'fare_amount']

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 2. Identify Outliers (TODO: no outlier detection or handling is implemented yet)

### 3. Check the Correlation

In [9]:
# Pairwise correlations (pandas default method: Pearson) between the numeric
# columns only. Since pandas 2.0, DataFrame.corr() no longer silently skips
# non-numeric columns and raises on them instead, so the frame is restricted
# to numeric dtypes explicitly; this form runs on both old and new pandas.
correlation_matrix = data.select_dtypes(include='number').corr()
print("Correlation Matrix:\n", correlation_matrix)

Correlation Matrix:
                    Unnamed: 0  fare_amount  pickup_longitude  pickup_latitude  \
Unnamed: 0           1.000000     0.000587          0.000230        -0.000341   
fare_amount          0.000587     1.000000          0.010458        -0.008482   
pickup_longitude     0.000230     0.010458          1.000000        -0.816461   
pickup_latitude     -0.000341    -0.008482         -0.816461         1.000000   
dropoff_longitude    0.000270     0.008986          0.833026        -0.774787   
dropoff_latitude     0.000271    -0.011014         -0.846324         0.702367   
passenger_count      0.002259     0.010158         -0.000415        -0.001559   

                   dropoff_longitude  dropoff_latitude  passenger_count  
Unnamed: 0                  0.000270          0.000271         0.002259  
fare_amount                 0.008986         -0.011014         0.010158  
pickup_longitude            0.833026         -0.846324        -0.000415  
pickup_latitude            -0.7747

### 4. Implement Linear Regression and Ridge/Lasso Regression Models

In [21]:
# Unregularized ordinary least squares baseline.
lr = LinearRegression()
# Ridge adds an L2 penalty on the coefficients; alpha sets its strength.
ridge = Ridge(alpha=1.0)
# Lasso adds an L1 penalty on the coefficients; alpha sets its strength.
lasso = Lasso(alpha=1.0)

### Create pipelines that standardize the input features before each regressor (the target variable is left untransformed)

In [13]:
# Linear regression preceded by feature standardization; the scaler is fitted
# together with the regressor whenever the pipeline's fit() is called.
pipeline_lr = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', lr),
])

In [14]:
# Ridge regression preceded by feature standardization; the scaler is fitted
# together with the regressor whenever the pipeline's fit() is called.
pipeline_ridge = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', ridge),
])

In [15]:
# Lasso regression preceded by feature standardization; the scaler is fitted
# together with the regressor whenever the pipeline's fit() is called.
pipeline_lasso = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', lasso),
])

### Fit the models

In [16]:
# Fit all three pipelines on the training split only, leaving the test split
# untouched for evaluation.
pipeline_lr.fit(X_train, y_train)
pipeline_ridge.fit(X_train, y_train)
# Last expression in the cell, so its return value (the fitted Lasso pipeline)
# is what the notebook displays as the cell output.
pipeline_lasso.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Lasso())])

### 5. Evaluate the Models

In [17]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Already-fitted estimator or pipeline.
    X_test : features passed to ``model.predict``.
    y_test : true target values for ``X_test``.

    Returns
    -------
    tuple
        ``(r2, rmse, mae)``: the R^2 score, the root mean squared error
        (square root of the MSE) and the mean absolute error of the
        predictions against ``y_test``.
    """
    predictions = model.predict(X_test)
    return (
        r2_score(y_test, predictions),
        np.sqrt(mean_squared_error(y_test, predictions)),
        mean_absolute_error(y_test, predictions),
    )

In [18]:
# Score every fitted pipeline on the test split; evaluate_model returns the
# metrics in (R2, RMSE, MAE) order, unpacked here per model.
(
    (r2_lr, rmse_lr, mae_lr),
    (r2_ridge, rmse_ridge, mae_ridge),
    (r2_lasso, rmse_lasso, mae_lasso),
) = [
    evaluate_model(fitted_pipeline, X_test, y_test)
    for fitted_pipeline in (pipeline_lr, pipeline_ridge, pipeline_lasso)
]

In [20]:
# One report line per model: label followed by its R2, RMSE and MAE values.
for report_fields in (
    ("Linear Regression - R2:", r2_lr, "RMSE:", rmse_lr, "MAE:", mae_lr),
    ("Ridge Regression - R2:", r2_ridge, "RMSE:", rmse_ridge, "MAE:", mae_ridge),
    ("Lasso Regression - R2:", r2_lasso, "RMSE:", rmse_lasso, "MAE:", mae_lasso),
):
    print(*report_fields)

Linear Regression - R2: 0.00034152697863043535 RMSE: 10.197470623964248 MAE: 6.068508583048691
Ridge Regression - R2: 0.0003415255557153163 RMSE: 10.197470631221794 MAE: 6.068508571062477
Lasso Regression - R2: -1.2270548227721889e-05 RMSE: 10.199275000569145 MAE: 6.069560575923911
