In [None]:
## Load and explore the dataset

In [None]:
import pandas as pd

# Load the dataset.
# The CSV location can be overridden with the UBER_CSV_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used unchanged.
import os

csv_path = os.environ.get('UBER_CSV_PATH', 'C:\\Users\\vaibh\\Downloads\\uber.csv')
df = pd.read_csv(csv_path)

# Display the first few rows of the dataframe
df.head()

## Data Preprocessing

## Initial Exploration

In [None]:
# Summary statistics as produced by DataFrame.describe()
stats = df.describe()

# Number of rows recorded for each distinct passenger_count value
passenger_counts = df.loc[:, 'passenger_count'].value_counts()

# Emit both tables as plain text, one after the other
print(stats, passenger_counts, sep='\n')

In [None]:
# Print pandas' built-in summary of the loaded frame (DataFrame.info()).
df.info()

## Cleaning the dataset


In [None]:
# DataFrame.dropna() returns a new frame and leaves `df` untouched, so the
# result must be bound back to `df` for the cleaning to reach the cells
# below. Re-running this cell is harmless (dropping NaNs twice is a no-op);
# re-run the load cell if the raw rows are needed again.
df = df.dropna()

# Show the remaining (rows, columns) instead of rendering the whole frame
df.shape

In [None]:
# Summary statistics of the fare column
fare_summary = df['fare_amount'].describe()

# Visualize the distribution of fare_amount
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.hist(df['fare_amount'], bins=50, color='blue', alpha=0.7, edgecolor='black')
plt.xlabel('Fare Amount ($)')
plt.ylabel('Frequency')
plt.title('Distribution of Fare Amount')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Render the figure as its own statement. Previously plt.show() was placed
# inside a tuple with fare_summary, so the cell output was `(summary, None)`.
plt.show()

# Output the summary as the cell's displayed value
fare_summary

In [None]:
# Training and testing of the dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Numeric columns used as model inputs. Dropping only 'fare_amount' would
# leave every other column in X, including 'pickup_datetime', which is still
# a raw string at this point (it is only parsed to datetime in a later cell);
# a linear regression cannot be fitted on string input.
feature_columns = ['pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

# The estimator also rejects NaN, so keep only rows that are complete in the
# features and the target. X and y are taken from the same filtered frame so
# their rows stay aligned.
model_rows = df.dropna(subset=feature_columns + ['fare_amount'])

X = model_rows[feature_columns]
y = model_rows['fare_amount']


In [None]:
# Split features and target into training and test partitions, holding out
# 20% of the rows for testing; random_state fixes the shuffle so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Shapes of the four partitions, in the order X_train, X_test, y_train, y_test
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [None]:
# Column names earmarked as categorical features.
# NOTE(review): no visible cell adds 'time_of_day' or 'distance_category' to
# any frame ('day_of_week' is only derived later, on the sampled frame), and
# this list is not referenced again in the visible notebook; confirm whether
# it is still needed.
categorical_features = ['day_of_week', 'time_of_day', 'distance_category']

In [None]:
# Rows lacking a drop-off coordinate, kept for inspection
missing_rows = df[df[['dropoff_longitude', 'dropoff_latitude']].isnull().any(axis=1)]

# Remove rows with missing values
uber_data_clean = df.dropna()

# Check if any missing values remain. This must look at the cleaned frame;
# inspecting `df` would just report the original NaN counts again.
remaining_missing = uber_data_clean.isnull().sum()

# Count every row dropna() removed. dropna() discards rows with a NaN in any
# column, not only the drop-off coordinates, so use the difference in length.
num_removed = len(df) - len(uber_data_clean)

# Output the summary of missing values and the count of rows removed
remaining_missing, num_removed

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and keep 80% for training;
# the fixed seed makes the partition repeatable.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Row counts of both partitions, shown as the cell output
train_size, test_size = map(len, (train_data, test_data))
train_size, test_size

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize
numerical_columns = [ 'fare_amount', 'passenger_count']


def _standardize(frame):
    """Return a copy of `frame` whose numerical columns went through `scaler`."""
    scaled = frame.copy()
    scaled[numerical_columns] = scaler.transform(frame[numerical_columns])
    return scaled


# Fit the scaler on the training partition only, so the statistics it
# learns never see the test rows.
scaler = StandardScaler()
scaler.fit(train_data[numerical_columns])

# Apply the training-fitted scaler to both partitions
train_data_scaled = _standardize(train_data)
test_data_scaled = _standardize(test_data)

# Verify the transformation
train_data_scaled.head()

In [None]:
# Correlation matrix

In [None]:
# Columns that take part in the correlation analysis
relevant_columns = [
    'fare_amount',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]

# Pairwise correlations between the selected columns
correlation_matrix = df.loc[:, relevant_columns].corr()

# Correlation of each column with the fare, strongest positive first
fare_correlation = correlation_matrix.loc[:, 'fare_amount'].sort_values(ascending=False)
fare_correlation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the correlation matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Baseline linear regression on the current X_train / X_test split.
# The metric functions are imported here because no earlier visible cell
# brings them into scope; without the import the calls below raised a
# NameError that the handler silently swallowed.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

try:
    # Training the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Making predictions on the test set
    y_pred = model.predict(X_test)

    # Calculating evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    metrics = (mse, mae, r2)
except Exception as e:
    # Best effort: keep the notebook running, but record why the fit failed
    metrics = str(e)

# Display the outcome. This has to be the last top-level expression of the
# cell; inside the except body it was evaluated but never shown.
metrics



In [None]:
# Draw a reproducible 10,000-row sample, parse the pickup timestamp and
# derive hour, day of the week and month from it.
sample_data = (
    df.sample(n=10000, random_state=1)
    .assign(pickup_datetime=lambda frame: pd.to_datetime(frame['pickup_datetime']))
    .assign(
        hour=lambda frame: frame['pickup_datetime'].dt.hour,
        day_of_week=lambda frame: frame['pickup_datetime'].dt.dayofweek,
        month=lambda frame: frame['pickup_datetime'].dt.month,
    )
)

# Per-column count of missing values in the sample
missing_values_sample = sample_data.isnull().sum()

# Rows with at least one coordinate outside the box
# latitude [40.5, 41], longitude [-74.25, -73.75]
out_of_bounds_sample = sample_data[
    sample_data['pickup_latitude'].lt(40.5) | sample_data['pickup_latitude'].gt(41)
    | sample_data['pickup_longitude'].lt(-74.25) | sample_data['pickup_longitude'].gt(-73.75)
    | sample_data['dropoff_latitude'].lt(40.5) | sample_data['dropoff_latitude'].gt(41)
    | sample_data['dropoff_longitude'].lt(-74.25) | sample_data['dropoff_longitude'].gt(-73.75)
]

missing_values_sample, out_of_bounds_sample.shape

In [None]:
# Keep only trips whose pickup and drop-off both fall inside the box
# latitude [40.5, 41], longitude [-74.25, -73.75] (bounds inclusive).
cleaned_data = sample_data[
    sample_data['pickup_latitude'].between(40.5, 41)
    & sample_data['pickup_longitude'].between(-74.25, -73.75)
    & sample_data['dropoff_latitude'].between(40.5, 41)
    & sample_data['dropoff_longitude'].between(-74.25, -73.75)
]

# Display the shape of the cleaned data
cleaned_data.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Feature matrix (coordinates, passenger count, derived time parts) and target
X = cleaned_data[[
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'hour',
    'day_of_week',
    'month',
]]
y = cleaned_data['fare_amount']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the linear model (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

# Evaluation metrics, computed in the order MSE, MAE, R^2
mse, mae, r2 = (
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

mse, mae, r2

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# The previous cell already builds X / y from `cleaned_data`, makes the
# split (test_size=0.2, random_state=0) and fits `model`. Repeating those
# steps here only produced the same model again, so this cell now just
# evaluates the fitted linear model and prints the report.

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}, MAE: {mae}, R^2: {r2}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidate settings
param_grid = dict(
    n_estimators=[100, 200, 300],     # number of trees
    max_depth=[10, 20, 30],           # maximum depth of each tree
    min_samples_split=[2, 5, 10],     # minimum samples needed to split an internal node
)

# Base estimator with a fixed seed
rf = RandomForestRegressor(random_state=0)

# 3-fold cross-validated search over the grid, scored by negative MSE,
# using all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    verbose=2,
)

# Run the search on the training partition
grid_search.fit(X_train, y_train)

# Best-scoring estimator found by the search
best_rf = grid_search.best_estimator_

# Score that estimator on the held-out rows (order: MSE, MAE, R^2)
y_pred = best_rf.predict(X_test)
mse, mae, r2 = (
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"MSE: {mse}, MAE: {mae}, R^2: {r2}")