In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Load data
data_train = pd.read_csv('fare_taxi_train.csv')
data_test = pd.read_csv('fare_taxi_test.csv')

# Preprocess datetime column: replace each pickup timestamp with whole seconds
# since the Unix epoch so the model receives a plain numeric feature.
# The value is obtained by dividing the offset from the epoch by one second
# rather than by casting to int64 and dividing by 10**9: the cast exposes the
# column's storage unit, which is nanoseconds only for datetime64[ns], so a
# hard-coded divisor is not safe in general.
unix_epoch = pd.Timestamp('1970-01-01', tz='UTC')
one_second = pd.Timedelta(seconds=1)
for frame in (data_train, data_test):
    # utc=True gives a tz-aware UTC column whether or not the raw strings
    # carry an offset; naive strings are interpreted as UTC.
    pickup = pd.to_datetime(frame['pickup_datetime'], utc=True)
    # Missing timestamps (NaT) come out as NaN instead of a garbage integer.
    frame['pickup_datetime'] = (pickup - unix_epoch) // one_second

# Column groups. Only the numeric columns are fed to the model below;
# 'key' is recorded as categorical but is not used for training.
categorical_data = ['key']
numeric_data = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'pickup_datetime',
]

# Target is the fare; every remaining column is a candidate feature
y = data_train['fare_amount']
x = data_train.drop(columns=['fare_amount'])

# Hold out 20% of the rows as a test split (fixed seed for repeatability)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Restrict both splits to the numeric model inputs
x_train = x_train.loc[:, numeric_data]
x_test = x_test.loc[:, numeric_data]

# Fill missing values with column means learned from the training split only,
# then apply the same fitted imputer to both splits
imputer = SimpleImputer(strategy='mean').fit(x_train)
x_train = imputer.transform(x_train)
x_test = imputer.transform(x_test)

# Build and train the random forest
rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
)
rfr.fit(x_train, y_train)

# Predict fares for the held-out split and show them
y_pred = rfr.predict(x_test)
print(y_pred)

# Example input for a single prediction: a dictionary keyed by feature name
new_data = {
    'pickup_longitude': -73.987,
    'pickup_latitude': 40.748,
    'dropoff_longitude': -73.990,
    'dropoff_latitude': 40.752,
    'passenger_count': 2,
    'pickup_datetime': 1609459200  # Unix timestamp for January 1, 2021, 12:00:00 AM (UTC)
}
# Convert the dictionary to a one-row DataFrame. columns=numeric_data lays the
# features out in the order used for training regardless of the dictionary's
# key order; a feature missing from the dictionary becomes NaN.
new_df = pd.DataFrame([new_data], columns=numeric_data)

# Handle missing values (if any) using the means learned from the training split
new_df = imputer.transform(new_df)

# Make prediction
prediction = rfr.predict(new_df)

print("Predicted fare amount:", prediction)


[ 7.760515    7.760515   35.4160308  ...  9.28206644 47.80711434
  9.28206644]
Predicted fare amount: [9.28206644]
