In [15]:
import pandas as pd
import numpy as np
import joblib

# Load the model and transformers
xgb_model = joblib.load('xgb_model_polynomial.pkl')
encoder = joblib.load('encoder.pkl')
poly = joblib.load('poly.pkl')

# Function to calculate PCA values based on trip distance and duration
def calculate_pca_values(trip_distance, trip_duration):
    pca1 = np.log1p(trip_distance) - np.log1p(trip_duration)
    pca2 = np.log1p(trip_duration) - np.log1p(trip_distance)
    return pca1, pca2

def predict_fare(trip_distance, trip_duration, tip_amount, is_holiday, 
                 pickup_time_of_day, pickup_season, passenger_count_category, 
                 pickup_day_type, pickup_location, dropoff_location):
    # Generalize zones into categories
    general_locations = {
        "downtown": ["Midtown Center", "Midtown East", "Lower Manhattan"],
        "suburbs": ["Manhattan Valley", "East Harlem South", "Upper West Side", "Upper East Side"],
        "airport": ["JFK Airport", "LaGuardia Airport"]
    }
    
    # Map general locations to zones (example mapping)
    if pickup_location in general_locations["downtown"]:
        puzone = "Midtown Center"
    elif pickup_location in general_locations["airport"]:
        puzone = "JFK Airport"
    else:
        puzone = "Manhattan Valley"
    
    if dropoff_location in general_locations["downtown"]:
        dozone = "Midtown Center"
    elif dropoff_location in general_locations["airport"]:
        dozone = "JFK Airport"
    else:
        dozone = "Manhattan Valley"
    
    # Calculate PCA values
    pca1, pca2 = calculate_pca_values(trip_distance, trip_duration)
    
    # Prepare the input data as a DataFrame
    input_data = pd.DataFrame({
        'trip_distance': [trip_distance],
        'trip_duration': [trip_duration],
        'tip_amount': [tip_amount],
        'PCA1': [pca1],
        'PCA2': [pca2],
        'is_holiday': [is_holiday],
        'pickup_time_of_day': [pickup_time_of_day],
        'pickup_season': [pickup_season],
        'passenger_count_category': [passenger_count_category],
        'pickup_day_type': [pickup_day_type],
        'PUzone': [puzone],
        'PUborough': ["Manhattan"],  # Assuming single borough for simplicity
        'DOzone': [dozone],
        'DOborough': ["Manhattan"]  # Assuming single borough for simplicity
    })
    
    # Encode the input data
    input_encoded = encoder.transform(input_data)
    
    # Apply polynomial transformations
    input_poly = poly.transform(input_encoded)
    
    # Predict the fare
    predicted_fare = xgb_model.predict(input_poly)
    
    # Ensure the predicted fare is non-negative
    predicted_fare = max(predicted_fare[0], 0)
    
    return predicted_fare

# Example usage
trip_distance = float(input("Enter trip distance (Miles): "))
trip_duration = float(input("Enter trip duration (minutes): "))
tip_amount = float(input("Enter tip amount: "))
is_holiday = int(input("Is it a holiday? (1 for Yes, 0 for No): "))
pickup_time_of_day = input("Enter pickup time of day (morning/afternoon/evening/night): ")
pickup_season = input("Enter pickup season (winter/spring/summer/autumn): ")
passenger_count_category = input("Enter passenger count category (low/medium/high): ")
pickup_day_type = input("Enter pickup day type (weekday/weekend): ")
pickup_location = input("Enter pickup location (downtown/suburbs/airport): ")
dropoff_location = input("Enter drop-off location (downtown/suburbs/airport): ")

fare = predict_fare(trip_distance, trip_duration, tip_amount, is_holiday, 
                    pickup_time_of_day, pickup_season, passenger_count_category, 
                    pickup_day_type, pickup_location, dropoff_location)
print(f"Predicted Fare: {fare}")


Enter trip distance (km): 1
Enter trip duration (minutes): 1
Enter tip amount: 0
Is it a holiday? (1 for Yes, 0 for No): 1
Enter pickup time of day (morning/afternoon/evening/night): afternoon
Enter pickup season (winter/spring/summer/autumn): winter
Enter passenger count category (low/medium/high): low
Enter pickup day type (weekday/weekend): weekday
Enter pickup location (downtown/suburbs/airport): downtown
Enter drop-off location (downtown/suburbs/airport): downtown
Predicted Fare: 139.96099853515625


## Integration Tests

We can select from our dataset 2 or 3 actual cases and see how the model performs compared to them 

In [14]:
# Integration test function
def integration_test():
    test_cases = [
        (10, 10, 5, 1, "morning", "winter", "low", "weekday", "downtown", "suburbs"),
        (5, 15, 2, 0, "afternoon", "spring", "medium", "weekend", "airport", "downtown")
    ]
    
    for trip_distance, trip_duration, tip_amount, is_holiday, pickup_time_of_day, pickup_season, passenger_count_category, pickup_day_type, pickup_location, dropoff_location in test_cases:
        fare = predict_fare(trip_distance, trip_duration, tip_amount, is_holiday, 
                            pickup_time_of_day, pickup_season, passenger_count_category, 
                            pickup_day_type, pickup_location, dropoff_location)
        print(f"Predicted fare: {fare}")
        
integration_test()


Predicted fare: 71.61669921875
Predicted fare: 60.9279670715332


In [12]:
import unittest

class TestFarePrediction(unittest.TestCase):
    def setUp(self):
        # Load the model and transformers
        self.xgb_model = joblib.load('xgb_model_polynomial.pkl')
        self.encoder = joblib.load('encoder.pkl')
        self.poly = joblib.load('poly.pkl')
    
    def test_pca_values(self):
        pca1, pca2 = calculate_pca_values(10, 20)
        self.assertAlmostEqual(pca1, np.log1p(10) - np.log1p(20), places=5)
        self.assertAlmostEqual(pca2, np.log1p(20) - np.log1p(10), places=5)
    
    def test_predict_fare(self):
        # Define some example input values
        trip_distance = 10
        trip_duration = 20
        tip_amount = 5
        is_holiday = 1
        pickup_time_of_day = "morning"
        pickup_season = "winter"
        passenger_count_category = "low"
        pickup_day_type = "weekday"
        pickup_location = "downtown"
        dropoff_location = "suburbs"
        
        # Predict the fare
        fare = predict_fare(trip_distance, trip_duration, tip_amount, is_holiday, 
                            pickup_time_of_day, pickup_season, passenger_count_category, 
                            pickup_day_type, pickup_location, dropoff_location)
        
        # Ensure the predicted fare is non-negative
        self.assertGreaterEqual(fare, 0)

if __name__ == '__main__':
    unittest.main()


E
ERROR: /Users/md/Library/Jupyter/runtime/kernel-48acf81d-5213-4318-a5c4-adebd47e3291 (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/Users/md/Library/Jupyter/runtime/kernel-48acf81d-5213-4318-a5c4-adebd47e3291'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Example known inputs
test_inputs = [
    {"trip_distance": 10, "trip_duration": 20, "tip_amount": 5, "is_holiday": 1, 
     "pickup_time_of_day": "morning", "pickup_season": "winter", "passenger_count_category": "low", 
     "pickup_day_type": "weekday", "pickup_location": "downtown", "dropoff_location": "suburbs"},
    {"trip_distance": 15, "trip_duration": 25, "tip_amount": 2, "is_holiday": 0, 
     "pickup_time_of_day": "afternoon", "pickup_season": "summer", "passenger_count_category": "medium", 
     "pickup_day_type": "weekend", "pickup_location": "airport", "dropoff_location": "downtown"}
]

for test_input in test_inputs:
    fare = predict_fare(**test_input)
    print(f"Predicted Fare for input {test_input}: {fare}")


In [None]:
from sklearn.model_selection import cross_val_score

# Assuming X_train_poly and y_train are already defined and processed
scores = cross_val_score(xgb_model, X_train_poly, y_train, cv=5, scoring='neg_mean_absolute_error')
print("Cross-validation MAE scores:", -scores)
print("Mean Cross-validation MAE:", -scores.mean())


In [None]:
def integration_test():
    test_cases = [
        (10, 20, 5, 1, "morning", "winter", "low", "weekday", "downtown", "suburbs"),
        (5, 15, 2, 0, "afternoon", "spring", "medium", "weekend", "airport", "downtown")
    ]
    
    for trip_distance, trip_duration, tip_amount, is_holiday, pickup_time_of_day, pickup_season, passenger_count_category, pickup_day_type, pickup_location, dropoff_location in test_cases:
        fare = predict_fare(trip_distance, trip_duration, tip_amount, is_holiday, 
                            pickup_time_of_day, pickup_season, passenger_count_category, 
                            pickup_day_type, pickup_location, dropoff_location)
        print(f"Predicted fare: {fare}")
        
integration_test()


In [None]:
python -m unittest your_test_script.py
