In [1]:
import pandas as pd
import numpy as np


In [5]:
# Print the column names, dtypes and non-null counts of the loaded dataset.
# NOTE(review): no earlier visible cell defines `df`; this cell relies on `df`
# already existing in the kernel — confirm it is loaded before this runs.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ship_id             1440 non-null   object 
 1   ship_type           1440 non-null   object 
 2   route_id            1440 non-null   object 
 3   month               1440 non-null   object 
 4   distance            1440 non-null   float64
 5   fuel_type           1440 non-null   object 
 6   fuel_consumption    1440 non-null   float64
 7   CO2_emissions       1440 non-null   float64
 8   weather_conditions  1440 non-null   object 
 9   engine_efficiency   1440 non-null   float64
dtypes: float64(4), object(6)
memory usage: 112.6+ KB


In [9]:
# Count the distinct values in every column of `df`.
# NOTE(review): relies on `df` already existing in the kernel — no earlier
# visible cell defines it.
df.nunique()

ship_id                120
ship_type                4
route_id                 4
month                   12
distance              1398
fuel_type                2
fuel_consumption      1439
CO2_emissions         1440
weather_conditions       3
engine_efficiency     1089
dtype: int64

In [2]:
!pip install scikit-learn





In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error

# Train a random-forest pipeline that predicts total `fuel_consumption`
# from ship, trip and weather attributes, report R² / MAE on a 20% hold-out
# split, and pickle the fitted pipeline.

# --- Configuration ---
FUEL_DATA_PATH = "fuel_data.csv"  # <-- Make sure this matches your uploaded file name
# This cell trains the total-consumption model, so it gets its own artifact name.
# Saving it as "fuel_model_efficiency.pkl" would collide with the per-distance
# efficiency model that a later cell writes to that same path.
MODEL_SAVE_PATH = "fuel_model.pkl"

# --- 1. Load Data ---
try:
    df = pd.read_csv(FUEL_DATA_PATH)
    print(f"Successfully loaded data from {FUEL_DATA_PATH}")
except FileNotFoundError:
    print(f"Error: File not found at {FUEL_DATA_PATH}")
    print("Please upload your fuel data CSV and update the 'FUEL_DATA_PATH' variable.")
    # Re-raise instead of calling exit(): in a notebook exit() tears down the
    # kernel rather than stopping this cell with a readable error.
    raise

# --- 2. Define Features (X) and Target (y) ---
# Based on your feedback, we use only features relevant to a single step
target = "fuel_consumption"
features = [
    'ship_type', 'month', 'distance', 'fuel_type',
    'weather_conditions', 'engine_efficiency'
]

# 'ship_id' and 'route_id' are left out, and so is 'CO2_emissions'
# (target leakage); selecting only `features` drops them implicitly.
X = df[features]
y = df[target]

# --- 3. Preprocessing ---
# Numeric columns are standardised; categorical columns are one-hot encoded.
numeric_features = ['distance', 'engine_efficiency']
categorical_features = ['ship_type', 'month', 'fuel_type', 'weather_conditions']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# handle_unknown='ignore' lets prediction proceed when a category was not
# seen during fit (the unseen value is encoded as all zeros).
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# --- 4. Create the Full Model Pipeline ---
# Preprocessing and the regressor live in one object so that the saved model
# accepts raw feature columns at prediction time.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# --- 5. Train and Evaluate the Model ---
# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nStarting model training...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

# Evaluate the model on the held-out test set
y_pred = model_pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\n--- Model Evaluation ---")
print(f"R-squared (R²): {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# --- 6. Save the Model ---
# We save the *entire* pipeline, including the preprocessor
with open(MODEL_SAVE_PATH, 'wb') as f:
    pickle.dump(model_pipeline, f)

print(f"\n✅ Model pipeline successfully trained and saved to {MODEL_SAVE_PATH}")

Successfully loaded data from ship_fuel_efficiency.csv

Starting model training...
Model training complete.

--- Model Evaluation ---
R-squared (R²): 0.9518
Mean Absolute Error (MAE): 657.9638

✅ Model pipeline successfully trained and saved to fuel_model.pkl


In [3]:
import pandas as pd

# Report how much each transformed input column contributes to the forest.
# Expects `model_pipeline` to have been fitted by an earlier cell.

# The final pipeline step is the fitted RandomForestRegressor.
model = model_pipeline.named_steps['regressor']

# Column labels as they look *after* preprocessing (scaled numerics and
# one-hot encoded categories), taken from the fitted preprocessor step.
feature_names = model_pipeline.named_steps['preprocessor'].get_feature_names_out()

# Raw importance score for each of those columns.
importances = model.feature_importances_

# Pair every score with its label for readable output.
importances_df = pd.Series(data=importances, index=feature_names)

# Largest contribution first.
ranked_importances = importances_df.sort_values(ascending=False)
print("--- Model Feature Importances ---")
print(ranked_importances)

--- Model Feature Importances ---
num__distance                       0.939463
cat__ship_type_Tanker Ship          0.022764
cat__ship_type_Surfer Boat          0.013120
num__engine_efficiency              0.009001
cat__month_May                      0.001252
cat__month_December                 0.001226
cat__weather_conditions_Calm        0.001161
cat__month_November                 0.001120
cat__month_September                0.001090
cat__month_April                    0.000989
cat__month_June                     0.000978
cat__month_July                     0.000976
cat__weather_conditions_Stormy      0.000878
cat__weather_conditions_Moderate    0.000865
cat__fuel_type_Diesel               0.000852
cat__month_March                    0.000807
cat__fuel_type_HFO                  0.000757
cat__month_January                  0.000698
cat__month_February                 0.000509
cat__ship_type_Fishing Trawler      0.000474
cat__ship_type_Oil Service Boat     0.000399
cat__month_October   

In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np

# Train a random-forest pipeline that predicts fuel used per unit of distance
# (`fuel_consumption / distance`), report R² / MAE on a 20% hold-out split,
# pickle the fitted pipeline, and print its feature importances.

# --- Configuration ---
FUEL_DATA_PATH = "fuel_data.csv" # <-- Your file name
MODEL_SAVE_PATH = "fuel_model_efficiency.pkl" # <-- New model name

# --- 1. Load Data ---
try:
    df = pd.read_csv(FUEL_DATA_PATH)
    print(f"Successfully loaded data from {FUEL_DATA_PATH}")
except FileNotFoundError:
    print(f"Error: File not found at {FUEL_DATA_PATH}")
    # Re-raise instead of calling exit(): in a notebook exit() tears down the
    # kernel rather than stopping this cell with a readable error.
    raise

# --- 2. Create the New Target (y) and Features (X) ---
# Keep only rows with a positive distance so the ratio below never divides by
# zero; .copy() makes the filtered frame independent before adding a column.
df = df[df['distance'] > 0].copy()
df['fuel_per_distance'] = df['fuel_consumption'] / df['distance']
target = 'fuel_per_distance'

# Define features: We explicitly REMOVE distance and the original target,
# since the new target is derived from both of them.
features = [
    'ship_type', 'month', 'fuel_type',
    'weather_conditions', 'engine_efficiency'
]

X = df[features]
y = df[target]

print("\nCreated new target 'fuel_per_distance'.")

# --- 3. Preprocessing ---
# The single numeric column is standardised; categoricals are one-hot encoded.
numeric_features = ['engine_efficiency']
categorical_features = ['ship_type', 'month', 'fuel_type', 'weather_conditions']

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
# handle_unknown='ignore' lets prediction proceed when a category was not
# seen during fit (the unseen value is encoded as all zeros).
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# --- 4. Create the Full Model Pipeline ---
# Preprocessing and the regressor are bundled so the pickled object accepts
# raw feature columns at prediction time.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# --- 5. Train and Evaluate the Model ---
# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Starting model training...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

# Evaluate on the held-out rows. MAE here is in units of the ratio target
# (fuel per unit distance), not total fuel.
y_pred = model_pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\n--- Efficiency Model Evaluation ---")
print(f"R-squared (R²): {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# --- 6. Save the Model ---
with open(MODEL_SAVE_PATH, 'wb') as f:
    pickle.dump(model_pipeline, f)

print(f"\n✅ Efficiency model successfully trained and saved to {MODEL_SAVE_PATH}")

# --- 7. Check Feature Importances ---
# Pair the forest's importance scores with the post-preprocessing column
# names and print them from most to least important.
print("\n--- Model Feature Importances (Efficiency Model) ---")
model = model_pipeline.named_steps['regressor']
importances = model.feature_importances_
feature_names = model_pipeline.named_steps['preprocessor'].get_feature_names_out()
importances_df = pd.Series(importances, index=feature_names)
print(importances_df.sort_values(ascending=False))

Successfully loaded data from fuel_data.csv

Created new target 'fuel_per_distance'.
Starting model training...
Model training complete.

--- Efficiency Model Evaluation ---
R-squared (R²): 0.8329
Mean Absolute Error (MAE): 3.3205

✅ Efficiency model successfully trained and saved to fuel_model_efficiency.pkl

--- Model Feature Importances (Efficiency Model) ---
cat__ship_type_Surfer Boat          0.452873
cat__ship_type_Tanker Ship          0.356096
num__engine_efficiency              0.086667
cat__ship_type_Fishing Trawler      0.018221
cat__ship_type_Oil Service Boat     0.012725
cat__weather_conditions_Calm        0.005596
cat__weather_conditions_Moderate    0.005564
cat__weather_conditions_Stormy      0.004980
cat__month_April                    0.004824
cat__month_May                      0.004769
cat__month_June                     0.004687
cat__month_September                0.004647
cat__month_December                 0.004331
cat__month_October                  0.004262
cat__

In [2]:
import pickle
import pandas as pd
import numpy as np

# Load the pickled efficiency pipeline and print its predictions for a few
# hand-written scenarios, scaled to a fixed step distance.

# --- Configuration ---
MODEL_PATH = "fuel_model_efficiency.pkl"
# assumes the training data's `distance` column is in km — TODO confirm
TEST_DISTANCE_KM = 10.0  # Let's test the fuel cost for a 10km step

# --- Load the Model ---
# pickle.load executes arbitrary code from the file; only load model files
# produced by this notebook or another trusted source.
try:
    with open(MODEL_PATH, "rb") as f:
        model = pickle.load(f)
    print(f"✅ Successfully loaded model from {MODEL_PATH}\n")
except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at {MODEL_PATH}")
    print("Please make sure 'fuel_model_efficiency.pkl' is in the same directory.")
    # Re-raise instead of calling exit(): in a notebook exit() tears down the
    # kernel rather than stopping this cell with a readable error.
    raise

# --- Define Your Custom Test Cases ---
# The model was trained on these features:
# 'ship_type', 'month', 'fuel_type', 'weather_conditions', 'engine_efficiency'
# Each case also carries an 'id' label that is used for logging only.
# NOTE(review): in the recorded training run every weather_conditions
# importance is below 1%, so Storm-vs-Calm differences printed here should not
# be read as a real weather effect without further validation.

test_cases = [
    {
        "id": "Tanker in Storm",
        "ship_type": "Tanker Ship",
        "month": "January",
        "fuel_type": "HFO",
        "weather_conditions": "Stormy",
        "engine_efficiency": 90.0
    },
    {
        "id": "Tanker in Calm",
        "ship_type": "Tanker Ship",
        "month": "January",
        "fuel_type": "HFO",
        "weather_conditions": "Calm",
        "engine_efficiency": 90.0
    },
    {
        "id": "Surfer Boat in Storm",
        "ship_type": "Surfer Boat",
        "month": "January",
        "fuel_type": "Diesel",
        "weather_conditions": "Stormy",
        "engine_efficiency": 85.0
    },
    {
        "id": "Surfer Boat in Calm",
        "ship_type": "Surfer Boat",
        "month": "January",
        "fuel_type": "Diesel",
        "weather_conditions": "Calm",
        "engine_efficiency": 85.0
    }
]

# --- Run Predictions ---
for test_case in test_cases:
    print(f"--- Test Case: {test_case['id']} ---")

    # 1. Create the input DataFrame (model expects this format)
    # We remove the 'id' field as it's just for logging; the copy keeps the
    # original test_case dict intact for the separator line below.
    input_data = test_case.copy()
    del input_data['id']

    # Single-row frame whose columns are the model's feature names.
    input_df = pd.DataFrame([input_data])

    # 2. Predict the Efficiency (Fuel per km)
    try:
        predicted_efficiency = model.predict(input_df)[0]

        # 3. Calculate Total Fuel for our test distance
        total_fuel_for_step = predicted_efficiency * TEST_DISTANCE_KM

        print(f"  Input: {input_data['weather_conditions']} | {input_data['ship_type']} | {input_data['engine_efficiency']}% eff")
        print(f"  > Predicted Efficiency (Fuel/km): {predicted_efficiency:.4f}")
        print(f"  > Total Fuel for {TEST_DISTANCE_KM}km step: {total_fuel_for_step:.4f}")
        print("-" * (20 + len(test_case['id'])))

    except Exception as e:
        # Deliberate best-effort: report the failing case and continue with
        # the remaining scenarios instead of aborting the whole loop.
        print(f"  Error predicting for this case: {e}")

✅ Successfully loaded model from fuel_model_efficiency.pkl

--- Test Case: Tanker in Storm ---
  Input: Stormy | Tanker Ship | 90.0% eff
  > Predicted Efficiency (Fuel/km): 38.9880
  > Total Fuel for 10.0km step: 389.8800
-----------------------------------
--- Test Case: Tanker in Calm ---
  Input: Calm | Tanker Ship | 90.0% eff
  > Predicted Efficiency (Fuel/km): 41.1713
  > Total Fuel for 10.0km step: 411.7134
----------------------------------
--- Test Case: Surfer Boat in Storm ---
  Input: Stormy | Surfer Boat | 85.0% eff
  > Predicted Efficiency (Fuel/km): 13.8916
  > Total Fuel for 10.0km step: 138.9156
----------------------------------------
--- Test Case: Surfer Boat in Calm ---
  Input: Calm | Surfer Boat | 85.0% eff
  > Predicted Efficiency (Fuel/km): 12.9588
  > Total Fuel for 10.0km step: 129.5876
---------------------------------------
