In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [2]:
# Load the worker data and derive calendar features from 'Date' in one pass.
# Inside assign(), each callable sees the columns produced by the ones before
# it, so 'Date' is already datetime when the two derived columns are built.
full_data = (
    pd.read_csv("generated_worker_data.csv")
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Day_of_Week=lambda frame: frame['Date'].dt.dayofweek,  # Monday=0 ... Sunday=6
        Month=lambda frame: frame['Date'].dt.month,            # January=1 ... December=12
    )
)

In [3]:
# Shift length in hours, from the parsed end and start times.
# `.dt.seconds` is the within-day seconds component of the difference
# (0 <= value < 86400), so whole-day offsets are not counted.
# NOTE(review): no explicit `format=` is passed to to_datetime; if pandas warns
# that it cannot infer the format, confirm the layout of Time_Start/Time_End
# in the CSV and pass it explicitly.
shift_end = pd.to_datetime(full_data['Time_End'])
shift_start = pd.to_datetime(full_data['Time_Start'])
full_data['Working_Hours'] = (shift_end - shift_start).dt.seconds / 3600

# Model inputs (X) and the regression target (y)
feature_columns = [
    'Day_of_Week',
    'Month',
    'Working_Hours',
    'Crop_Type',
    'Base_Hourly_Wage',
    'Supply_Demand_Ratio',
    'Dynamic_Pricing_Multiplier',
]
X = full_data[feature_columns]
y = full_data['Total_Earnings']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

  full_data['Working_Hours'] = (pd.to_datetime(full_data['Time_End']) - pd.to_datetime(full_data['Time_Start'])).dt.seconds / 3600
  full_data['Working_Hours'] = (pd.to_datetime(full_data['Time_End']) - pd.to_datetime(full_data['Time_Start'])).dt.seconds / 3600


In [4]:
# Columns routed to the numeric branch of the preprocessor
numerical_features = [
    'Day_of_Week',
    'Month',
    'Working_Hours',
    'Base_Hourly_Wage',
    'Supply_Demand_Ratio',
    'Dynamic_Pricing_Multiplier',
]

# Columns routed to the categorical branch of the preprocessor
categorical_features = [
    'Crop_Type',
]

In [5]:
# Column-wise preprocessing that runs inside the pipeline:
#   'num' - standardise the numeric columns with StandardScaler.
#   'cat' - one-hot encode the categorical columns.
# handle_unknown='ignore' makes a category that was not present during fit
# encode as all zeros instead of raising. Without it, a Crop_Type that only
# appears in the test split (or in new data passed to the saved pipeline)
# makes predict() fail with a ValueError.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

In [6]:
# Multi-layer perceptron regressor: a single hidden layer of 100 ReLU units,
# trained with Adam for at most 500 iterations; seeded for reproducibility.
ann_model = MLPRegressor(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)


In [7]:
# Chain preprocessing and the regressor so both are fitted, applied and
# serialised together as one estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', ann_model),
    ]
)

In [8]:
# Fit every pipeline step on the training split only
pipeline.fit(X=X_train, y=y_train)



In [9]:
# Score the held-out rows with the fitted pipeline
y_pred = pipeline.predict(X_test)

# Test-set mean squared error and coefficient of determination
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Display both metrics as the cell output
(mse, r2)

(0.08925771735435657, 0.9996523898513691)

In [11]:
import pickle

# Serialise the whole fitted pipeline so it can be reloaded later with a
# single pickle.load call.
pickle_model_path = "ann_model.pkl"
with open(pickle_model_path, mode='wb') as model_file:
    pickle.dump(pipeline, model_file)

# Show where the model was written
pickle_model_path


'ann_model.pkl'

In [12]:
import pandas as pd
import pickle

# Example new input data: a single row with the same feature columns that were
# selected for X when the pipeline was trained.
new_data = {
    'Day_of_Week': [2],  # Wednesday (the feature comes from pandas dt.dayofweek: Monday=0 ... Sunday=6)
    'Month': [1],  # January
    'Working_Hours': [8],  # 8 hours
    'Crop_Type': ['Wheat'],  # Crop type
    'Base_Hourly_Wage': [12.00],  # Base hourly wage
    'Supply_Demand_Ratio': [1.2],  # Supply-demand ratio
    'Dynamic_Pricing_Multiplier': [1.44]  # Dynamic pricing multiplier
}

# Convert the new input data into a DataFrame
new_input_df = pd.DataFrame(new_data)

# Load the trained pipeline from the file.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# were produced by a trusted source (here: this notebook's own export).
with open('ann_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Make predictions using the loaded pipeline
predicted_earnings = loaded_model.predict(new_input_df)

# Output the prediction (an array with one value per input row)
print(predicted_earnings)


[137.84203414]


In [None]:
Mean Squared Error (MSE) = 0.08925771735435657: MSE is the average of the squared errors, where each error is the difference between the model's prediction and the actual value for a row of the test set. A lower MSE indicates a better fit, but the value is expressed in squared units of the target, so it should be judged against the scale of `Total_Earnings`. Here the MSE is very low: its square root (the RMSE) is about 0.30, meaning the typical prediction error is about 0.30 in the units of `Total_Earnings`, so the model's predictions are very close to the actual values.

R-squared (R²) = 0.9996523898513691: R² is a statistical measure of the proportion of the variance in the dependent variable that is explained by the model, and so indicates the goodness of fit. The best possible value is 1, which indicates a perfect fit; a model that always predicts the mean of the target scores 0, and on held-out data the score can even be negative for a model that does worse than that. An R² value of 0.99965 means that the model explains almost all of the variability of the response data around its mean, which is an excellent result.

In summary, these results imply that the trained model performs exceptionally well on the test data, with predictions that are very close to the actual values and an almost perfect explanation of the variance in the data. This level of performance is rare in real-world scenarios and may indicate a well-suited model for the problem at hand or a particularly well-behaved dataset, for example one in which the target is close to a deterministic function of the input features. It is still worth considering the potential for overfitting, where the model is too closely tailored to the training data and generalizes poorly to new, unseen data. Because the metrics above were already computed on the held-out 20% test split, a further check would be cross-validation or evaluation on data obtained independently of this file.