In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [30]:
full_data = pd.read_csv("ourdata.csv")

# Feature engineering
# Convert 'Date' to datetime and extract useful features
full_data['Date'] = pd.to_datetime(full_data['Date'])
full_data['Day_of_Week'] = full_data['Date'].dt.dayofweek
full_data['Month'] = full_data['Date'].dt.month
full_data['Working_Hours'] = (pd.to_datetime(full_data['Time_End']) - pd.to_datetime(full_data['Time_Start'])).dt.seconds / 3600

# Select features and target
X = full_data[['Day_of_Week', 'Month', 'Working_Hours', 'Crop_Type', 'Base_Hourly_Wage', 'Supply_Demand_Ratio', 'Dynamic_Pricing_Multiplier']]
y = full_data['Total_Earnings']



  full_data['Working_Hours'] = (pd.to_datetime(full_data['Time_End']) - pd.to_datetime(full_data['Time_Start'])).dt.seconds / 3600
  full_data['Working_Hours'] = (pd.to_datetime(full_data['Time_End']) - pd.to_datetime(full_data['Time_Start'])).dt.seconds / 3600


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
numerical_features = ['Day_of_Week', 'Month', 'Working_Hours', 'Base_Hourly_Wage', 'Supply_Demand_Ratio', 'Dynamic_Pricing_Multiplier']
categorical_features = ['Crop_Type']

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [15]:
ann_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=500, random_state=42)


In [16]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', ann_model)])

In [17]:
pipeline.fit(X_train, y_train)



In [18]:
# Predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2

(0.08925771735435657, 0.9996523898513691)

In [19]:
import pickle

# Export the trained model to a file using pickle
pickle_model_path = "ann_model.pkl"
with open(pickle_model_path, 'wb') as file:
    pickle.dump(pipeline, file)

pickle_model_path


'ann_model.pkl'

In [20]:
import pandas as pd
import pickle

# Example new input data
new_data = {
    'Day_of_Week': [2],  # Tuesday
    'Month': [1],  # January
    'Working_Hours': [8],  # 8 hours
    'Crop_Type': ['Wheat'],  # Crop type
    'Base_Hourly_Wage': [12.00],  # Base hourly wage
    'Supply_Demand_Ratio': [1.2],  # Supply-demand ratio
    'Dynamic_Pricing_Multiplier': [1.44]  # Dynamic pricing multiplier
}

# Convert the new input data into a DataFrame
new_input_df = pd.DataFrame(new_data)

# Load the trained model from the file
with open('ann_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Make predictions using the loaded model
predicted_earnings = loaded_model.predict(new_input_df)

# Output the prediction
print(predicted_earnings)


[137.84203414]


In [21]:
# Mean Squared Error (MSE) = 0.08925771735435657: This value represents the average of the squares of the errors or deviations. The error is the amount by which the values predicted by the model differ from the actual values within the dataset. A lower MSE value indicates a better fit of the model to the data. In your case, the MSE is very low, suggesting that the model's predictions are very close to the actual values.

# R-squared (R²) = 0.9996523898513691: R² is a statistical measure that represents the proportion of the variance for the dependent variable that's explained by the independent variables in the model. It provides an indication of the goodness of fit of the model. R² values range from 0 to 1, where 1 indicates a perfect fit. An R² value of 0.99965 suggests that the model explains almost all of the variability of the response data around its mean, which is an excellent result.

# In summary, these results imply that the model you've trained performs exceptionally well on the test data, with predictions that are very close to the actual values and an almost perfect explanation of the variance in the data. This level of performance is rare in real-world scenarios and may indicate a well-suited model for the problem at hand or a particularly well-behaved dataset. However, it's also worth considering the potential for overfitting, where the model may be too closely tailored to the training data, potentially impacting its generalization to new, unseen data.

SyntaxError: unterminated string literal (detected at line 1) (2171861709.py, line 1)

APPLYING BOOSTING


In [None]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/24/ec/ad387100fa3cc2b9b81af0829b5ecfe75ec5bb19dd7c19d4fea06fb81802/xgboost-2.0.3-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB 330.3 kB/s eta 0:05:02
   ---------------------------------------- 0.1/99.8 MB 469.7 kB/s eta 0:03:33
   ---------------------------------------- 0.1/99.8 MB 950.9 kB/s eta 0:01:45
    --------------------------------------- 1.4/99.8 MB 7.4 MB/s eta 0:00:14
   -- ------------------------------------- 5.6/99.8 MB 25.4 MB/s eta 0:00:04
   ---- ----------------------------------- 10.3/99.8 MB 81.8 MB/s eta 0:00:02
   ----- ----------------------------


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
import xgboost as xgb

In [23]:
# Define the XGBoost regressor model

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)

# Create a pipeline that first preprocesses the data, then trains the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', xgb_regressor)])

# Train the model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 8.891071193435837e-10
R^2 Score: 0.9999999999965374


In [None]:
# our model is overfitting so i am goint to apply early stopping training will stop if the validation metric does not improve for 10 consecutive rounds.

In [34]:
full_data = pd.read_csv("ourdata.csv")

# Feature engineering
# Convert 'Date' to datetime and extract useful features
full_data['Date'] = pd.to_datetime(full_data['Date'])
full_data['Day_of_Week'] = full_data['Date'].dt.dayofweek
full_data['Month'] = full_data['Date'].dt.month
full_data['Working_Hours'] = (pd.to_datetime(full_data['Time_End']) - pd.to_datetime(full_data['Time_Start'])).dt.seconds / 3600

# Select features and target
X = full_data[['Day_of_Week', 'Month', 'Working_Hours', 'Base_Hourly_Wage', 'Supply_Demand_Ratio', 'Dynamic_Pricing_Multiplier']]
y = full_data['Total_Earnings']


  full_data['Working_Hours'] = (pd.to_datetime(full_data['Time_End']) - pd.to_datetime(full_data['Time_Start'])).dt.seconds / 3600
  full_data['Working_Hours'] = (pd.to_datetime(full_data['Time_End']) - pd.to_datetime(full_data['Time_Start'])).dt.seconds / 3600


In [35]:
from sklearn.metrics import mean_squared_error
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert the datasets into DMatrix, which is a high-performance XGBoost data structure
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameters
params = {
    'objective': 'reg:squarederror',
    'max_depth': 4,
    'alpha': 10,
    'learning_rate': 0.01,
}

# Train the model with early stopping
eval_set = [(dtrain, 'train'), (dval, 'eval')]
model = xgb.train(params, dtrain, num_boost_round=1000, evals=eval_set, early_stopping_rounds=10, verbose_eval=True)

# Predictions
y_pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

[0]	train-rmse:15.89504	eval-rmse:15.56817
[1]	train-rmse:15.74558	eval-rmse:15.42139
[2]	train-rmse:15.59770	eval-rmse:15.27666
[3]	train-rmse:15.45126	eval-rmse:15.13307
[4]	train-rmse:15.30640	eval-rmse:14.99183
[5]	train-rmse:15.16291	eval-rmse:14.85114
[6]	train-rmse:15.02095	eval-rmse:14.71224
[7]	train-rmse:14.88031	eval-rmse:14.57468
[8]	train-rmse:14.74114	eval-rmse:14.43801


[9]	train-rmse:14.60331	eval-rmse:14.30358
[10]	train-rmse:14.46690	eval-rmse:14.16963
[11]	train-rmse:14.33180	eval-rmse:14.03788
[12]	train-rmse:14.19713	eval-rmse:13.90519
[13]	train-rmse:14.06457	eval-rmse:13.77593
[14]	train-rmse:13.93246	eval-rmse:13.64576
[15]	train-rmse:13.80171	eval-rmse:13.51768
[16]	train-rmse:13.67284	eval-rmse:13.39182
[17]	train-rmse:13.54456	eval-rmse:13.26600
[18]	train-rmse:13.41815	eval-rmse:13.14224
[19]	train-rmse:13.29231	eval-rmse:13.01880
[20]	train-rmse:13.16831	eval-rmse:12.89741
[21]	train-rmse:13.04486	eval-rmse:12.77632
[22]	train-rmse:12.92269	eval-rmse:12.65595
[23]	train-rmse:12.80212	eval-rmse:12.53791
[24]	train-rmse:12.68227	eval-rmse:12.42052
[25]	train-rmse:12.56402	eval-rmse:12.30476
[26]	train-rmse:12.44642	eval-rmse:12.18940
[27]	train-rmse:12.33005	eval-rmse:12.07527
[28]	train-rmse:12.21504	eval-rmse:11.96269
[29]	train-rmse:12.10088	eval-rmse:11.85072
[30]	train-rmse:11.98806	eval-rmse:11.74029
[31]	train-rmse:11.87607	eval-rms

AttributeError: 'Booster' object has no attribute 'best_ntree_limit'