In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('/content/Vegetable Price Prediction .csv')
df.head()

Unnamed: 0,Commodity,Date,Unit,Average Price
0,Tomato Big(Nepali),1/5/2021,Kg,55.0
1,Tomato Big(Indian),1/5/2021,Kg,55.0
2,Tomato Small(Local),1/5/2021,Kg,32.5
3,Tomato Small(Tunnel),1/5/2021,Kg,32.5
4,Tomato Small(Indian),1/5/2021,KG,42.5


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96480 entries, 0 to 96479
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Commodity      96480 non-null  object
 1   Date           96480 non-null  object
 2   Unit           96480 non-null  object
 3   Average Price  96480 non-null  object
dtypes: object(4)
memory usage: 2.9+ MB


In [None]:
df.isnull().sum()

Unnamed: 0,0
Commodity,0
Date,0
Unit,0
Average Price,0


In [None]:
df.drop('Unit', axis=1, inplace=True)

In [None]:
df.head()

Unnamed: 0,Commodity,Date,Average Price
0,Tomato Big(Nepali),1/5/2021,55.0
1,Tomato Big(Indian),1/5/2021,55.0
2,Tomato Small(Local),1/5/2021,32.5
3,Tomato Small(Tunnel),1/5/2021,32.5
4,Tomato Small(Indian),1/5/2021,42.5


In [None]:
df['Commodity'].unique()

array(['Tomato Big(Nepali)', 'Tomato Big(Indian)', 'Tomato Small(Local)',
       'Tomato Small(Tunnel)', 'Tomato Small(Indian)',
       'Tomato Small(Terai)', 'Potato Red', 'Potato Red(Indian)',
       'Potato White', 'Onion Dry (Indian)', 'Carrot(Local)',
       'Carrot(Terai)', 'Cabbage(Local)', 'Cabbage(Terai)', 'Cabbage',
       'Cauli Local', 'Cauli Local(Jyapu)', 'Cauli Terai', 'Raddish Red',
       'Raddish White(Local)', 'Raddish White(Hybrid)', 'Brinjal Long',
       'Brinjal Round', 'Green Peas', 'French Bean(Local)',
       'French Bean(Hybrid)', 'Sword Bean', 'Bitter Gourd',
       'Bottle Gourd', 'Pumpkin', 'Squash(Long)', 'Squash(Round)',
       'Turnip', 'Okara', 'Sweet Potato', 'Barela', 'Arum',
       'Christophine', 'Brd Leaf Mustard', 'Spinach Leaf', 'Cress Leaf',
       'Mustard Leaf', 'Fenugreek Leaf', 'Onion Green', 'Bakula', 'Yam',
       'Mushroom(Kanya)', 'Mushroom(Button)', 'Brocauli', 'Sugarbeet',
       'Red Cabbbage', 'Lettuce', 'Knolkhol', 'Celery', 'Parse

In [None]:
# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')

In [None]:
df.head()

Unnamed: 0,Commodity,Date,Unit,Average Price
0,Tomato Big(Nepali),2021-05-01,Kg,55.0
1,Tomato Big(Indian),2021-05-01,Kg,55.0
2,Tomato Small(Local),2021-05-01,Kg,32.5
3,Tomato Small(Tunnel),2021-05-01,Kg,32.5
4,Tomato Small(Indian),2021-05-01,KG,42.5


In [None]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))

cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))

discrete_features = [feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of Discrete Features :', len(discrete_features))

continuous_features = [feature for feature in num_features if feature not in discrete_features]
print('Num of Continuous Features :', len(continuous_features))

Num of Numerical Features : 2
Num of Categorical Features : 2
Num of Discrete Features : 0
Num of Continuous Features : 2


In [None]:
# Drop the 'Unit' column
df = df.drop(columns='Unit')

In [None]:
df['Average Price'] = pd.to_numeric(df['Average Price'], errors='coerce')

In [None]:
df.head()

Unnamed: 0,Commodity,Date,Average Price
0,Tomato Big(Nepali),2021-05-01,55.0
1,Tomato Big(Indian),2021-05-01,55.0
2,Tomato Small(Local),2021-05-01,32.5
3,Tomato Small(Tunnel),2021-05-01,32.5
4,Tomato Small(Indian),2021-05-01,42.5


In [None]:
# Step 1: Check for missing (NaN) values
print("Missing values in each column:\n", df.isnull().sum())

Missing values in each column:
 Commodity            0
Date             58680
Average Price    51156
dtype: int64


In [None]:
print("Total rows in data:", len(df))

Total rows in data: 96480


In [None]:
df_cleaned = df.dropna()

In [None]:
print(df_cleaned)

                  Commodity       Date  Average Price
0        Tomato Big(Nepali) 2021-05-01           55.0
1        Tomato Big(Indian) 2021-05-01           55.0
2       Tomato Small(Local) 2021-05-01           32.5
3      Tomato Small(Tunnel) 2021-05-01           32.5
4      Tomato Small(Indian) 2021-05-01           42.5
...                     ...        ...            ...
44730           Clive Green 2022-12-04          125.0
44731      Fish Fresh(Rahu) 2022-12-04          340.0
44732   Fish Fresh(Bachuwa) 2022-12-04          310.0
44733    Fish Fresh(Chhadi) 2022-12-04          310.0
44734   Fish Fresh(Mungari) 2022-12-04          310.0

[18087 rows x 3 columns]


In [None]:
df['Date'] = df['Date'].fillna('2021-05-01')  # or use any logic like today's date

In [None]:
# Option 1: Fill with column average
df['Average Price'] = df['Average Price'].fillna(df['Average Price'].mean())

# Option 2: Fill with 0 if you want to mark them as "to be updated"
# df['Average Price'] = df['Average Price'].fillna(0)


In [None]:
print(df.isnull().sum())  # should now show 0 missing values

Commodity        0
Date             0
Average Price    0
dtype: int64


In [None]:
print("Total rows in data:", len(df_cleaned))

Total rows in data: 18087


In [None]:
'''# Convert 'Average Price' to numeric, invalid strings become NaN
df['Average Price'] = pd.to_numeric(df['Average Price'], errors='coerce')

# Now check how many became NaN
missing_prices = df['Average Price'].isnull().sum()
print(f"❗ Invalid/non-numeric prices turned into NaN: {missing_prices}")

# Fill those NaN values with the mean
mean_price = df['Average Price'].mean()
df['Average Price'] = df['Average Price'].fillna(mean_price)'''


In [None]:
# Step 1: Convert non-numeric features to numeric using one-hot encoding
df_encoded = pd.get_dummies(df_cleaned, columns=['Commodity', 'Date'], drop_first=True)

In [None]:
# Step 2: Define features and target
from sklearn.model_selection import train_test_split
X = df_encoded.drop(['Average Price'], axis=1)
y = df_encoded['Average Price']

In [None]:
# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14469, 314), (3618, 314), (14469,), (3618,))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


In [None]:
pd.DataFrame(X)

Unnamed: 0,Commodity_Apple(Fuji),Commodity_Apple(Jholey),Commodity_Arum,Commodity_Asparagus,Commodity_Avocado,Commodity_Bakula,Commodity_Bamboo Shoot,Commodity_Banana,Commodity_Barela,Commodity_Bauhania flower,...,Date_2022-10-03 00:00:00,Date_2022-10-04 00:00:00,Date_2022-11-01 00:00:00,Date_2022-11-02 00:00:00,Date_2022-11-03 00:00:00,Date_2022-11-04 00:00:00,Date_2022-12-01 00:00:00,Date_2022-12-02 00:00:00,Date_2022-12-03 00:00:00,Date_2022-12-04 00:00:00
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44730,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
44731,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
44732,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
44733,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Function to evaluate model performance
def evaluated_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)  # Root Mean Squared Error
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

In [None]:
# Dictionary of models
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Adaboost Regressor": AdaBoostRegressor(),
    "Gradient Boost Regressor": GradientBoostingRegressor(),
    "XGBoost Regressor": XGBRegressor()
}

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Function to evaluate model performance
def evaluated_model(y_true, y_pred):
    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_pred)

    # Calculate Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Calculate R2 Score
    r2 = r2_score(y_true, y_pred)

    return mae, rmse, r2


In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Updated models dictionary with only regression models
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Adaboost Regressor": AdaBoostRegressor(),
    "Gradient Boost Regressor": GradientBoostingRegressor(),
    "XGBoost Regressor": XGBRegressor()
}

# Loop over models
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predict on training and test data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate the model performance
    model_train_mae, model_train_rmse, model_train_r2 = evaluated_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluated_model(y_test, y_test_pred)

    # Print model performance
    print(model_name)

    print('Model performance for Training set:')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------------')

    print('Model performance for Test set:')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))

    print('='*35)
    print('\n')


Linear Regression
Model performance for Training set:
- Root Mean Squared Error: 55.7400
- Mean Absolute Error: 31.0297
- R2 Score: 0.7976
----------------------------------------
Model performance for Test set:
- Root Mean Squared Error: 60.4742
- Mean Absolute Error: 32.8017
- R2 Score: 0.7720


Lasso
Model performance for Training set:
- Root Mean Squared Error: 89.0460
- Mean Absolute Error: 64.7708
- R2 Score: 0.4834
----------------------------------------
Model performance for Test set:
- Root Mean Squared Error: 92.4902
- Mean Absolute Error: 66.6373
- R2 Score: 0.4667


Ridge
Model performance for Training set:
- Root Mean Squared Error: 55.7926
- Mean Absolute Error: 31.0626
- R2 Score: 0.7972
----------------------------------------
Model performance for Test set:
- Root Mean Squared Error: 60.4551
- Mean Absolute Error: 32.7776
- R2 Score: 0.7722


K-Neighbors Regressor
Model performance for Training set:
- Root Mean Squared Error: 71.4788
- Mean Absolute Error: 40.2139
- R

In [None]:
rf_params = {"max_depth": [5, 8, 15, None, 10],
            "max_features": [5, 7, "auto", 8],
            "min_samples_split": [2, 8, 15, 20],
            "n_estimators": [100, 200, 500, 1000]}

xgboost_params={"colsample_bytree":[0.5, 0.8, 1, 0.3, 0.4],
                 "n_estimators":[100, 200, 300],
                 "max_depth":[5, 8, 12, 20, 30],
                "learning_rate":[0.1, 0.01]}

In [None]:
randomcv_models = [
    ("RF", RandomForestRegressor(), rf_params),
    ("XGBoost", XGBRegressor(), xgboost_params)
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}

for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                               param_distributions=params,
                               n_iter=100,
                               cv=3,
                               verbose=2,
                               n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"----------------- Best Params for {model_name} -----------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


63 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
51 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skle

Fitting 3 folds for each of 100 candidates, totalling 300 fits
----------------- Best Params for RF -----------------
{'n_estimators': 200, 'min_samples_split': 20, 'max_features': 8, 'max_depth': None}
----------------- Best Params for XGBoost -----------------
{'n_estimators': 300, 'max_depth': 12, 'learning_rate': 0.1, 'colsample_bytree': 0.3}
