In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_csv("Training_test_ratnagiri.csv")

# Define the non-quantitative columns
non_quantitative_columns = ["Date", "Phase"]

# Select quantitative columns (features) excluding non-quantitative ones
quantitative_columns = df.columns.difference(non_quantitative_columns + ["Irrigation Req"])

# Label encode the 'Phase' column since it is non-quantitative
le = LabelEncoder()
df['Phase'] = le.fit_transform(df['Phase'])

# Optional: Convert 'Date' column to datetime type and extract useful features like year, month, day, etc.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Add these new date features to the list of features
quantitative_columns = quantitative_columns.tolist() + ['Year', 'Month', 'Day']

# Define the features (X) and target (y)
X = df[quantitative_columns]
y = df['Irrigation Req']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Optionally, display feature importances
feature_importances = pd.Series(model.feature_importances_, index=quantitative_columns)
print(feature_importances.sort_values(ascending=False))


Mean Squared Error: 2807029.0765551985
Kc                        0.264443
Etc                       0.146185
Volume(Etc)               0.119585
IWR(Soil Moisture)        0.105826
soil moisture(%)          0.069403
Soil Moisture Diff        0.068284
Volume(Base)              0.041426
Root Zone Depth           0.032739
Rainfall                  0.028065
RZD( in m)                0.024004
Required Soil Moisture    0.023521
 Eto                      0.018209
Days                      0.017974
Month                     0.016322
Volume (Rainfall)         0.011760
Humidity(%)               0.010794
Day                       0.001460
Year                      0.000000
dtype: float64


In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_csv("Training_test_ratnagiri2.csv")

# # Display column names to verify
# print(df.columns)

# Define the non-quantitative columns
non_quantitative_columns = ["Date", "Phase"]

# Specify the columns you want to include as features (match these to the actual column names in your DataFrame)
selected_feature_columns = [
    "Days", "soil moisture(%)", "Required Soil Moisture", "Soil Moisture Diff ", 
    "Kc", " Eto", "Etc", "Rainfall", "Humidity(%)", "Root Zone Depth", "RZD( in m)",
    "Volume(Base)", "Volume (Rainfall)", "Volume(Etc)", "IWR(Soil Moisture)"
]

# Label encode the 'Phase' column since it is non-quantitative
le = LabelEncoder()
df['Phase'] = le.fit_transform(df['Phase'])

# Convert 'Date' column to datetime type with dayfirst=True
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Optional: Extract useful features like year, month, day, etc. (if you want to use them)
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# If you decide to include any date-based features, add them to selected_feature_columns
# selected_feature_columns += ['Year', 'Month', 'Day']

# Check that all selected feature columns exist in the DataFrame
missing_columns = [col for col in selected_feature_columns if col not in df.columns]
if missing_columns:
    print(f"Warning: The following columns are missing from the dataset: {missing_columns}")
else:
    # Define the features (X) and target (y)
    X = df[selected_feature_columns]
    y = df['Irrigation Req']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize and train the model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")

    # Optionally, display feature importances
    feature_importances = pd.Series(model.feature_importances_, index=selected_feature_columns)
    print(feature_importances.sort_values(ascending=False))


Mean Squared Error: 1964.9669264046092
soil moisture(%)          0.181167
IWR(Soil Moisture)        0.180872
Kc                        0.132857
Volume (Rainfall)         0.090532
Rainfall                  0.082758
Soil Moisture Diff        0.074870
RZD( in m)                0.054536
Root Zone Depth           0.044909
Required Soil Moisture    0.042724
Volume(Base)              0.042601
Volume(Etc)               0.028919
Etc                       0.019810
 Eto                      0.014556
Humidity(%)               0.008089
Days                      0.000799
dtype: float64


In [89]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load data from CSV file
data = pd.read_csv('Training_test_ratnagiri2.csv')

# Ensure all features are numeric or encoded
# Identify categorical columns
categorical_cols = ['Phase']

# Convert categorical columns to string (if they are not already)
for col in categorical_cols:
    data[col] = data[col].astype(str)

# Separate features and target variable
X = data.drop(columns=['Irrigation Req'])
y = data['Irrigation Req']

# Create a column transformer with OneHotEncoder for categorical columns and StandardScaler for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols),
        ('scaler', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns)  # Standardize numerical features
    ]
)

# Create a pipeline with the preprocessor and the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Print coefficients and intercept
print("Coefficients:", pipeline.named_steps['regressor'].coef_)
print("Intercept:", pipeline.named_steps['regressor'].intercept_)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [91]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load data from CSV file
data = pd.read_csv('Training_test_ratnagiri2.csv')

# Ensure all features are numeric or encoded
# Identify categorical columns
categorical_cols = ['Phase']

# Convert categorical columns to string (if they are not already)
for col in categorical_cols:
    data[col] = data[col].astype(str)

# Separate features and target variable
X = data.drop(columns=['Irrigation Req'])
y = data['Irrigation Req']

# Create a column transformer with OneHotEncoder for categorical columns, 
# SimpleImputer to handle missing values, and StandardScaler for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values for categorical columns
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))  # Encode categorical columns
        ]), categorical_cols),
        ('scaler', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values for numerical columns
            ('scaler', StandardScaler())  # Scale numerical features
        ]), X.select_dtypes(include=['float64', 'int64']).columns)
    ]
)

# Create a pipeline with the preprocessor and the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Print coefficients and intercept
print("Coefficients:", pipeline.named_steps['regressor'].coef_)
print("Intercept:", pipeline.named_steps['regressor'].intercept_)


Mean Squared Error: 20753.01675985483
Coefficients: [-7.41936336e+00  5.81347609e+03  1.92166624e+03 -2.47426933e+04
 -1.02714847e+03  2.24100314e+04 -6.81878340e+02  1.51970980e+02
  9.65839033e+01 -1.76837378e+10  1.21746475e+03 -7.51618006e+00
 -1.21583651e+03  3.85512633e+03 -2.00459373e+03 -1.34586071e+03
  1.76837376e+10  1.38092230e+02]
Intercept: -3853.5628550701076
