<a href="https://www.kaggle.com/code/alawodesharon/predictive-modeling-for-agriculture?scriptVersionId=210217049" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [35]:
# importing required libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Load the dataset
crops = pd.read_csv("/kaggle/input/agricuture-dataset/workspace/soil_measures.csv")

In [37]:
# read the data into pandas dataframe
crops.head()

Unnamed: 0,N,P,K,ph,crop
0,90,42,43,6.502985,rice
1,85,58,41,7.038096,rice
2,60,55,44,7.840207,rice
3,74,35,40,6.980401,rice
4,78,42,42,7.628473,rice


In [38]:
crops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   N       2200 non-null   int64  
 1   P       2200 non-null   int64  
 2   K       2200 non-null   int64  
 3   ph      2200 non-null   float64
 4   crop    2200 non-null   object 
dtypes: float64(1), int64(3), object(1)
memory usage: 86.1+ KB


In [39]:
crops.isnull().sum()

N       0
P       0
K       0
ph      0
crop    0
dtype: int64

In [40]:
crops.dtypes

N         int64
P         int64
K         int64
ph      float64
crop     object
dtype: object

In [41]:
# split the data 
X = crops.drop(columns=['crop'])
y = crops['crop'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [42]:
# Scale the training and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [47]:
# evaluate feature performance 
best_score = 0
best_feature = None
for feature in X.columns: 
    X_train_single = X_train_scaled[:, [X.columns.get_loc(feature)]]
    X_test_single = X_test_scaled[:, [X.columns.get_loc(feature)]] 
    model = LogisticRegression(max_iter=5000)
    model.fit(X_train_single, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_single)
    
    # Evaluate the model
    score = accuracy_score(y_test, y_pred)
    
    # Check if this feature has the best score so far
    if score > best_score:
        best_score = score
        best_feature = feature
# best preditive feature-variable 
best_predictive_feature = {best_feature: best_score}
print(best_predictive_feature)

{'K': 0.2409090909090909}


In [48]:
print(f"Accuracy with all features: {score}")

Accuracy with all features: 0.11212121212121212


In [49]:
print(y.value_counts(normalize=True))

crop
rice           0.045455
maize          0.045455
jute           0.045455
cotton         0.045455
coconut        0.045455
papaya         0.045455
orange         0.045455
apple          0.045455
muskmelon      0.045455
watermelon     0.045455
grapes         0.045455
mango          0.045455
banana         0.045455
pomegranate    0.045455
lentil         0.045455
blackgram      0.045455
mungbean       0.045455
mothbeans      0.045455
pigeonpeas     0.045455
kidneybeans    0.045455
chickpea       0.045455
coffee         0.045455
Name: proportion, dtype: float64


In [50]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
print(feature_importances.sort_values(ascending=False))

K     0.323993
P     0.253175
N     0.217651
ph    0.205181
dtype: float64
