In [1]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

# Load the crop recommendation dataset from a CSV file in the working directory.
df = pd.read_csv("crop_recommendation.csv")

# Preview the first rows to check the columns and the label values.
df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


## Preprocessing 

In [2]:
# Feature matrix: every column of `df` except the last one (the target).
feature_columns = df.columns[:-1]
X = df[feature_columns]
X

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
0,90,42,43,20.879744,82.002744,6.502985,202.935536
1,85,58,41,21.770462,80.319644,7.038096,226.655537
2,60,55,44,23.004459,82.320763,7.840207,263.964248
3,74,35,40,26.491096,80.158363,6.980401,242.864034
4,78,42,42,20.130175,81.604873,7.628473,262.717340
...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507
2196,99,15,27,27.417112,56.636362,6.086922,127.924610
2197,118,33,30,24.131797,67.225123,6.362608,173.322839
2198,117,32,34,26.272418,52.127394,6.758793,127.175293


In [3]:
# Target vector: the "label" column of `df` (crop names such as "rice" in the
# raw data).
y = df["label"]
y

0         rice
1         rice
2         rice
3         rice
4         rice
         ...  
2195    coffee
2196    coffee
2197    coffee
2198    coffee
2199    coffee
Name: label, Length: 2200, dtype: object

In [4]:
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the crop-name labels into integer class ids (and back).
le = LabelEncoder()

In [5]:
# Fit the encoder on the raw labels and encode them in one step.
# NOTE(review): `Y` (encoded) and `y` (raw) differ only by case — easy to mix up.
Y = le.fit_transform(y)

In [6]:
# Replace the string labels in `df` with their integer codes.
# NOTE(review): this overwrites the column in place, so re-running the cell
# that builds `y` from df["label"] afterwards yields integers, not crop names.
df["label"] = Y

In [7]:
# Display the frame to confirm that "label" now holds the integer codes.
df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,20
1,85,58,41,21.770462,80.319644,7.038096,226.655537,20
2,60,55,44,23.004459,82.320763,7.840207,263.964248,20
3,74,35,40,26.491096,80.158363,6.980401,242.864034,20
4,78,42,42,20.130175,81.604873,7.628473,262.717340,20
...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,5
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,5
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,5
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,5


## Analysing the data

In [9]:
# Build a ydata-profiling report object for the frame.
# NOTE(review): "label" holds integer codes by this point, so it is presumably
# profiled as a numeric column rather than a categorical one — confirm.
profile= ProfileReport(df)

In [10]:
# Write the profiling report to reports.html in the working directory.
profile.to_file("reports.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=145,
)

## Training multiple models and comparing their accuracy

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Seed shared by every estimator that draws random numbers, so the accuracy
# comparison below is repeatable after Restart Kernel -> Run All.  The value
# reuses the random_state already passed to train_test_split.
MODEL_SEED = 145

# One instance of every candidate classifier, keyed by the name printed below.
# SVC (without probability estimates) and k-NN are left unseeded: the former
# ignores random_state in that mode and the latter has no such parameter.
models = {
    'Logistic Regression': LogisticRegression(solver="liblinear", random_state=MODEL_SEED),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=MODEL_SEED),
    'Random Forest': RandomForestClassifier(random_state=MODEL_SEED),
    'AdaBoost': AdaBoostClassifier(random_state=MODEL_SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=MODEL_SEED),
}

from sklearn.metrics import accuracy_score

# Fit each candidate on the training split and report its accuracy on the
# held-out test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f'{name}:\nAccuracy: {acc:.4f}')
    
    


Logistic Regression:
Accuracy: 0.9545
Support Vector Machine:
Accuracy: 0.9727
K-Nearest Neighbors:
Accuracy: 0.9764
Decision Tree:
Accuracy: 0.9836
Random Forest:
Accuracy: 0.9982
AdaBoost:
Accuracy: 0.2000
Gradient Boosting:
Accuracy: 0.9855


 After training the different models, we can clearly see that the random forest achieves the highest test accuracy, making it the best model for this problem.


## Training the model using a random forest classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Final model.  random_state pins the forest's random choices so the reported
# accuracy is repeatable; left unseeded, each run trains a slightly different
# forest and the score drifts between runs.
# The features are passed as a plain NumPy array (no column names) —
# presumably so the model can later be called with bare lists; confirm.
rdf = RandomForestClassifier(random_state=145)
rdf.fit(X_train.to_numpy(),y_train)


In [15]:
from sklearn.metrics import accuracy_score

# Score the freshly trained forest on the held-out test split.
y_pred = rdf.predict(X_test.to_numpy())
rf_accuracy = accuracy_score(y_test, y_pred)
print(rf_accuracy)

0.9963636363636363


## Saving the model to a file for later use

In [16]:
import pickle

# Serialize the trained forest to model.pkl in the working directory.
with open("model.pkl", "wb") as model_file:
    pickle.dump(rdf, model_file)

## Loading the model

In [17]:
# Read the forest back from model.pkl.  Unpickling can execute arbitrary code,
# so only load pickle files you produced yourself.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

model

In [20]:
# Sanity check: the reloaded model should score the same as the original on
# the held-out test split.
y_pred = model.predict(X_test.to_numpy())
reloaded_accuracy = accuracy_score(y_test, y_pred)
print(reloaded_accuracy)

0.9963636363636363


In [21]:
# One hand-written sample in the training column order
# (N, P, K, temperature, humidity, ph, rainfall); the values are those of the
# first row of the dataset, whose label is "rice".
sample = [[90, 42, 43, 20.879744, 82.002744, 6.502985, 202.935536]]
y_pred = model.predict(sample)

# Map the predicted integer code back to its crop name.
decoded = le.inverse_transform(y_pred)
pred_label = decoded[0]
pred_label

'rice'

In [22]:
# Persist the encoder's class names so predictions can be decoded later
# without refitting the LabelEncoder.
np.save('classes.npy', le.classes_)

In [23]:
# Rebuild a LabelEncoder from the saved class names instead of refitting it.
# allow_pickle=True is presumably required because the names were saved as
# Python objects; only load .npy files you wrote yourself with this flag.
saved_classes = np.load('classes.npy', allow_pickle=True)
encoder = LabelEncoder()
encoder.classes_ = saved_classes

In [24]:
# Decode the earlier prediction with the rebuilt encoder; it should give the
# same crop name as the original encoder did.
decoded = encoder.inverse_transform(y_pred)
pred_label = decoded[0]
pred_label

'rice'