In [None]:
!pip install numpy pandas scikit-learn matplotlib seaborn




In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset folder can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Copy the dataset folder from Drive into /content/.
# NOTE(review): the later cells still read the CSV from the Drive path, so this
# local copy appears to be unused — confirm before relying on it.
!cp -r "/content/drive/My Drive/Collab Dataset/farmer" "/content/"


In [None]:
# List the files available in the dataset folder on Drive.
!ls "/content/drive/My Drive/Collab Dataset/farmer"


Crop_Recommendation.csv


In [None]:
!cat "/content/drive/My Drive/Collab Dataset/farmer/Crop_Recommendation.csv" | head -5


Nitrogen,Phosphorus,Potassium,Temperature,Humidity,pH_Value,Rainfall,Crop
90,42,43,20.87974371,82.00274423,6.502985292,202.9355362,Rice
85,58,41,21.77046169,80.31964408,7.038096361,226.6555374,Rice
60,55,44,23.00445915,82.3207629,7.840207144,263.9642476,Rice
74,35,40,26.49109635,80.15836264,6.980400905,242.8640342,Rice


In [None]:
# Load the crop dataset from Drive into a pandas DataFrame named `data`;
# every later cell works from this frame.
data = pd.read_csv("/content/drive/My Drive/Collab Dataset/farmer/Crop_Recommendation.csv")
print(type(data))  # Sanity check: should show <class 'pandas.core.frame.DataFrame'>


<class 'pandas.core.frame.DataFrame'>


In [None]:
# Display the first five rows to confirm the columns parsed as expected.
data.head()

Unnamed: 0,Nitrogen,Phosphorus,Potassium,Temperature,Humidity,pH_Value,Rainfall,Crop
0,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,Rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,Rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,Rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,Rice


In [None]:
# Count missing values per column (all zeros means no imputation is needed).
print(data.isnull().sum())


Nitrogen       0
Phosphorus     0
Potassium      0
Temperature    0
Humidity       0
pH_Value       0
Rainfall       0
Crop           0
dtype: int64


In [None]:
# Remove rows that are exact duplicates across every column.
data = data.drop_duplicates()


In [None]:
# Summarise row count, column dtypes and non-null counts after de-duplication.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Nitrogen     2200 non-null   int64  
 1   Phosphorus   2200 non-null   int64  
 2   Potassium    2200 non-null   int64  
 3   Temperature  2200 non-null   float64
 4   Humidity     2200 non-null   float64
 5   pH_Value     2200 non-null   float64
 6   Rainfall     2200 non-null   float64
 7   Crop         2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the crop names into integer class ids for the classifier.
# `encoder` is kept so predictions can be mapped back to names later.
encoder = LabelEncoder()
data = data.assign(label_encoded=encoder.fit_transform(data['Crop']))
data.head()


Unnamed: 0,Nitrogen,Phosphorus,Potassium,Temperature,Humidity,pH_Value,Rainfall,Crop,label_encoded
0,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice,20
1,85,58,41,21.770462,80.319644,7.038096,226.655537,Rice,20
2,60,55,44,23.004459,82.320763,7.840207,263.964248,Rice,20
3,74,35,40,26.491096,80.158363,6.980401,242.864034,Rice,20
4,78,42,42,20.130175,81.604873,7.628473,262.71734,Rice,20


In [None]:
# Separate the seven numeric input columns from the encoded class label.
# NOTE(review): no later visible cell reads `features` or `target` (training
# rebuilds them as X / y) — confirm whether this cell is still needed.
features = data[['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity', 'pH_Value', 'Rainfall']]
target = data['label_encoded']


In [None]:
# Debugging check that `data` is still a DataFrame at this point.
type(data)



In [None]:
# De-duplicate on the eight original columns (label_encoded is not part of the key).
# NOTE(review): an unconditional drop_duplicates already ran in an earlier cell,
# so this looks like a no-op — confirm.
data = data.drop_duplicates(subset=['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity', 'pH_Value', 'Rainfall', 'Crop'])


In [None]:
# Show how many samples each crop has, to check class balance.
print(data['Crop'].value_counts())


Crop
Rice           100
Maize          100
ChickPea       100
KidneyBeans    100
PigeonPeas     100
MothBeans      100
MungBean       100
Blackgram      100
Lentil         100
Pomegranate    100
Banana         100
Mango          100
Grapes         100
Watermelon     100
Muskmelon      100
Apple          100
Orange         100
Papaya         100
Coconut        100
Cotton         100
Jute           100
Coffee         100
Name: count, dtype: int64


In [None]:
# Build one representative row per crop for the similarity analysis.
# Each crop's profile is the mean of all of its samples. Keeping only the first
# row per crop (drop_duplicates on 'Crop') made the profile depend on a single
# arbitrary observation.
# sort=False keeps crops in first-appearance order; as_index=False yields a
# fresh 0..n-1 index with 'Crop' as a regular column.
# Only 'Crop' and the seven feature columns are carried over, which is all the
# later visible cells read from `unique_data`.
unique_data = (
    data.groupby('Crop', sort=False, as_index=False)[
        ['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity', 'pH_Value', 'Rainfall']
    ]
    .mean()
)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Standardise the per-crop feature table so that every feature contributes on
# a comparable scale, then compare crops by the cosine of their feature vectors.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    unique_data.loc[
        :,
        ['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity', 'pH_Value', 'Rainfall'],
    ]
)

# Crop-by-crop similarity table, labelled with crop names on both axes.
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=unique_data['Crop'],
    columns=unique_data['Crop'],
)


In [None]:
from sklearn.metrics import pairwise_distances

# Distance-based similarity: 1 / (1 + Euclidean distance) maps distance 0 to
# similarity 1 and larger distances towards 0. This replaces the cosine-based
# `similarity_df` built in the previous cell.
similarity_matrix = 1 / (1 + pairwise_distances(scaled_features, metric='euclidean'))

# `scaled_features` has one row per entry of `unique_data`, so the matrix must
# be labelled with `unique_data['Crop']`. Labelling it with `data['Crop']`
# (one label per raw sample) does not match the matrix shape and raises.
similarity_df = pd.DataFrame(similarity_matrix, index=unique_data['Crop'], columns=unique_data['Crop'])


In [None]:
import numpy as np

def recommend_crop(crop_name, num_recommendations=3, similarity=None):
    """Recommend the crops most similar to ``crop_name``.

    Parameters
    ----------
    crop_name : str
        Crop to find alternatives for; must be a row label of the similarity table.
    num_recommendations : int, default 3
        Maximum number of crop names to return.
    similarity : pandas.DataFrame, optional
        Square crop-by-crop similarity table. Defaults to the notebook-level
        ``similarity_df`` when omitted.

    Returns
    -------
    list of str
        Up to ``num_recommendations`` crop names, most similar first. The
        queried crop itself is never included. An empty list is returned (and a
        message printed) when ``crop_name`` is unknown.
    """
    table = similarity_df if similarity is None else similarity

    if crop_name not in table.index:
        print(f"{crop_name} not found in dataset.")
        return []

    # Exclude the crop itself: it always has the maximum self-similarity and
    # would otherwise occupy the first recommendation slot.
    scores = table.loc[crop_name].drop(crop_name, errors='ignore')

    # Small random jitter breaks ties between equally similar crops. It is added
    # out of place so the shared similarity table is never modified.
    noise = np.random.uniform(0, 0.01, size=len(scores))
    scores = scores + noise

    ranked = scores.sort_values(ascending=False)  # Sort after adding noise
    return ranked.index[:num_recommendations].tolist()


In [None]:
# Example: feature-similarity recommendations for Rice.
recommendations = recommend_crop('Rice')
print("Recommended Crops:", recommendations)


Recommended Crops: ['Rice', 'Jute', 'Coffee']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF matrix based on the 'Crop' column (one document per crop name).
# NOTE(review): every crop name is a single distinct word, so each document
# presumably has exactly one term that no other document shares. The resulting
# name-to-name similarity would then be the identity matrix, leaving the
# content-based ranking to the random jitter alone — confirm, and consider
# using descriptive text per crop instead of the bare name.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(unique_data['Crop'])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF rows (the matrix compared with
# itself), wrapped in a crop-labelled table.
content_sim = cosine_similarity(tfidf_matrix)
content_sim_df = pd.DataFrame(
    data=content_sim,
    index=unique_data['Crop'],
    columns=unique_data['Crop'],
)


In [None]:
import numpy as np

def recommend_crop_content_based(crop_name, num_recommendations=3):
    """Recommend crops by TF-IDF (name-based) similarity to ``crop_name``.

    Reads the notebook-level ``content_sim_df``. Returns up to
    ``num_recommendations`` crop names, best first, excluding ``crop_name``
    itself. Prints a message and returns ``[]`` when the crop is unknown.
    """
    if crop_name in content_sim_df.index:
        # Rank the row first, then remove the query crop from the candidates.
        ranked_row = content_sim_df.loc[crop_name].sort_values(ascending=False)
        candidates = ranked_row.drop(crop_name)

        # Random jitter so tied scores do not always yield the same picks.
        jitter = np.random.uniform(0, 0.01, size=len(candidates))
        candidates = candidates + jitter

        best_first = candidates.sort_values(ascending=False)
        return best_first.index[:num_recommendations].tolist()

    print(f"{crop_name} not found in dataset.")
    return []


In [None]:
# Example: content-based (crop-name TF-IDF) recommendations for Rice.
recommendations = recommend_crop_content_based('Rice')
print("Recommended Crops (Content-Based):", recommendations)


Recommended Crops (Content-Based): ['PigeonPeas', 'Orange', 'Blackgram']


In [None]:
# Hybrid score: 60% feature-based similarity plus 40% content-based similarity.
# The two tables are aligned on their crop labels before being added.
combined_similarity = similarity_df.mul(0.6).add(content_sim_df.mul(0.4))


In [None]:
def recommend_crop_hybrid(crop_name, num_recommendations=3):
    """Recommend crops using the weighted hybrid similarity table.

    Looks ``crop_name`` up in the notebook-level ``combined_similarity`` and
    returns up to ``num_recommendations`` other crop names, highest score
    first. Prints a message and returns ``[]`` when the crop is unknown.
    """
    known_crops = combined_similarity.index
    if crop_name not in known_crops:
        print(f"{crop_name} not found in dataset.")
        return []

    # Scores of every other crop relative to the query, initially ranked.
    row = combined_similarity.loc[crop_name]
    others = row.sort_values(ascending=False).drop(crop_name)

    # Add small noise for variety, then re-rank.
    others = others + np.random.uniform(0, 0.01, size=len(others))
    reranked = others.sort_values(ascending=False)

    top_names = reranked.index[:num_recommendations]
    return top_names.tolist()


In [None]:
# Example: hybrid (feature + content) recommendations for Rice.
recommendations = recommend_crop_hybrid('Rice')
print("Recommended Crops (Hybrid):", recommendations)


Recommended Crops (Hybrid): ['Jute', 'Coffee', 'Coconut']


In [None]:
from sklearn.model_selection import train_test_split

# Model inputs are the seven raw (unscaled) soil/climate measurements;
# the target is the integer-encoded crop label.
X = data.loc[
    :,
    ['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity', 'pH_Value', 'Rainfall'],
]
y = data.loc[:, 'label_encoded']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a random forest with default hyper-parameters, seeded for
# reproducibility and fitted on the training split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)


In [None]:
# Predict encoded crop labels for the held-out split with the baseline model.
y_pred = rf.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Score the held-out predictions in `y_pred`. Precision, recall and F1 are
# averaged across classes weighted by each class's support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Per-class breakdowns: confusion counts and the text classification report.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Headline metrics first, then the detailed tables.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 0.9931818181818182
Precision: 0.9937348484848485
Recall: 0.9931818181818182
F1-score: 0.9931754816901672

Confusion Matrix:
 [[23  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 20  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 26  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 27  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 23  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 20  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 21  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the random forest (3 * 3 * 2 * 2 = 36 combinations).
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search with 3-fold cross-validation on the training split,
# selecting by weighted F1 and using all available cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [None]:
# Build a forest with the best hyper-parameters found by the grid search and
# fit it on the full training split.
# NOTE(review): GridSearchCV refits the best configuration by default, so
# `grid_search.best_estimator_` is presumably an equivalent model — confirm
# before removing this retraining step.
best_rf = RandomForestClassifier(**grid_search.best_params_, random_state=42)
best_rf.fit(X_train, y_train)


In [None]:
# Re-evaluate on the held-out split with the tuned model; this overwrites the
# baseline predictions stored in `y_pred`.
y_pred = best_rf.predict(X_test)

# Support-weighted averages across all crop classes.
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1-Score:", f1_score(y_test, y_pred, average='weighted'))


Precision: 0.9937348484848485
Recall: 0.9931818181818182
F1-Score: 0.9931754816901672


In [None]:
import joblib

# Persist the tuned forest to the current working directory.
# NOTE(review): only the classifier is saved; the fitted `encoder` needed to map
# predicted ids back to crop names is not persisted here — confirm whether it should be.
joblib.dump(best_rf, 'crop_recommendation_model.pkl')


['crop_recommendation_model.pkl']

In [None]:
import joblib

# Load the saved model back from disk to check the round trip.
# joblib files are pickle-based: only load files from a trusted source.
model = joblib.load('crop_recommendation_model.pkl')


In [None]:
# One hand-written sample, in training column order:
# Nitrogen, Phosphorus, Potassium, Temperature, Humidity, pH_Value, Rainfall.
test_input = [[75, 35, 40, 26.5, 80, 6.8, 240]]


In [None]:
# Standardise the sample with `scaler`.
# NOTE(review): `scaler` was fitted on the per-crop `unique_data` table for the
# similarity step, whereas the classifier was trained on unscaled X_train, so
# these scaled values are not on the scale the model was trained on.
test_input_scaled = scaler.transform(test_input)


In [None]:
# The forest was fitted on raw (unscaled) feature values, so it must be queried
# with raw values as well. Feeding it the standardised sample puts every feature
# far outside the training range and yields an unrelated class.
# The sample is wrapped in a DataFrame with the training column names so the
# input layout matches what the model saw during fit.
predicted_label = model.predict(pd.DataFrame(test_input, columns=X_train.columns))[0]

# Map the integer class id back to the crop name.
predicted_crop = encoder.inverse_transform([predicted_label])[0]


In [None]:
# Suggest alternatives to the predicted crop using the feature-similarity recommender.
recommendations = recommend_crop(predicted_crop)
print(f"Predicted Crop: {predicted_crop}")
# recommend_crop ranks by soil/climate feature similarity, not by the TF-IDF
# content similarity, so the output is labelled accordingly.
print(f"Recommended Crops (Feature-Based): {recommendations}")


Predicted Crop: Muskmelon
Recommended Crops (Content-Based): ['Muskmelon', 'Watermelon', 'Cotton']
