In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'smart-agricultural-production-optimizing-engine:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2088120%2F3468346%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240513%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240513T120948Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dad210a82b4855fbccfa5caf48c35afc8158daccfe8067d0c99bb3b78f8692c94b81610dd78067b100cb6c2787bb71a3c9234ef2691a2884a0ddca220c061ac5b7c4f12a00bd65845c0ab51600e39fa096c1367825966958e2c0f922cc26c5818f329b658719c03e76dec12cbcfa40e12b446f446696b8800f6a8f9d5484eeffd1fcc6877ebf5ae5faeb8353ad29be9d903ba3fd19c196a8850dffa3bdf9ae4ec61735f0c79b3d4aac0d2790e55bce42e49b1f5e17ed034027c27b31a3e9e31b581b7693870670d358523ccd212f37e769d6629a8562261cc2ea54d6582dd0af772055868fb09456772988b5ad749e2855b60e27c4a44a389203d62d24ab18eb4'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured Kaggle data source and unpack it under
# KAGGLE_INPUT_PATH/<directory>. Each comma-separated entry of
# DATA_SOURCE_MAPPING has the form "<directory>:<url-encoded download URL>".
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a literal colon in the URL part
    # cannot break the two-name unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header can be absent; in that case the
            # total is unknown and no percentage bar can be drawn.
            content_length = fileres.headers['content-length']
            if content_length and content_length.isdigit():
                total_length = int(content_length)
            else:
                total_length = 0
            size_text = total_length if total_length else 'unknown'
            print(f'Downloading {directory}, {size_text} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length:
                    # Clamp so a body longer than advertised cannot overflow the bar.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # tarfile.open() below reopens the file by name, so buffered
            # bytes must reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would replace the imported module with a TarFile object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is listed first because it is a subclass of OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


<div style="display:fill;
            border-radius:15px;
            background-color:#00bd35;
            font-size:210%;
            font-family:cursive;
            letter-spacing:0.5px;
            padding:10px;
            color:white;
            border-style: solid;
            border-color: black;
            text-align:center;">
<b>
üå±üçéSuitable crop for suitable soil üåæüåø</b>
</div>

<h1><b>1 <span style='color:#00bd35;'>|</span> Importing Libraries and Loading dataset</b></h1>

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>1.1 | Names and uses</b></p>
</div>

<ul>
    <li style="font-size:15px"><h4 style="line-height:25px"><mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Pandas</b></mark> for handling the data. It provides a large number of functions for data handling.<a href="https://github.com/Dhamu785/py/tree/master/pandas"> Refer</a></h4></li>
    <li style="font-size:15px"><h4 style="line-height:25px"><mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Numpy</b></mark> for creating and handling the array.</h4></li>
    <li style="font-size:15px"><h4 style="line-height:25px">
        <mark style="background-color:#00bd35;color:white;border-radius:4px;">
            <b>Matplotlib</b>
        </mark>
        for visualizing the data. It is also helpful in finding patterns in the dataset.<a href="https://www.kaggle.com/dhamur/matplotlib-different-charts"> Refer</a></h4></li>
    <li style="font-size:15px"><h4 style="line-height:25px"><mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Seaborn</b></mark> is a library for making statistical graphics in Python. It builds on top of matplotlib and integrates closely with pandas data structures. Seaborn helps you explore and understand your data.</h4></li>
    <li style="font-size:15px"><h4 style="line-height:25px"><mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Scikit-learn (Sklearn)</b></mark> is the most useful and robust library for machine learning in Python. It provides a selection of efficient tools for <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>machine learning and statistical modeling</b></mark> including classification, regression, clustering and dimensionality reduction via a consistent interface in Python.</h4></li>

</ul>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import BaggingClassifier

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>1.2 | Loading and reading the data</b></p>
</div>
<h4 style="line-height:30px">This dataset is used to recommend a suitable crop for a given soil. It is very useful for crop production (agriculture) without losses, based on soil pH, rainfall, humidity and other chemical components present in the soil. </h4>

In [None]:
df = pd.read_csv('../input/smart-agricultural-production-optimizing-engine/Crop_recommendation.csv')

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>1.3 | About the data</b></p>
</div>
<ul>
    <li><h4 style="line-height:25px"><mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Nitrogen</b></mark> is so vital because it is a <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>major component of chlorophyll</b></mark>, the compound by which plants use sunlight energy to produce sugars from water and carbon dioxide (i.e., photosynthesis). It is also a major component of amino acids, the building blocks of proteins. Without proteins, plants wither and die.</h4></li>
    <hr>
    <li><h4 style="line-height:25px;"><mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Phosphorus</b></mark> is, therefore, important in <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>cell division and development of new tissue.</b></mark> Phosphorus is also associated with complex energy transformations in the plant. Adding phosphorus to soil low in available phosphorus promotes root growth and winter hardiness, stimulates tillering, and often hastens maturity.</h4></li>
    <hr>
    <li><h4 style="line-height:25px"><mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Potassium</b></mark> is a critical nutrient that plants absorb from the soil, and from fertilizer. It <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>increases disease resistance</b></mark>, helps stalks to grow upright and sturdy, improves drought tolerance, and helps plants get through the winter.</h4></li>
    <hr>
    <li><h4 style="line-height:25px">The average <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>soil temperatures</b></mark> for bioactivity range from 50 to 75F. These values are favorable for normal life functions of earth biota that ensure proper organic matter decomposition, <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>increased nitrogen mineralization</b></mark>, uptake of soluble substances, and metabolism.</h4></li>
    <hr>
    <li><h4 style="line-height:25px">The pH range <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>5.5–6.5</b></mark> is optimal for plant growth as the availability of nutrients is optimal.</h4></li>
    <hr>
    <li><h4 style="line-height:25px">Besides disease, <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>rainfall</b></mark> can also determine how fast a crop will grow from seed, including when it will be ready for harvesting. A good balance of rain and proper irrigation can lead to <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>faster-growing plants</b></mark>, which can cut down on germination time and the length between seeding and harvest.</h4></li>
    <hr>
</ul>

In [None]:
df.head()

In [None]:
print("Shape of the dataframe: ",df.shape)
df.isna().sum()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.dtypes


<h1><b>2 <span style='color:#00bd35;'>|</span> 🎯📊EDA-Exploratory data analysis📈📉</b></h1>

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>2.1 | Data distribution</b></p>
</div>


In [None]:
sns.displot(x=df['N'], bins=20,kde=True,edgecolor="black",color='black',facecolor='#ffb03b')
plt.title("Nitrogen",size=20)
plt.show()

In [None]:
sns.displot(x=df['P'],bins=20,color='black',edgecolor='black',kde=True,facecolor='#ffb03b')
plt.title("Phosphorus", size=20)
plt.xticks(range(0,150,20))
plt.show()

In [None]:
sns.displot(x=df['K'],kde=True, bins=20, facecolor='#ffb03b',edgecolor='black', color='black')
plt.title("Potassium",size=20)
plt.show()

In [None]:
sns.displot(x=df['temperature'], bins=20,kde=True,edgecolor="black",color='black',facecolor='#ffb03b')
plt.title("Temperature",size=20)
plt.show()

In [None]:
sns.displot(x=df['humidity'], color='black',facecolor='#ffb03b',kde=True,edgecolor='black')
plt.title("Humidity",size=20)
plt.show()

In [None]:
sns.displot(x=df['rainfall'], color='black',facecolor='#ffb03b',kde=True,edgecolor='black')
plt.title("Rainfall",size=20)
plt.show()

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>2.2 | Categorical plot </b></p>
</div>


In [None]:
sns.relplot(x='rainfall',y='temperature',data=df,kind='scatter',hue='label',height=5)
plt.show()

In [None]:
sns.pairplot(data=df,hue='label')
plt.show()

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>2.3 | Outlier detection using graphs</b></p>
</div>


In [None]:
# Unique crop labels: how many there are, their names, and rows per crop.

crops = df['label'].unique()
print(len(crops))
print(crops)
# Series.value_counts replaces the deprecated top-level pd.value_counts.
print(df['label'].value_counts())

In [None]:
# Build one sub-frame per crop (same order as `crops`) for the box plots,
# then preview the second one.

df2 = [df[df['label'] == crop] for crop in crops]
df2[1].head()

In [None]:
sns.catplot(data=df, x='label', y='temperature', kind='box', height=10, aspect=20/8.27)
# plt.xticks(rotation='vertical')
plt.title("Temperature", size=20)
plt.show()

In [None]:
sns.catplot(data=df, x='label', y='humidity', kind='box', height=10, aspect=20/8.27)
# plt.xticks(rotation='vertical')
plt.title("Humidity", size=20)
plt.show()

In [None]:
# NOTE(review): this cell was an exact repeat of the temperature box plot,
# while rainfall was the only feature without one; rainfall is assumed to be
# the intended column - confirm.
sns.catplot(data=df, x='label', y='rainfall', kind='box', height=10, aspect=20/8.27)
plt.title("Rainfall", size=20)
plt.show()


In [None]:
# Per-crop distribution of nitrogen (N); titled like the other box plots.
sns.catplot(data=df, x='label', y='N', kind='box', height=10, aspect=20/8.27)
plt.title("Nitrogen", size=20)
plt.show()

In [None]:
# Per-crop distribution of soil pH. The title names the plotted column
# (`ph`), not nitrogen.
sns.catplot(data=df, x='label', y='ph', kind='box', height=10, aspect=20/8.27)
plt.title("pH", size=20)
plt.show()

In [None]:
sns.catplot(data=df, x='label', y='P', kind='box', height=10, aspect=20/8.27)
# plt.xticks(rotation='vertical')
plt.title("Phosphorus",size=20)
plt.show()

In [None]:
sns.catplot(data=df, x='label', y='K', kind='box', height=10, aspect=20/8.27)
# plt.xticks(rotation='vertical')
plt.title("Potassium",size=20)
plt.show()

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:white;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;
              color:black;
              text-align:center;"><b>These graphs show that there are no outliers present in this dataset</b></p>
</div>

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>2.4 | Lets check through Mathematics (Statistics)</b></p>
</div>


In [None]:
def detect_outlier(x):
    """Report whether the extremes of ``x`` lie outside the 1.5 * IQR fences.

    Computes the first and third quartiles of ``x``, derives the lower fence
    (``q1 - 1.5 * IQR``) and the upper fence (``q3 + 1.5 * IQR``), then prints
    the fences, the observed minimum and maximum, and exactly one verdict
    line per fence.

    A value lying exactly on a fence is not beyond it, so equality is
    reported as "passed".

    Parameters
    ----------
    x : pandas.Series
        Numeric values; only ``quantile``, ``min`` and ``max`` are used.

    Returns
    -------
    None
        All results are printed rather than returned.
    """
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - (1.5 * iqr)
    upper_limit = q3 + (1.5 * iqr)
    minimum = x.min()
    maximum = x.max()
    print(f"Lower limit: {lower_limit} Upper limit: {upper_limit}")
    print(f"Minimum value: {minimum}   MAximum Value: {maximum}")
    # Check each end independently. Dispatching on `value == x.min()` would
    # skip the upper-fence check whenever the minimum equals the maximum.
    if minimum < lower_limit:
        print("Lower limit failed - Need to remove minimum value")
    else:
        print("Lower limit passed - No need to remove outlier")
    if maximum > upper_limit:
        print("Upper limit failed - Need to remove maximum value")
    else:
        print("Upper limit passed - No need to remove outlier")
detect_outlier(df['K'][df['label']=='grapes'])

In [None]:
for i in df['label'].unique():
    detect_outlier(df['K'][df['label']==i])
    print('---------------------------------------------')

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:white;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;
              color:black;
              text-align:center;"><b>These graphs show that there are no outliers present in this dataset, and this is confirmed with the help of statistics (IQR)</b></p>
</div>

<h1><b>3 <span style='color:#00bd35;'>|</span> 🎯📊Prediction🍅🥭🍎🍐</b></h1>

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>3.1 | Splitting the train and test data</b></p>
</div>


In [None]:
x = df.drop(['label'], axis=1)
x.head()

In [None]:
Y = df['label']
encode = preprocessing.LabelEncoder()
y = encode.fit_transform(Y)
print("Label length: ",len(y))

In [None]:
# Hold out a test set. A fixed random_state makes the split (and every score
# computed from it) reproducible across runs; stratify=y keeps the crop
# proportions the same in the train and test parts.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, random_state=42, stratify=y)
print(len(x_train),len(y_train),len(x_test),len(y_test))

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>3.2 | Best model choosing</b></p>
</div>
<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#636363;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><i>3.2.1 <b>|</b> Decision Tree, Support vector machine, Random forest, K-nearest neighbors</i></p>
</div>

In [None]:
# Candidate models and their hyper-parameter grids for the grid search.
# Each entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the grid of values to try for it
# The parameter names carry a "<step>__" prefix because each model is wrapped
# with make_pipeline (together with a StandardScaler) before being searched;
# the prefix is the lowercased class name of the estimator.
a={'decision tree' : {
        'model' : DecisionTreeClassifier(criterion='gini'),
        'params':{'decisiontreeclassifier__splitter':['best','random']}
    },
    'svm': {
        'model': SVC(gamma='auto',probability=True),
        'params' : {
            'svc__C': [1,10,100,1000],
            'svc__kernel': ['rbf','linear']
        }
    },
    # This key is looked up by name later to pick the final model.
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'randomforestclassifier__n_estimators': [1,5,10]
        }
    },
   'k classifier':{
       'model':KNeighborsClassifier(),
       'params':{'kneighborsclassifier__n_neighbors':[5,10,20,25],'kneighborsclassifier__weights':['uniform','distance']}
   }
}

In [None]:
# Grid-search every candidate in `a` with 5-fold cross-validation on the
# training split, collecting:
#   score      - one summary dict per model (name, best CV score, best params)
#   details    - the full cv_results_ table per model, in the same order
#   best_param - the refitted best pipeline per model, keyed by model name
score = []
details = []
best_param = {}
for model_name, spec in a.items():
    # Scale the features inside the pipeline so scaling is fit per CV fold.
    scaled_model = make_pipeline(preprocessing.StandardScaler(), spec['model'])
    search = model_selection.GridSearchCV(scaled_model, spec['params'], cv=5)
    search.fit(x_train, y_train)
    summary = {
        'Model name': model_name,
        'Best score': search.best_score_,
        'Best param': search.best_params_,
    }
    score.append(summary)
    details.append(pd.DataFrame(search.cv_results_))
    best_param[model_name] = search.best_estimator_
pd.DataFrame(score)

In [None]:
details[0]

In [None]:
details[1]

In [None]:
details[2]

In [None]:
details[3]

In [None]:
score

In [None]:
pd.DataFrame(score)

In [None]:
# Accuracy of each tuned pipeline on the held-out test split.
for model_name, fitted_model in best_param.items():
    print(f'{model_name} : {fitted_model.score(x_test,y_test)}')

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#636363;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><i>3.2.2 <b>|</b> Best model - Random forest</i></p>
</div>

In [None]:
predicted = best_param['random_forest'].predict(x_test)
predicted

In [None]:
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns. The heatmap draws rows along the y-axis
# and columns along the x-axis, so x is "Predicted" and y is "Original".
plt.figure(figsize=(12,8))
sns.heatmap(confusion_matrix(y_test,predicted),annot=True)
plt.xlabel("Predicted")
plt.ylabel("Original")
plt.show()

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#636363;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><i>3.2.3 <b>|</b> Bagging classifier for more accuracy</i></p>
</div>

In [None]:
# Bag 100 scaled random-forest pipelines, each trained on 80% of the samples.
pipe1 = make_pipeline(preprocessing.StandardScaler(),RandomForestClassifier(n_estimators = 10))
# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both spellings.
bag_model = BaggingClassifier(pipe1,n_estimators=100,
                              oob_score=True,random_state=0,max_samples=0.8)

In [None]:
bag_model.fit(x_train,y_train)

In [None]:
bag_model.score(x_test,y_test)

In [None]:
predict = bag_model.predict(x_test)

In [None]:
# Confusion matrix of the bagging model: rows (y-axis) are the true classes,
# columns (x-axis) are the predicted classes.
plt.figure(figsize=(12,8))
sns.heatmap(confusion_matrix(y_test,predict),annot=True)
plt.xlabel("Predicted")
plt.ylabel("Original")
plt.show()

<h1><b>4 <span style='color:#00bd35;'>|</span> 🎊🎊🎊 Conclusion 🎊🎊🎊</b></h1>
<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>4.1 | Value mapping</b></p>
</div>
<h4 style= "line-height:20px;"> <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Value mapping</b></mark> shows which value belongs to which crop. It makes the predicted value easy to read. E.g. if the predicted value is <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>20</b></mark> then it belongs to <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Crop rice.</b></mark> And so on...</h4>

In [None]:
dha2 =pd.DataFrame(Y)
code = pd.DataFrame(dha2['label'].unique())

In [None]:
# Pair each crop name with its integer code. Both columns list unique values
# in order of first appearance, so row i of `code` matches row i of the
# encoded values.
dha = pd.DataFrame(y)
# Use a separate name here: assigning to `encode` would overwrite the fitted
# LabelEncoder and make it unusable afterwards (e.g. for inverse_transform).
encoded_values = pd.DataFrame(dha[0].unique())
refer = pd.DataFrame()
refer['code']=code
refer['encode']=encoded_values
refer

<div style="color:white;
            display:fill;
            border-radius:8px;
            background-color:#2b2b2b;
            font-size:120%;
            font-family:fantasy;
            letter-spacing:0.5px">
    <p style="padding: 8px;color:white;"><b>4.2 | Report</b></p>
</div>
<h4 style="line-height:25px;"> Scores for each crop. Want to know more about <mark style="background-color:#00bd35;color:white;border-radius:4px;"><b>Precision and recall</b></mark>? <br>- <a href="https://en.wikipedia.org/wiki/Precision_and_recall">Wikipedia</a><br> -
    <a href="https://www.youtube.com/watch?v=2osIZ-dSPGE&list=PLeo1K3hjS3uu7CxAacxVndI4bE_o3BDtO&index=19">Youtube</a></h4>

In [None]:
print(classification_report(y_test,predict))

<div style="display:fill;
            border-radius:15px;
            background-color:#00bd35;
            font-size:210%;
            font-family:cursive;
            letter-spacing:0.5px;
            padding:10px;
            color:white;
            border-style: solid;
            border-color: black;
            text-align:center;">
    <b>The End <br><p style = "text-align:center;font-size:20px; color:white"><i>Thank you for visiting</i></p></b>
</div>