In [4]:
# Load the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the data from the csv file
df = pd.read_csv('../data/gardening_crops.csv')
df.head()

Unnamed: 0,Temperature,PH,Soil,Waterlevel,Space,Label
0,31.123421,6.5,Well-drained Sandy Soil,High,0.025,Raddish
1,31.023871,6.0,Well-drained Sandy Soil,High,0.026,Raddish
2,29.075632,7.0,Well-drained Carbonic Soil,Moderate,0.027,Raddish
3,31.076125,7.5,Well-drained Sandy Soil,High,0.028,Raddish
4,31.165421,6.8,Well-drained Sandy Soil,High,0.029,Raddish


# We are going to create a KNN model that groups the plants (Label Column) based on the features (all other columns)

In [6]:
# Check the data types
df.dtypes

Temperature    float64
PH             float64
Soil            object
Waterlevel      object
Space          float64
Label           object
dtype: object

In [7]:
# Here, we can see that the Soil, Waterlevel, and the Label columns are of type object.
# Let's see if we can convert them to numeric values.
# We can use the unique() method to see the unique values in the column.

df['Soil'].unique()

array(['Well-drained Sandy  Soil', 'Well-drained Carbonic Soil',
       'Well drained Carbonic Soil', 'Loamy Sandy Soil',
       'Well drained Loamy Sandy Soil',
       'Well-drained Deep Fertile Loamy Soil ',
       'Well drained Loamy Soil ', 'Well-drained Loamy Soil ',
       'rich humus well drained carbonic Soil',
       'Well drained carbonic soil', 'Well drained Loamy Soil',
       'Well drained Loamy  Soil', 'Well drained Red Podzolic soil ',
       'Well drained  Sandy Loamy Soil', 'Well drained Sandy Loamy Soil',
       'Well drained Carbonic Loamy Sandy Soil',
       'Well drained  Loamy Sandy Soil',
       'Well drained Carbonic Loamy Clay Soil',
       'Well drained Loamy Clay Soil', 'Moisturized Soil',
       'Well drained acidic soil ', 'Well drained soil soil ',
       'Well drained soil', 'Well drained soil ',
       'Well drained  acidic soil', 'Well drained  soil',
       'Well drained   soil', 'Light Well drained  Soil',
       'Well drained Carbonic soil', 'Well dr

In [8]:
df['Waterlevel'].unique()

array(['High', 'Moderate', 'Low'], dtype=object)

In [11]:
# Our features are all of the columns except the Label column.
features = df.columns[:-1]
df_features = df[features]

df_features.head()

Unnamed: 0,Temperature,PH,Soil,Waterlevel,Space
0,31.123421,6.5,Well-drained Sandy Soil,High,0.025
1,31.023871,6.0,Well-drained Sandy Soil,High,0.026
2,29.075632,7.0,Well-drained Carbonic Soil,Moderate,0.027
3,31.076125,7.5,Well-drained Sandy Soil,High,0.028
4,31.165421,6.8,Well-drained Sandy Soil,High,0.029


In [13]:
# Create dummy variables for Soil and Waterlevel and append them to the df_features dataframe.
df_dummies = pd.get_dummies(df[['Soil', 'Waterlevel']])
df_features = pd.concat([df_features, df_dummies], axis=1)
df_features.drop(['Soil', 'Waterlevel'], axis=1, inplace=True)
df_features.head()

Unnamed: 0,Temperature,PH,Space,Soil_Carbonic Loamy Sandy Soil,Soil_Light Well drained Soil,Soil_Loamy Sandy Soil,Soil_Moisturized Soil,Soil_Well drained soil,Soil_Well drained Loamy Sandy Soil,Soil_Well drained Sandy Loamy Soil,...,Soil_Well drained soil.1,Soil_Well drained soil soil,Soil_Well-drained Carbonic Soil,Soil_Well-drained Deep Fertile Loamy Soil,Soil_Well-drained Loamy Soil,Soil_Well-drained Sandy Soil,Soil_rich humus well drained carbonic Soil,Waterlevel_High,Waterlevel_Low,Waterlevel_Moderate
0,31.123421,6.5,0.025,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
1,31.023871,6.0,0.026,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
2,29.075632,7.0,0.027,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
3,31.076125,7.5,0.028,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
4,31.165421,6.8,0.029,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False


In [14]:
# Convert all of the boolean values to numeric values.
df_features = df_features.astype('int64')
df_features.head()

Unnamed: 0,Temperature,PH,Space,Soil_Carbonic Loamy Sandy Soil,Soil_Light Well drained Soil,Soil_Loamy Sandy Soil,Soil_Moisturized Soil,Soil_Well drained soil,Soil_Well drained Loamy Sandy Soil,Soil_Well drained Sandy Loamy Soil,...,Soil_Well drained soil.1,Soil_Well drained soil soil,Soil_Well-drained Carbonic Soil,Soil_Well-drained Deep Fertile Loamy Soil,Soil_Well-drained Loamy Soil,Soil_Well-drained Sandy Soil,Soil_rich humus well drained carbonic Soil,Waterlevel_High,Waterlevel_Low,Waterlevel_Moderate
0,31,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,31,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,29,7,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,31,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,31,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
