In [12]:
# Classic,data manipulation and linear algebra
import pandas as pd
import numpy as np

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
import squarify

# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix,  roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
import lightgbm as lgbm
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve,auc
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from yellowbrick.classifier import DiscriminationThreshold

# Stats
import scipy.stats as ss
# from scipy import interp
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Time
from contextlib import contextmanager
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

#ignore warning messages 
import warnings
warnings.filterwarnings('ignore') 

ImportError: cannot import name 'interp' from 'scipy' (C:\Users\anmol\AppData\Local\Programs\Python\Python312\Lib\site-packages\scipy\__init__.py)

In [2]:
#getting the dataset 
data = pd.read_csv("soil_health_monitoring_system.csv")
data=data.drop(['State','City','Weather Condition'],axis=1)


In [3]:
# Convert to DataFrame
df = pd.DataFrame(data)
label_encoder = LabelEncoder()
df['Soil Health'] = label_encoder.fit_transform(df['Soil Health'])
df.head()

Unnamed: 0,Sample ID,Organic Matter (%),Nitrogen (ppm),Phosphorus (ppm),Potassium (ppm),pH Level,Microbial Activity (CFU/g),Soil Structure (1-5),Moisture Retention (%),Iron (ppm),Zinc (ppm),Temperature (°C),Soil Salinity (dS/m),Soil Compaction (g/cm³),Soil Health
0,1,2.6,39,29,241,7.3,886911.1,3,19.4,8.4,1.4,29,0.46,1.22,1
1,2,2.0,35,15,195,6.7,1384623.2,2,14.1,8.2,1.2,25,0.44,1.13,1
2,3,4.8,23,17,202,6.9,1230697.0,2,14.7,7.5,1.0,29,0.27,1.13,0
3,4,3.4,28,22,271,6.4,908546.2,3,14.6,8.2,0.8,29,0.31,1.1,1
4,5,2.1,28,29,256,6.3,903060.4,2,23.8,8.2,1.0,28,0.48,1.25,1


In [11]:
import pandas as pd
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.offline as py

# Prepare datasets based on Soil Health
D = data[data['Soil Health'] != 0]  # Healthy Soil
H = data[data['Soil Health'] == 0]   # Unhealthy Soil

# Function to calculate the median based on the soil health target variable
def median_target(var):
    temp = data[data[var].notnull()]
    temp = temp.groupby('Soil Health')[var].median().reset_index()
    return temp

# Function to plot the distribution of a selected variable
def plot_distribution(data_select, size_bin):
    D = data[data['Soil Health'] != 0][data_select].dropna()  # Healthy Soil
    H = data[data['Soil Health'] == 0][data_select].dropna()  # Unhealthy Soil

    # Check if both datasets are not empty
    if D.empty or H.empty:
        print(f"Warning: No data available for {data_select}.")
        return  # Exit the function if there's no data to plot

    hist_data = [D, H]
    group_labels = ['Healthy Soil', 'Unhealthy Soil']
    colors = ['#FFD700', '#7EC0EE']

    fig = ff.create_distplot(hist_data, group_labels, colors=colors, show_hist=True, bin_size=size_bin, curve_type='kde')
    fig['layout'].update(title=data_select)
    py.iplot(fig, filename='Density plot')

# Function to handle missing values for specific variables
def fill_missing_values(var, healthy_value, unhealthy_value):
    data.loc[(data['Soil Health'] == 0) & (data[var].isnull()), var] = unhealthy_value
    data.loc[(data['Soil Health'] == 1) & (data[var].isnull()), var] = healthy_value

# Update for each variable in the new dataset
variables = {
    'Nitrogen (ppm)': (20.0, 35.0),
    'Organic Matter (%)': (3.5, 5.0),
    'Phosphorus (ppm)': (15.0, 25.0),
    'Potassium (ppm)': (180.0, 220.0),
    'pH Level': (5.5, 6.5),  # Adjust these values as needed
    'Microbial Activity (CFU/g)': (50000, 80000),  # Example values
    'Soil Structure (1-5)': (2, 4),  # Example values
    'Moisture Retention (%)': (30, 50),  # Example values
    'Iron (ppm)': (10, 20),  # Example values
    'Zinc (ppm)': (1, 2),  # Example values
    'Temperature (°C)': (15, 20),  # Example values
    'Soil Salinity (dS/m)': (0.5, 1.0),  # Example values
    'Soil Compaction (g/cm³)': (1.2, 1.5)  # Example values
}

for var, (unhealthy_value, healthy_value) in variables.items():
    median_target(var)
    fill_missing_values(var, healthy_value, unhealthy_value)
    plot_distribution(var, 0)  # Adjust bin size as necessary

# Sample distribution plots for pairs of features
def plot_feat1_feat2(feat1, feat2):
    D = data[data['Soil Health'] != 0]  # Healthy Soil
    H = data[data['Soil Health'] == 0]  # Unhealthy Soil

    if D.empty or H.empty:
        print(f"Warning: No data available for features: {feat1} and {feat2}.")
        return  # Exit if there's no data to plot

    trace0 = go.Scatter(
        x=D[feat1],
        y=D[feat2],
        name='Healthy Soil',
        mode='markers', 
        marker=dict(color='#FFD700', line=dict(width=1))
    )

    trace1 = go.Scatter(
        x=H[feat1],
        y=H[feat2],
        name='Unhealthy Soil',
        mode='markers',
        marker=dict(color='#7EC0EE', line=dict(width=1))
    )

    layout = dict(title=f"{feat1} vs {feat2}",
                  yaxis=dict(title=feat2, zeroline=False),
                  xaxis=dict(title=feat1, zeroline=False))

    fig = dict(data=[trace0, trace1], layout=layout)
    py.iplot(fig)

# Call to plot feature pairs
plot_feat1_feat2('Nitrogen (ppm)', 'Organic Matter (%)')
plot_feat1_feat2('Potassium (ppm)', 'pH Level')

# Function to create a bar plot for categorical variables
def barplot(var_select, sub):
    tmp1 = data[data['Soil Health'] != 0]
    tmp2 = data[data['Soil Health'] == 0]
    
    tmp3 = pd.crosstab(data[var_select], data['Soil Health'])
    tmp3['% Healthy'] = tmp3[1] / (tmp3[1] + tmp3[0]) * 100

    trace1 = go.Bar(
        x=tmp1[var_select].value_counts().index,
        y=tmp1[var_select].value_counts().values,
        text=tmp1[var_select].value_counts().values,
        textposition='auto',
        name='Healthy Soil', 
        opacity=0.8, 
        marker=dict(color='gold', line=dict(color='#000000', width=1))
    )

    trace2 = go.Bar(
        x=tmp2[var_select].value_counts().index,
        y=tmp2[var_select].value_counts().values,
        text=tmp2[var_select].value_counts().values,
        textposition='auto',
        name='Unhealthy Soil', 
        opacity=0.8, 
        marker=dict(color='lightskyblue', line=dict(color='#000000', width=1))
    )

    trace3 = go.Scatter(
        x=tmp3.index,
        y=tmp3['% Healthy'],
        yaxis='y2',
        name='% Healthy', 
        opacity=0.6, 
        marker=dict(color='black', line=dict(color='#000000', width=0.5))
    )

    layout = dict(title=str(var_select) + ' ' + (sub),
                  xaxis=dict(),
                  yaxis=dict(title='Count'),
                  yaxis2=dict(range=[0, 75],
                              overlaying='y',
                              anchor='x',
                              side='right',
                              zeroline=False,
                              showgrid=False,
                              title='% Healthy'))

    fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
    py.iplot(fig)

# Sample barplot call
barplot('Soil Structure (1-5)', 'Soil Structure Analysis')

# Function to create pie chart
def plot_pie(var_select, sub):
    D = data[data['Soil Health'] != 0]
    H = data[data['Soil Health'] == 0]

    col = ['Silver', 'mediumturquoise', '#CF5C36', 'lightblue', 'magenta', '#FF5D73', '#F2D7EE', 'mediumturquoise']
    
    trace1 = go.Pie(values=D[var_select].value_counts(),
                    labels=D[var_select].value_counts().index,
                    textfont=dict(size=15), opacity=0.8,
                    hole=0.5,
                    hoverinfo="label+percent+name",
                    domain=dict(x=[.0, .48]),
                    name="Healthy Soil",
                    marker=dict(colors=col, line=dict(width=1.5)))

    trace2 = go.Pie(values=H[var_select].value_counts(),
                    labels=H[var_select].value_counts().index,
                    textfont=dict(size=15), opacity=0.8,
                    hole=0.5,
                    hoverinfo="label+percent+name",
                    marker=dict(line=dict(width=1.5)),
                    domain=dict(x=[.52, 1]),
                    name="Unhealthy Soil")

    layout = go.Layout(dict(title=var_select + " distribution by Soil Health <br>" + (sub),
                            annotations=[dict(text="Healthy Soil: " + str(len(D)),
                                              font=dict(size=13),
                                              showarrow=False,
                                              x=.22, y=-0.1),
                                         dict(text="Unhealthy Soil: " + str(len(H)),
                                              font=dict(size=13),
                                              showarrow=False,
                                              x=.8, y=-.1)]))

    fig = go.Figure(data=[trace1, trace2], layout=layout)
    py.iplot(fig)

# Sample pie chart call
plot_pie('Temperature (°C)', 'Temperature Distribution')




KeyError: 1