<a href="https://colab.research.google.com/github/cyberdreams8/water-quality-analysis-and-prediction-2/blob/main/PDS_waterquality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##Reading the Dataset

In [None]:
# The raw CSV lives in the project's GitHub repository and is read over HTTPS.
DATA_URL = "https://raw.githubusercontent.com/cyberdreams8/water-quality-analysis-and-prediction-2/refs/heads/main/water_quality_dataset.csv"
Dataset = pd.read_csv(DATA_URL)
Dataset

In [None]:
# List the column labels of the freshly loaded frame.
Dataset.columns

In [None]:
# Display the frame again (same output as the load cell above).
Dataset

In [None]:
# Preview the first rows.
Dataset.head()

In [None]:
# Preview the last rows.
Dataset.tail()

##Sanity Check of Data

In [None]:
# (rows, columns) of the raw data.
Dataset.shape

In [None]:
# Column dtypes and non-null counts.
Dataset.info()

In [None]:
# Number of missing values in each column.
Dataset.isnull().sum()

In [None]:
# Missing values per column as a percentage of the total row count.
Dataset.isnull().sum()/Dataset.shape[0]*100

In [None]:
# Number of rows that exactly repeat an earlier row.
Dataset.duplicated().sum()

In [None]:
# Print how often each distinct value occurs in every text column so that
# misspellings or placeholder ("garbage") entries stand out.
separator = "*" * 30
for text_column in Dataset.select_dtypes(include='object').columns:
  frequencies = Dataset[text_column].value_counts()
  print(frequencies)
  print(separator)

##Exploratory Data Analysis (EDA)

In [None]:
# Column labels, shown again as a reference for the selections below.
Dataset.columns

In [None]:
# Descriptive statistics for the numeric water-quality measurements,
# transposed so that each measurement is one row.
columns = [
    'Temperature Min',
    'Temperature Max',
    'pH Min',
    'pH Max',
    'Conductivity (µmhos/cm) Min',
    'Conductivity (µmhos/cm) Max',
    'Turbidity (NTU)',
    'Dissolved Oxygen (mg/L)',
    'BOD (mg/L)',
    'TDS (mg/L)',
    'Hardness (mg/L)',
]
selected_data = Dataset[columns]
selected_data.describe().transpose()

In [None]:
# Summary of the state and potability label columns, one row per column.
selected_data2 = Dataset.loc[:, ['STATE', 'Potability']]
selected_data2.describe().transpose()

In [None]:
# Balance the classes by randomly duplicating rows of the minority 'Potability' class.
# NOTE(review): this runs before imputation and before the train/test split further
# down, so duplicated rows can land on both sides of that split and inflate the
# reported test scores -- consider oversampling the training fold only.

# Works on the 'Dataset' DataFrame loaded above.

from imblearn.over_sampling import RandomOverSampler

# Separate features (X) and target variable (y)
X = Dataset.drop('Potability', axis=1)
y = Dataset['Potability']

# Initialize RandomOverSampler with a fixed seed so the resampling is reproducible
ros = RandomOverSampler(random_state=42)  # You can change the random state

# Resample the dataset
X_resampled, y_resampled = ros.fit_resample(X, y)

# Rebuild 'Dataset' from the resampled features and re-attach the target column
Dataset = pd.DataFrame(X_resampled, columns=X.columns)
Dataset['Potability'] = y_resampled

# Print the value counts of the 'Potability' column to verify the upsampling
print(Dataset['Potability'].value_counts())

# 'Dataset' itself now holds the oversampled rows (no separate 'upsampled_dataset'
# variable is created); every later cell works on this rebalanced frame.

In [None]:
# Display the frame after oversampling.
Dataset

In [None]:
# One histogram per numeric measurement to inspect its distribution.
for feature in selected_data.columns:
  fig, ax = plt.subplots()
  sns.histplot(x=feature, data=Dataset, ax=ax)
  ax.set_title(feature)
  plt.show()


In [None]:
# One box plot per numeric measurement; points beyond the whiskers are outlier candidates.
for feature in selected_data.columns:
  fig, ax = plt.subplots()
  sns.boxplot(x=feature, data=Dataset, ax=ax)
  ax.set_title(feature)
  plt.show()

In [None]:
# Violin plots: distribution of each attribute, split by potability class.

import matplotlib.pyplot as plt
import seaborn as sns

# Attributes to compare against potability
attributes = [
    'pH Min',
    'pH Max',
    'Turbidity (NTU)',
    'Conductivity (µmhos/cm) Min',
    'Conductivity (µmhos/cm) Max',
    'Hardness (mg/L)',
    'TDS (mg/L)',
]

for attribute in attributes:
    fig, ax = plt.subplots(figsize=(8, 6))  # one figure per attribute
    sns.violinplot(data=Dataset, x='Potability', y=attribute, ax=ax)
    ax.set_title(f'{attribute} vs. Potability')
    ax.set_xlabel('Potability')
    ax.set_ylabel(attribute)
    plt.show()

In [None]:
# Pairwise correlation of the numeric measurements, used to read relationships and
# multicollinearity off the heatmap in the next cell.
# NOTE: 'selected_data' was sliced from 'Dataset' before the oversampling cell
# reassigned 'Dataset', so this matrix describes the original, non-resampled rows.
corr =selected_data.corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix computed in the previous cell.
fig, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(corr, annot=True, cmap="YlGnBu", ax=ax)  # swap "YlGnBu" for any other colormap
plt.show()

##MISSING VALUE TREATMENT

In [None]:
# Missing values per column before any imputation.
Dataset.isnull().sum()

In [None]:
columns_to_impute = [
    'Turbidity (NTU)',
    'Dissolved Oxygen (mg/L)',
    'BOD (mg/L)',
    'TDS (mg/L)',
    'Hardness (mg/L)',
]

# Fill the gaps in each listed column with that column's own median.
for column in columns_to_impute:
    Dataset[column] = Dataset[column].fillna(Dataset[column].median())

In [None]:
# Re-check missing counts after the median imputation.
Dataset.isnull().sum()

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer

# Columns whose gaps are filled from the 5 nearest rows (distance measured
# over these same columns).
knn_impute = [
    'Temperature Min',
    'Temperature Max',
    'pH Min',
    'pH Max',
    'Conductivity (µmhos/cm) Min',
    'Conductivity (µmhos/cm) Max',
]

imputer = KNNImputer(n_neighbors=5)
imputed_block = imputer.fit_transform(Dataset[knn_impute])

# Write the imputed values back; all other columns are left untouched.
Dataset[knn_impute] = imputed_block


In [None]:
# Re-check missing counts after the KNN imputation.
Dataset.isnull().sum()

In [None]:
# Discard rows that have no 'Station Code', then confirm what is still missing.
Dataset = Dataset.dropna(subset=['Station Code'])
Dataset.isnull().sum()

In [None]:
# Display the frame after missing-value treatment.
Dataset

##Handling Outliers

In [None]:
# Box plots of the imputed data, drawn before the outliers are capped.
for measurement in selected_data.columns:
  fig, ax = plt.subplots()
  sns.boxplot(x=measurement, data=Dataset, ax=ax)
  ax.set_title(measurement)
  plt.show()

In [None]:
def wisker(col):
  """Return the box-plot whisker limits of a numeric column.

  The limits are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3 are
  the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

  Missing values are ignored when the quartiles are computed, so a column that
  still contains NaN yields usable limits instead of ``(nan, nan)``.

  Parameters:
    col: array-like of numbers (list, NumPy array or pandas Series).

  Returns:
    tuple ``(lower_bound, upper_bound)``.
  """
  # nanpercentile skips NaN entries; for NaN-free input it matches np.percentile.
  q1, q3 = np.nanpercentile(col, [25, 75])
  iqr = q3 - q1
  lower_bound = q1 - (1.5 * iqr)
  upper_bound = q3 + (1.5 * iqr)
  return lower_bound, upper_bound

In [None]:
# Column labels, as a reference for choosing the columns to cap.
Dataset.columns

In [None]:


# Cap each listed column at its whisker limits: values above the upper limit are
# set to the upper limit, values below the lower limit to the lower limit.
columns_to_cap = [
    'Temperature Min', 'Temperature Max',
    'pH Min', 'pH Max',
    'Conductivity (µmhos/cm) Min', 'Conductivity (µmhos/cm) Max',
]
for capped_column in columns_to_cap:
    lower_bound, upper_bound = wisker(Dataset[capped_column])
    Dataset[capped_column] = Dataset[capped_column].clip(lower=lower_bound, upper=upper_bound)


In [None]:
# Box plots again, to check the effect of the capping step.
for measurement in selected_data.columns:
  fig, ax = plt.subplots()
  sns.boxplot(x=measurement, data=Dataset, ax=ax)
  ax.set_title(measurement)
  plt.show()

##Added columns like Temperature Mean,pH Mean and Conductivity Mean

In [None]:
# Add the midpoint of each Min/Max pair as a new column.
ph_midpoint = Dataset['pH Min'].add(Dataset['pH Max']).div(2)
conductivity_midpoint = (
    Dataset['Conductivity (µmhos/cm) Min'].add(Dataset['Conductivity (µmhos/cm) Max']).div(2)
)
temperature_midpoint = Dataset['Temperature Min'].add(Dataset['Temperature Max']).div(2)

Dataset = Dataset.assign(
    pH_Mean=ph_midpoint,
    Conductivity_Mean=conductivity_midpoint,
    Temperature_Mean=temperature_midpoint,
)


In [None]:
# Display the frame with the three new mean columns.
Dataset

#DATA VISUALIZATION


The plots below are drawn from the cleaned `Dataset` DataFrame built in the preceding sections.

In [None]:




    # 1. Yearly Trend of Water Potability
    # Count samples per (Year, Potability) pair; a class absent in a year counts as 0.
    yearly_potability = Dataset.groupby(['Year', 'Potability']).size().unstack(fill_value=0)
    # Convert the counts to each class's percentage share within its year.
    yearly_potability_pct = yearly_potability.div(yearly_potability.sum(axis=1), axis=0) * 100
    # DataFrame.plot opens a figure of its own unless it is given an Axes, which
    # would leave a figure created with plt.figure() blank and ignore its size.
    # Draw on an explicit Axes so the 12x6 size applies to the chart itself.
    fig, ax = plt.subplots(figsize=(12, 6))
    yearly_potability_pct.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title('Yearly Trend of Water Potability (2012-2021)')
    ax.set_xlabel('Year')
    ax.set_ylabel('Percentage')
    ax.legend(title='Potability')
    ax.grid(True)
    plt.show()




**Year-wise Quality Trend:**


1. Reveals long-term trends in water quality
2. Shows the effectiveness of water management policies
3. Identifies periods of quality improvement or deterioration
4. Helps in future water quality predictions





In [None]:
 # 2. State-wise Water Quality Heatmap
# Mean of four quality parameters per state, shown as an annotated heatmap.
# NOTE(review): 'STATE' spellings are only normalised in the later encoding section,
# so variant spellings of one state presumably appear as separate rows here -- verify.
heatmap_parameters = ['TDS (mg/L)', 'Hardness (mg/L)', 'Turbidity (NTU)', 'BOD (mg/L)']
state_params = Dataset.groupby('STATE')[heatmap_parameters].mean()

fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(state_params, annot=True, fmt='.2f', cmap='YlOrRd', ax=ax)
ax.set_title('State-wise Average Water Quality Parameters')
fig.tight_layout()
plt.show()


***State-wise Quality Comparison:***


1. Compares overall water quality across states
2. Identifies states needing immediate intervention
3. Shows regional patterns in water quality
4. Useful for resource allocation

In [None]:
    # 3. Temporal Changes in pH Levels
    # Yearly average of the minimum and maximum pH, with the band between them shaded.
    yearly_ph = Dataset.groupby('Year')[['pH Min', 'pH Max']].mean()
    years = yearly_ph.index

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(years, yearly_ph['pH Min'], 'b-', label='Min pH')
    ax.plot(years, yearly_ph['pH Max'], 'r-', label='Max pH')
    ax.fill_between(years, yearly_ph['pH Min'], yearly_ph['pH Max'], alpha=0.2)
    ax.set_title('Temporal Changes in pH Levels (2012-2021)')
    ax.set_xlabel('Year')
    ax.set_ylabel('pH Value')
    ax.legend()
    ax.grid(True)
    plt.show()

***pH Range Analysis:***


1. Shows year-wise variations in the average minimum and maximum pH levels
2. Identifies years with concerning pH ranges
3. Helps in targeted pH treatment planning
4. Useful for ecological impact assessment

In [None]:
    # 4. Box Plot of TDS Distribution by State
    fig, ax = plt.subplots(figsize=(15, 6))
    sns.boxplot(data=Dataset, x='STATE', y='TDS (mg/L)', ax=ax)
    plt.xticks(rotation=45)  # state names are long; tilt them so they do not overlap
    ax.set_title('TDS Distribution by State')
    fig.tight_layout()
    plt.show()


In [None]:
    # 5. Yearly Trend of Average Conductivity
    conductivity_columns = ['Conductivity (µmhos/cm) Min', 'Conductivity (µmhos/cm) Max']
    yearly_conductivity = Dataset.groupby('Year')[conductivity_columns].mean()

    fig, ax = plt.subplots(figsize=(12, 6))
    # Max series first, then Min, so the legend order stays Max / Min.
    ax.plot(
        yearly_conductivity.index,
        yearly_conductivity['Conductivity (µmhos/cm) Max'],
        'ro-',
        label='Max Conductivity',
    )
    ax.plot(
        yearly_conductivity.index,
        yearly_conductivity['Conductivity (µmhos/cm) Min'],
        'bo-',
        label='Min Conductivity',
    )
    ax.set_title('Yearly Trend of Average Conductivity (2012-2021)')
    ax.set_xlabel('Year')
    ax.set_ylabel('Conductivity (μmhos/cm)')
    ax.legend()
    ax.grid(True)
    plt.show()

In [None]:
    # 6. Dissolved Oxygen vs BOD Scatter Plot with Year Color Coding
    fig, ax = plt.subplots(figsize=(10, 6))
    scatter = ax.scatter(
        Dataset['Dissolved Oxygen (mg/L)'],
        Dataset['BOD (mg/L)'],
        c=Dataset['Year'],  # colour each point by its sampling year
        cmap='viridis',
    )
    fig.colorbar(scatter, ax=ax, label='Year')
    ax.set_xlabel('Dissolved Oxygen (mg/L)')
    ax.set_ylabel('BOD (mg/L)')
    ax.set_title('Dissolved Oxygen vs BOD Relationship Over Years')
    ax.grid(True)
    plt.show()

In [None]:
    # 7. Temperature Range by State
    # Grouped bars: mean minimum and mean maximum temperature, side by side per state.
    state_temp = Dataset.groupby('STATE')[['Temperature Min', 'Temperature Max']].mean()
    x = np.arange(len(state_temp.index))
    width = 0.35

    fig, ax = plt.subplots(figsize=(15, 6))
    # Shift each series half a bar width left/right of the tick position.
    ax.bar(x - width/2, state_temp['Temperature Min'], width, label='Min Temperature')
    ax.bar(x + width/2, state_temp['Temperature Max'], width, label='Max Temperature')
    ax.set_xlabel('State')
    ax.set_ylabel('Temperature (°C)')
    ax.set_title('Temperature Range by State')
    ax.set_xticks(x)
    ax.set_xticklabels(state_temp.index, rotation=45)
    ax.legend()
    fig.tight_layout()
    plt.show()

In [None]:
    # 8. Yearly Changes in Water Hardness
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=Dataset, x='Year', y='Hardness (mg/L)', ax=ax)
    ax.set_title('Yearly Changes in Water Hardness')
    ax.set_xlabel('Year')
    ax.set_ylabel('Hardness (mg/L)')
    ax.grid(True)
    plt.show()

In [None]:
# 9. Correlation Matrix of Water Quality Parameters
# Computed over all rows of the cleaned 'Dataset' (not broken down by year).
numeric_cols = [
    'Temperature Max',
    'pH Max',
    'Conductivity (µmhos/cm) Max',
    'Turbidity (NTU)',
    'Dissolved Oxygen (mg/L)',
    'BOD (mg/L)',
    'TDS (mg/L)',
    'Hardness (mg/L)',
]
correlation = Dataset[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Water Quality Parameters')
fig.tight_layout()
plt.show()

In [None]:
    # 10. State-wise Potability Analysis
    # Share of each potability class within every state (each row sums to 1).
    state_potability = Dataset.groupby('STATE')['Potability'].value_counts(normalize=True).unstack()
    # DataFrame.plot opens a figure of its own unless it is given an Axes, which
    # would leave a figure created with plt.figure() blank and ignore its size.
    # Draw on an explicit Axes so the 15x6 size applies to the chart itself.
    fig, ax = plt.subplots(figsize=(15, 6))
    state_potability.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title('State-wise Water Potability Distribution')
    ax.set_xlabel('State')
    ax.set_ylabel('Percentage')
    ax.legend(title='Potability')
    fig.tight_layout()
    plt.show()

#ENCODING

#One-hot encode the 'State' column

In [None]:
# Correct the misspelling 'Uttrakhand' to 'Uttarakhand', matching any letter case.
Dataset['STATE'] = Dataset['STATE'].str.replace('Uttrakhand', 'Uttarakhand', case=False)

In [None]:
# Title-case the state names and trim surrounding whitespace so spelling variants collapse into one category.
Dataset['STATE'] = Dataset['STATE'].str.title().str.strip()

In [None]:
# One-hot encode 'STATE'; drop_first=True omits the first category so the dummy columns are not redundant.
Dataset = pd.get_dummies(Dataset, columns=["STATE"], drop_first=True)
Dataset

In [None]:
# Column labels after one-hot encoding.
Dataset.columns

#Encoding Potability

In [None]:
# Encode the target: 'Drinkable' -> 1, 'Not Drinkable' -> 0.
# NOTE: any value not in the mapping becomes NaN, so re-running this cell on the
# already-encoded column would turn every label into NaN.
Dataset['Potability'] = Dataset['Potability'].map({'Drinkable': 1, 'Not Drinkable': 0})
Dataset

#Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Continuous measurements to rescale; identifiers and binary flags are left as they are.
numerical_columns = [
    'Temperature Min',
    'Temperature Max',
    'pH Min',
    'pH Max',
    'Conductivity (µmhos/cm) Min',
    'Conductivity (µmhos/cm) Max',
    'Turbidity (NTU)',
    'Dissolved Oxygen (mg/L)',
    'BOD (mg/L)',
    'TDS (mg/L)',
    'Hardness (mg/L)',
    'pH_Mean',
    'Conductivity_Mean',
    'Temperature_Mean',
]

# Fit on the whole frame and overwrite each listed column with its scaled values.
# NOTE(review): from here on these columns are no longer in physical units, so any
# later step that applies raw-unit thresholds to them must undo this scaling first.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(Dataset[numerical_columns])
Dataset[numerical_columns] = scaled_values

# Summary statistics to confirm the new value range
Dataset[numerical_columns].describe()


In [None]:
# Final missing-value check before modelling.
Dataset.isnull().sum()

##ML MODEL IMPLEMENTATION FOR POTABILITY

1)MODEL 1 - LOGISTIC REGRESSION

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# The target and the identifier columns are excluded from the feature matrix.
non_feature_columns = ['Potability', 'Station Code', 'Station Name', 'Year']
X = Dataset.drop(columns=non_feature_columns)
y = Dataset['Potability']

# 80% training / 20% testing, with a fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the classifier, then score it on the held-out rows.
logistic_reg_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = logistic_reg_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)



In [None]:
# Logistic regression results: accuracy as a percentage, then the classification report.

print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_rep)

2)MODEL 2 - DECISION TREES

In [None]:
# Decision tree classifier on the same train/test split as the logistic regression.

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fixed seed for a reproducible tree; all other hyperparameters are library defaults.
decision_tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred_dt = decision_tree_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
classification_rep_dt = classification_report(y_test, y_pred_dt)




In [None]:
# Decision tree results: accuracy as a percentage, then the classification report.
print(f"Decision Tree Accuracy: {accuracy_dt * 100:.2f}%")
print(classification_rep_dt)

3)MODEL 3 - RANDOM FOREST




In [None]:
# Random forest classifier on the same train/test split.

from sklearn.ensemble import RandomForestClassifier

# Fixed seed for a reproducible forest; all other hyperparameters are library defaults.
random_forest_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred_rf = random_forest_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)




In [None]:
# Random forest results: accuracy as a percentage, then the classification report.
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(classification_rep_rf)

4)MODEL 4 - Support Vector Machine (SVM)



In [None]:
# Support vector classifier on the same train/test split.

from sklearn.svm import SVC

# Linear kernel; other kernels such as 'rbf' or 'poly' can be substituted here.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
classification_rep_svm = classification_report(y_test, y_pred_svm)



In [None]:
# SVM results: accuracy as a percentage, then the classification report.
print(f"SVM Accuracy: {accuracy_svm * 100:.2f}%")
print(classification_rep_svm)

5)MODEL 5 - K-Nearest Neighbors (KNN)

In [None]:
# K-nearest-neighbours classifier on the same train/test split
# (X_train, X_test, y_train, y_test come from the logistic regression cell).

from sklearn.neighbors import KNeighborsClassifier

# Each prediction is a vote among the 5 nearest training rows.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
classification_rep_knn = classification_report(y_test, y_pred_knn)



In [None]:
# KNN results: accuracy as a percentage, then the classification report.
print(f"KNN Accuracy: {accuracy_knn * 100:.2f}%")
print(classification_rep_knn)

6)MODEL 6 - NAIVE BAYES

In [None]:
# Gaussian naive Bayes classifier on the same train/test split.

from sklearn.naive_bayes import GaussianNB

naive_bayes_model = GaussianNB().fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred_nb = naive_bayes_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
classification_rep_nb = classification_report(y_test, y_pred_nb)



In [None]:
# Naive Bayes results: accuracy as a percentage, then the classification report.
print(f"Naive Bayes Accuracy: {accuracy_nb * 100:.2f}%")
print(classification_rep_nb)

In [None]:
# Two-column comparison table: model name and its test accuracy in percent.

import pandas as pd

# One (model name, accuracy in percent) pair per classifier evaluated above.
model_scores = [
    ('Logistic Regression', accuracy * 100),
    ('Decision Tree', accuracy_dt * 100),
    ('Random Forest', accuracy_rf * 100),
    ('Support Vector Machine (SVM)', accuracy_svm * 100),
    ('K-Nearest Neighbors (KNN)', accuracy_knn * 100),
    ('Naive Bayes', accuracy_nb * 100),
]

data = {
    'Model Used': [name for name, _ in model_scores],
    'Accuracy': [score for _, score in model_scores],
}

df = pd.DataFrame(data)
df

##ML MODEL IMPLEMENTATION FOR CALCULATION OF WATER QUALITY INDEX (WQI)

In [None]:
#Adding a column of Water Quality Index (WQI) based on other parameters
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd


# Define a function to calculate WQI
def calculate_wqi(df):
    """Add a weighted 'WQI' column to ``df`` and return the same frame.

    For every parameter listed in ``weights`` that is a column of ``df``, a
    sub-index is computed per row from that parameter's ``ideal_ranges`` entry
    ``(min_val, max_val)``:

    * value below ``min_val``  -> 100
    * value above ``max_val``  -> 0
    * otherwise                -> ``(value - min_val) / (max_val - min_val) * 100``

    The WQI is the sum of ``weight * sub_index`` over those parameters.
    Parameters missing from ``df`` contribute nothing; a NaN value makes that
    row's WQI NaN.

    NOTE(review): the sub-index is 100 just below ``min_val`` but 0 at
    ``min_val`` itself and rises towards ``max_val``; confirm this is the
    intended scoring direction for every parameter.

    Parameters:
        df: pandas DataFrame holding the parameters in the units used by
            ``ideal_ranges``. It is modified in place.

    Returns:
        The same DataFrame object, with the 'WQI' column added or replaced.
    """
    weights = {
        'pH_Mean': 0.15,
        'Conductivity_Mean': 0.1,
        'Temperature_Mean': 0.1,
        'Turbidity (NTU)': 0.1,
        'Dissolved Oxygen (mg/L)': 0.2,
        'BOD (mg/L)': 0.15,
        'TDS (mg/L)': 0.1,
        'Hardness (mg/L)': 0.1
    }

    ideal_ranges = {
        'pH_Mean': (6.5, 8.5),
        'Conductivity_Mean': (0, 500),
        'Temperature_Mean': (0, 35),
        'Turbidity (NTU)': (0, 5),
        'Dissolved Oxygen (mg/L)': (5, 14),
        'BOD (mg/L)': (0, 3),
        'TDS (mg/L)': (0, 500),
        'Hardness (mg/L)': (0, 300)
    }

    def calculate_sub_index(values, param):
        """Return the sub-index of a whole column at once (array in, array out)."""
        min_val, max_val = ideal_ranges[param]
        in_range_score = ((values - min_val) / (max_val - min_val)) * 100
        # NaN fails both comparisons and therefore falls through to in_range_score (NaN).
        return np.where(values < min_val, 100.0,
                        np.where(values > max_val, 0.0, in_range_score))

    # Accumulate column by column instead of row by row; the parameters are added
    # in the order of `weights`, the same order a per-row loop would use.
    wqi_scores = np.zeros(len(df), dtype=float)
    for param, weight in weights.items():
        if param in df.columns:
            column_values = df[param].to_numpy(dtype=float)
            wqi_scores = wqi_scores + weight * calculate_sub_index(column_values, param)

    df['WQI'] = wqi_scores
    return df

# Step 1: Select relevant parameters
params = ['pH_Mean', 'Conductivity_Mean', 'Temperature_Mean',
          'Turbidity (NTU)', 'Dissolved Oxygen (mg/L)',
          'BOD (mg/L)', 'TDS (mg/L)', 'Hardness (mg/L)']
data_params = Dataset[params].dropna()

# Step 2: Standardize the data for clustering.
# A separate name is used so that `scaler` keeps pointing at the MinMaxScaler fitted
# in the normalization cell; Step 4 needs it to recover the raw units.
cluster_scaler = StandardScaler()
data_scaled = cluster_scaler.fit_transform(data_params)

# Step 3: Apply K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
# Assign by index so the labels stay aligned with their rows even if dropna()
# above removed some of them (a plain array would raise a length mismatch).
Dataset['Cluster'] = pd.Series(kmeans.fit_predict(data_scaled), index=data_params.index)

# Step 4: Calculate WQI
# The ideal ranges inside calculate_wqi are in physical units (pH 6.5-8.5,
# TDS 0-500 mg/L, ...), but the measurement columns of `Dataset` were MinMax-scaled
# in the normalization cell. Undo that scaling on a copy and score the raw values;
# `Dataset` itself keeps its scaled features for the models.
raw_measurements = pd.DataFrame(
    scaler.inverse_transform(Dataset[numerical_columns]),
    columns=numerical_columns,
    index=Dataset.index,
)
Dataset['WQI'] = calculate_wqi(raw_measurements[params].copy())['WQI'].to_numpy()
data_with_wqi = Dataset

# Step 5: Calculate average WQI per cluster
cluster_summary = data_with_wqi.groupby('Cluster')['WQI'].mean()

# Display the results
print(Dataset[['Station Code', 'Station Name', 'Cluster', 'WQI']].head())
print("Average WQI for each cluster:\n", cluster_summary)

In [None]:
# Display the frame with the new 'Cluster' and 'WQI' columns.
Dataset

1) Model 1 : Linear Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Features: every numeric column except the target. Target: the WQI score.
numeric_frame = Dataset.select_dtypes(include=np.number)
X = numeric_frame.drop(columns='WQI')
y = Dataset['WQI']

# 80/20 split with a fixed seed; the regression cells that follow reuse it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit, then score on the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)


2) Model 2 : Decision Trees

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Fit a depth-limited tree (max_depth is adjustable) on the WQI train split.
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42).fit(X_train, y_train)

# Score on the held-out rows.
y_pred = dt_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics and a few actual-vs-predicted pairs.
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("\nSample Predictions:")
prediction_preview = pd.DataFrame({'Actual WQI': y_test, 'Predicted WQI': y_pred})
print(prediction_preview.head())

3) Model 3 : Random Forest Regression

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Fit a 100-tree forest (fixed seed) on the WQI train split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics and a few actual-vs-predicted pairs.
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("\nSample Predictions:")
prediction_preview = pd.DataFrame({'Actual WQI': y_test, 'Predicted WQI': y_pred})
print(prediction_preview.head())

4) Model 4 : Support Vector Regression (SVR)

In [None]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Fit an RBF-kernel support vector regressor on the WQI train split.
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1).fit(X_train, y_train)

# Score on the held-out rows.
y_pred = svr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics and a few actual-vs-predicted pairs.
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("\nSample Predictions:")
prediction_preview = pd.DataFrame({'Actual WQI': y_test, 'Predicted WQI': y_pred})
print(prediction_preview.head())

5) Model 5 : K-Nearest Neighbors Regression


In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Fit a 5-neighbour regressor on the WQI train split (the neighbour count is adjustable).
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Score on the held-out rows.
y_pred = knn_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics and a few actual-vs-predicted pairs.
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("\nSample Predictions:")
prediction_preview = pd.DataFrame({'Actual WQI': y_test, 'Predicted WQI': y_pred})
print(prediction_preview.head())

6) Model 6 : Gradient Boosting Regressor

In [None]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Fit a gradient-boosting ensemble (100 depth-3 trees, learning rate 0.1) on the WQI train split.
gbr_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Score on the held-out rows.
y_pred = gbr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics and a few actual-vs-predicted pairs.
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("\nSample Predictions:")
prediction_preview = pd.DataFrame({'Actual WQI': y_test, 'Predicted WQI': y_pred})
print(prediction_preview.head())

#ML MODEL IMPLEMENTATION FOR HEALTH RISK PREDICTION

Performing feature engineering and creating a health-risk label

In [None]:
# Health-risk classifier using the broader ("adjusted") thresholds
def classify_health_risk_adjusted(row):
    """Classify one sample as 'High Risk', 'Moderate Risk' or 'Low Risk'.

    ``row`` must provide 'pH_Mean', 'TDS (mg/L)' and 'Turbidity (NTU)'.
    Each of the three is graded separately:

    * pH: below 6.0 or above 9.0 is high; 7.0 to 8.5 inclusive is low;
      anything else is moderate.
    * TDS: above 700 is high; 300 or less is low; otherwise moderate.
    * Turbidity: above 10 is high; 3 or less is low; otherwise moderate.

    The overall label is the worst of the three grades.
    """
    HIGH, MODERATE, LOW = 'High Risk', 'Moderate Risk', 'Low Risk'

    def grade(value, high_above, low_up_to):
        # Shared rule for TDS and turbidity: one "high" and one "low" cut-off.
        if value > high_above:
            return HIGH
        if value <= low_up_to:
            return LOW
        return MODERATE

    ph = row['pH_Mean']
    tds = row['TDS (mg/L)']
    turbidity = row['Turbidity (NTU)']

    # pH has its own rule because both extremes are risky.
    if ph < 6.0 or ph > 9.0:
        ph_grade = HIGH
    elif 7.0 <= ph <= 8.5:
        ph_grade = LOW
    else:
        ph_grade = MODERATE

    grades = [ph_grade, grade(tds, 700, 300), grade(turbidity, 10, 3)]

    # Worst grade wins: check the most severe label first.
    for level in (HIGH, MODERATE):
        if level in grades:
            return level
    return LOW

# Label every row by applying the classifier row-wise (axis=1), storing the result in 'Health Risk'.
# NOTE(review): the classifier's thresholds are in physical units (pH 6.0-9.0,
# TDS 300/700 mg/L, turbidity 3/10 NTU), but 'pH_Mean', 'TDS (mg/L)' and
# 'Turbidity (NTU)' were MinMax-scaled in the normalization cell. On scaled values
# the pH test `< 6.0` is presumably true for every row, which would label everything
# 'High Risk' -- verify, and classify on the unscaled measurements instead.
Dataset['Health Risk'] = Dataset.apply(classify_health_risk_adjusted, axis=1)

# Show the distribution of health risk categories after adjustment
Dataset['Health Risk'].value_counts()
