# Water Quality Classification using Machine Learning

This notebook explores the water potability dataset and builds a machine learning model (a random forest classifier) that predicts whether a water sample is safe to drink.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Apply seaborn's 'whitegrid' theme globally so the plots drawn after this
# point share the same look.
sns.set(style='whitegrid')

In [None]:
# Read the water potability CSV (path is relative to the notebook's working
# directory) into a DataFrame and preview its first rows.
csv_path = '../data/water_potability.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Check missing values: column dtypes / non-null counts first, then the number
# of missing entries per column.
df.info()
print(df.isnull().sum())

# Fill missing values with each column's median.
# - numeric_only=True restricts the medians to numeric columns, so this step
#   keeps working even if a non-numeric column is present (DataFrame.median()
#   raises on such columns in pandas >= 2.0).
# - Rebinding `df` instead of using inplace=True avoids mutating the frame in
#   place and keeps the step chainable.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed later in this notebook, so test rows influence
# the imputed values. Consider deriving the medians from the training split
# only if a leakage-free evaluation is required.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Histograms: one panel per numeric column, 20 bins each.
df.hist(figsize=(14, 10), bins=20)
# Tighten the layout of the figure just created so panel titles and tick
# labels do not overlap.
plt.gcf().tight_layout()
plt.show()

In [None]:
# Correlation heatmap: pairwise column correlations, annotated to two decimals.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Example boxplot: distribution of 'ph' for each value of 'Potability'.
ph_ax = sns.boxplot(data=df, x='Potability', y='ph')
ph_ax.set_title('pH Levels by Water Potability')
plt.show()

In [None]:
# Train-test split: separate the 'Potability' target from the remaining
# feature columns, then hold out 20% of the rows for testing. Stratifying on
# y keeps the class proportions the same in both splits, and the fixed seed
# makes the split reproducible.
y = df['Potability']
X = df.drop(columns='Potability')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Fit a 100-tree random forest (seeded for reproducibility) on the training
# split, then predict labels for the held-out test split.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Evaluation
# Pin the label order explicitly so the hard-coded display names can never
# drift out of step with the rows/columns scikit-learn produces (by default
# it infers and sorts whatever labels occur in y_test / y_pred, which would
# also shrink the matrix if a class were absent).
# NOTE(review): assumes 0 = not potable and 1 = potable, as the original tick
# labels imply -- confirm against the dataset description.
class_labels = [0, 1]
class_names = ['Not Potable', 'Potable']

print("Classification Report:")
print(classification_report(
    y_test, y_pred, labels=class_labels, target_names=class_names))

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()