In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv('water_potability.csv')  

# Handle missing values by filling numeric columns with mean
df = df.fillna(df.mean(numeric_only=True))

# Features and target
X = df.drop('Potability', axis=1)
y = df['Potability']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features (important for many classifiers)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

# Dictionary to store accuracy results
results = {}

# Train, predict and evaluate each model
for name, model in models.items():
    # Use scaled data for all except Decision Tree and Random Forest
    if name in ["Decision Tree", "Random Forest"]:
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
    else:
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)

    acc = accuracy_score(y_test, preds)
    results[name] = acc
    
    print(f"\n{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds))

# Summary of accuracies
print("\n\n=== Summary of Model Accuracies ===")
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")



Logistic Regression Accuracy: 0.6280
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       412
           1       0.00      0.00      0.00       244

    accuracy                           0.63       656
   macro avg       0.31      0.50      0.39       656
weighted avg       0.39      0.63      0.48       656


Decision Tree Accuracy: 0.5579
              precision    recall  f1-score   support

           0       0.66      0.61      0.63       412
           1       0.42      0.47      0.44       244

    accuracy                           0.56       656
   macro avg       0.54      0.54      0.54       656
weighted avg       0.57      0.56      0.56       656



  _warn_prf(average, modifier, msg_start, len(result))



Random Forest Accuracy: 0.6738
              precision    recall  f1-score   support

           0       0.69      0.87      0.77       412
           1       0.61      0.34      0.43       244

    accuracy                           0.67       656
   macro avg       0.65      0.60      0.60       656
weighted avg       0.66      0.67      0.65       656


SVM Accuracy: 0.6951
              precision    recall  f1-score   support

           0       0.69      0.92      0.79       412
           1       0.70      0.32      0.44       244

    accuracy                           0.70       656
   macro avg       0.70      0.62      0.61       656
weighted avg       0.70      0.70      0.66       656


K-Nearest Neighbors Accuracy: 0.6280
              precision    recall  f1-score   support

           0       0.69      0.75      0.72       412
           1       0.50      0.42      0.46       244

    accuracy                           0.63       656
   macro avg       0.59      0.59   