In [1]:
import pandas as pd

In [2]:
file_path = '/content/cleaned_data_January.csv'
data = pd.read_csv(file_path)

In [3]:
data

Unnamed: 0,Index,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,2,5.443762,1.744843e-02,1.572150,5.446719,5.290000e-76,0.424143,1,0.184143,0.353065,...,1.886856,3.797756e-02,570.054094,1,2.537141,44.891330,January,31.0,8.0,0
1,52,8.460833,1.726643e-02,2.261920,4.906773,1.170000e-256,1.281136,2,0.158578,0.416052,...,1.579333,5.709999e-10,13.925614,2,2.153062,82.674304,January,2.0,23.0,0
2,64,8.194406,3.304849e-03,2.234792,5.517628,2.410000e-71,1.096523,3,0.018634,0.304331,...,1.944491,2.219495e-05,297.621227,3,2.820591,35.653137,January,2.0,19.0,0
3,90,5.812626,1.061085e-04,1.394378,5.298739,4.120000e-114,0.668546,2,0.329479,0.028763,...,1.540985,5.108170e-02,188.786881,4,2.381271,41.814583,January,8.0,0.0,0
4,116,6.806017,7.272805e-02,2.039388,4.598857,2.420000e-90,1.663742,3,0.070714,0.404988,...,1.777328,6.074994e-03,479.485597,1,2.853797,74.757185,January,5.0,12.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59785,1048529,7.786804,6.998581e-02,1.711585,5.392050,2.670000e-198,0.649898,3,0.000383,0.941883,...,2.031307,2.839999e-10,100.008291,3,2.856463,77.623536,January,11.0,22.0,0
59786,1048540,7.949245,4.639998e-08,1.222226,5.124759,4.160000e-93,0.005463,3,0.000394,0.159728,...,1.966285,2.080003e-11,463.738017,6,2.673157,63.273387,January,18.0,12.0,0
59787,1048554,7.228452,7.081002e-13,2.033834,4.988131,4.110000e-139,0.890923,2,0.216198,0.783007,...,1.481023,2.441092e-03,421.057598,1,2.620322,56.691924,January,10.0,4.0,0
59788,1048562,6.749023,2.859994e-07,1.861213,5.037623,1.130000e-69,0.062389,3,0.286575,0.824672,...,1.522859,7.901708e-03,12.503163,3,2.978230,50.339735,January,17.0,13.0,0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


In [5]:
X = data.drop(columns = ['Month', 'Day', 'Time of Day', 'Target'])
y = data['Target']

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [8]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Support Vector Classifier': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier()
}

In [10]:
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy

In [11]:
print("Model Accuracy Results:")
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy * 100:.2f}%")

Model Accuracy Results:
Random Forest: 98.86%
Support Vector Classifier: 87.34%
Logistic Regression: 80.48%
Decision Tree: 99.74%
