In [15]:
import pandas as pd
import numpy as np
import joblib 
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from azureml.core import Workspace, Experiment, Dataset, Datastore

# Read the raw water-quality measurements; the path is relative to the
# notebook's working directory.
csv_path = "water_potability.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [18]:
# Count the NaN entries in every column of the raw frame
missing_per_column = df.isna().sum()
print("\nMising Value:\n", missing_per_column)


Mising Value:
 ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64


In [19]:
# Columns the anomaly detector is trained on
features = ['ph', 'Solids', 'Turbidity']
clean_df = df[features].copy()

# Mean-impute missing values. The means are taken from the selected feature
# columns only, so unrelated (or non-numeric) columns in `df` cannot affect or
# break the imputation. The result is assigned instead of using inplace=True so
# the cell stays idempotent when re-run.
clean_df = clean_df.fillna(clean_df.mean())

# Confirm that no missing values remain in the selected features
print("\nMissing Values:\n", clean_df.isnull().sum())

clean_df


Missing Values:
 ph           0
Solids       0
Turbidity    0
dtype: int64


Unnamed: 0,ph,Solids,Turbidity
0,7.080795,20791.318981,2.963135
1,3.716080,18630.057858,4.500656
2,8.099124,19909.541732,3.055934
3,8.316766,22018.417441,4.628771
4,9.092223,17978.986339,4.075075
...,...,...,...
3271,4.668102,47580.991603,4.435821
3272,7.808856,17329.802160,2.798243
3273,9.419510,33155.578218,3.298875
3274,5.126763,11983.869376,4.708658


In [22]:
# Connect to the Azure ML workspace and fetch its default datastore.
# (Nothing is registered here; this cell only looks the datastore up.)
# `ws` is created in this cell so it also works on a fresh kernel.
# NOTE(review): from_config() expects a workspace config.json to be discoverable
# from the working directory -- confirm this matches how `ws` was originally created.
ws = Workspace.from_config()
ds = Datastore.get_default(ws)
ds

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-54e1527f-e158-4b3a-9560-4f60fbf85765",
  "account_name": "ttpjan25purefl4249766154",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [23]:
# Standardize the model inputs with StandardScaler.
# Only the `features` columns are passed in: the prediction cell further down
# adds a string 'Anomaly' column to clean_df, and fitting the scaler on that
# column would raise if this cell were re-run afterwards.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(clean_df[features])

In [24]:
# Fit a One-Class SVM anomaly detector on the standardized features.
# Hyperparameters are gathered in one mapping so they are easy to find and tune.
svm_params = {"kernel": "rbf", "gamma": 0.1, "nu": 0.05}
oc_svm = OneClassSVM(**svm_params)
oc_svm.fit(df_scaled)

In [25]:
# Score every row with the fitted model (-1 = anomaly, 1 = normal)
anomaly_predictions = oc_svm.predict(df_scaled)

# Attach a human-readable label column to the feature frame
label_by_code = {1: 'Normal', -1: 'Anomaly'}
clean_df['Anomaly'] = pd.Series(anomaly_predictions, index=clean_df.index).map(label_by_code)

# Print the last rows of the labelled frame
print(clean_df.tail(20)) 

# Persist the fitted model and the scaler to disk
joblib.dump(oc_svm, "one_class_svm.pkl")
joblib.dump(scaler, "scaler.pkl")

             ph        Solids  Turbidity  Anomaly
3256   7.607224  39184.846720   3.525027   Normal
3257   6.683368  18989.316768   5.208061   Normal
3258   6.638411   9772.504814   5.164057   Normal
3259   9.271355  16540.979048   4.333953   Normal
3260   7.080795   9000.025591   3.699558   Normal
3261   3.629922  24856.633209   4.754826   Normal
3262   8.378108  28474.202580   4.524693   Normal
3263   6.923636  24792.525623   4.013339   Normal
3264   5.893103  20526.666156   4.390702   Normal
3265   8.197353  27701.794055   3.361833   Normal
3266   8.372910  14622.745494   4.906358   Normal
3267   8.989900  15921.412018   4.613843   Normal
3268   6.702547  17246.920347   3.442983   Normal
3269  11.491011  37188.826022   4.369264  Anomaly
3270   6.069616  26138.780191   3.669712   Normal
3271   4.668102  47580.991603   4.435821  Anomaly
3272   7.808856  17329.802160   2.798243   Normal
3273   9.419510  33155.578218   3.298875   Normal
3274   5.126763  11983.869376   4.708658   Normal


['scaler.pkl']