<a href="https://colab.research.google.com/github/cyberdreams8/water-quality-analysis-and-prediction-2/blob/main/PDS_waterquality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Reading the Dataset

In [4]:
# Load the water-quality table from a CSV file; the path is relative to the
# notebook's working directory.
Dataset = pd.read_csv("water_quality_data.csv")

Example of Thresholds (WHO and general guidelines):
These are examples and may vary based on your local standards.

pH: 6.5 - 8.5


Turbidity: ≤ 5 NTU


Conductivity: ≤ 1500 µS/cm


Chloramines: ≤ 4 mg/L


Trihalomethanes: ≤ 80 µg/L


Summary
Critical Features: pH, turbidity, conductivity, chloramines, and THMs are often considered essential for assessing potability.
Secondary Features: Hardness and sulfate can impact taste and non-health-related qualities but are less critical for immediate safety assessments.



Conclusion
For a robust assessment of water potability, focus primarily on pH, turbidity, conductivity, chloramines, and THMs. Other factors may still be relevant but can be considered supplementary based on local regulations and health standards.
```
# This is formatted as code
```



In [33]:
# Label every sample as potable only when all critical measurements fall
# within their guideline limits, then persist the labelled table.

# Acceptable pH window (inclusive on both ends).
PH_RANGE = (6.5, 8.5)

# Inclusive upper limits for the remaining critical features.
UPPER_LIMITS = {
    'Turbidity': 5,
    'Conductivity': 1500,
    'Chloramines': 4,
    'Trihalomethanes': 80,
}

# NOTE(review): a comparison against NaN evaluates to False, so a sample with
# a missing value in any of these columns is labelled "Not Drinkable" --
# confirm that this is the intended treatment of missing measurements.
conditions = Dataset['ph'].between(*PH_RANGE)
for feature, limit in UPPER_LIMITS.items():
    conditions = conditions & (Dataset[feature] <= limit)

# Translate the boolean mask into the two text labels.
Dataset['Potability'] = conditions.map({True: 'Drinkable', False: 'Not Drinkable'})

# Write the labelled table next to the notebook, without the row index.
Dataset.to_csv('water_quality_data_with_potability.csv', index=False)

In [35]:
# Preview the first five rows of the table.
Dataset.head()

Unnamed: 0,Station Code,Station Name,STATE,Temperature Min,Temperature Max,Year,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,15.0,"WELL AT KUYYURA, A.P.",ANDHRA PRADESH,23.0,28.0,2012,,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,Not Drinkable
1,16.0,WELL AT TADAVAI A.P.,ANDHRA PRADESH,27.0,28.0,2012,3.71608,129.422921,18630.05786,6.635246,,592.885359,15.180013,56.329076,4.500656,Not Drinkable
2,26.0,"WELL AT VIJAYWADA, A.P.",ANDHRA PRADESH,26.0,32.0,2012,8.099124,224.236259,19909.54173,9.275884,,418.606213,16.868637,66.420093,3.055934,Not Drinkable
3,27.0,"WELL AT PEDDAVOORA, A.P.",ANDHRA PRADESH,26.0,27.0,2012,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,Not Drinkable
4,1513.0,"B W. - KRISHNA MURTHY, D.NO. 48-16-43 AUTONAGA...",ANDHRA PRADESH,25.0,30.0,2012,9.092223,181.101509,17978.98634,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,Not Drinkable


In [34]:
# Preview the last five rows of the table.
Dataset.tail()

Unnamed: 0,Station Code,Station Name,STATE,Temperature Min,Temperature Max,Year,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
6311,2533.0,SURI TOWN NEAR BUS STAND,WEST BENGAL,27.0,30.0,2021,8.75856,174.728686,25564.76756,8.070255,,371.915171,14.943699,47.185566,4.444376,Not Drinkable
6312,1773.0,"TANGRA, CALCUTTA , WEST\nBENGAL",WEST BENGAL,28.0,31.0,2021,6.744064,158.534831,16304.52892,7.133913,,514.748525,9.333898,38.54231,3.534126,Not Drinkable
6313,1774.0,"TOPSIA CALCUTTA, WEST\nBENGAL",WEST BENGAL,29.0,31.0,2021,5.524297,157.142424,26220.17421,6.344585,379.788377,475.342271,16.90318,51.106614,4.033391,Not Drinkable
6314,2546.0,ULUBERIA COLLEGE AT HOWRAH,WEST BENGAL,26.0,30.0,2021,6.966835,158.043065,12256.67829,7.105619,371.517435,382.443863,14.40808,78.100735,2.957441,Not Drinkable
6315,2535.0,VISVA BHARATI,WEST BENGAL,30.0,31.0,2021,5.665266,162.890083,31883.01031,8.312484,350.681732,550.817564,15.632268,72.457385,4.571824,Not Drinkable


## Sanity Check of Data

In [36]:
# Table dimensions as (number of rows, number of columns).
Dataset.shape

(6316, 16)

In [37]:
# Per-column dtype and non-null count, plus overall memory usage.
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6316 entries, 0 to 6315
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Station Code     6240 non-null   float64
 1   Station Name     6267 non-null   object 
 2   STATE            6241 non-null   object 
 3   Temperature Min  5818 non-null   float64
 4   Temperature Max  5818 non-null   float64
 5   Year             6316 non-null   int64  
 6   ph               4704 non-null   float64
 7   Hardness         5998 non-null   float64
 8   Solids           6316 non-null   float64
 9   Chloramines      5872 non-null   float64
 10  Sulfate          3986 non-null   float64
 11  Conductivity     6316 non-null   float64
 12  Organic_carbon   5998 non-null   float64
 13  Trihalomethanes  5388 non-null   float64
 14  Turbidity        6123 non-null   float64
 15  Potability       6316 non-null   object 
dtypes: float64(12), int64(1), object(3)
memory usage: 789.6+ KB


In [25]:
# Number of missing entries in each column
Dataset.isna().sum()

Unnamed: 0,0
Station Code,76
Station Name,49
STATE,75
Temperature Min,498
Temperature Max,498
Year,0
ph,1612
Hardness,318
Solids,0
Chloramines,444


In [38]:
# Missing entries per column, expressed as a percentage of the total row count
Dataset.isna().sum().div(len(Dataset)).mul(100)

Unnamed: 0,0
Station Code,1.203293
Station Name,0.775807
STATE,1.18746
Temperature Min,7.884737
Temperature Max,7.884737
Year,0.0
ph,25.522483
Hardness,5.034832
Solids,0.0
Chloramines,7.029766


In [39]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
Dataset.duplicated().sum()

0

In [40]:
# Print the frequency of every distinct value in each object-dtype column so
# that malformed or placeholder entries stand out; a row of asterisks
# separates one column's listing from the next.
text_columns = Dataset.select_dtypes(include='object').columns
for column_name in text_columns:
    print(Dataset[column_name].value_counts())
    print("***" * 10)

Station Name
BEGUSARAI                                                                                    10
PATNA                                                                                        10
FULBARI BARRAGE                                                                               9
NANGAL, PUNJAB                                                                                9
VISVA BHARATI                                                                                 9
                                                                                             ..
HAND PUMP WATER NEAR D.M OFFICE, MADHEPURA                                                    1
HAND PUMP WATER AT COLLECTRIATE OFFICE, MADHUBANI                                             1
TAP WATER INSIDE THE CAMPUS OF TOURIST CAFETERIA,\nNALANDA                                    1
HAND PUMP WATER AT CIVIL COURT, NALANDA                                                       1
GROUND WATER AT BHAGWAN\nIN

## Exploratory Data Analysis (EDA)

In [41]:
# List the column labels of the table.
Dataset.columns

Index(['Station Code', 'Station Name', 'STATE', 'Temperature Min',
       'Temperature Max', 'Year', 'ph', 'Hardness', 'Solids', 'Chloramines',
       'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes',
       'Turbidity', 'Potability'],
      dtype='object')

In [42]:
# Descriptive statistics for the temperature and water-chemistry columns;
# the result is transposed so that each feature occupies one row.
columns = [
    'Temperature Min', 'Temperature Max',
    'ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
    'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]
selected_data = Dataset.loc[:, columns]
selected_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temperature Min,5818.0,25.243589,4.694067,0.0,23.0,26.0,28.0,67.0
Temperature Max,5818.0,27.175799,4.515805,0.0,25.0,28.0,30.0,70.0
ph,4704.0,7.086775,1.578961,0.0,6.098012,7.027297,8.05149,14.0
Hardness,5998.0,196.855747,32.569073,73.492234,177.181298,197.433707,217.254791,323.124
Solids,6316.0,21985.633064,8786.151508,320.942611,15426.91508,20922.15446,27267.98186,61227.19601
Chloramines,5872.0,7.101592,1.569499,0.352,6.106318,7.105619,8.093541,13.127
Sulfate,3986.0,333.912833,41.590145,180.206746,307.997631,333.07963,360.08684,481.030642
Conductivity,6316.0,425.397172,80.069132,181.483754,365.400809,420.830866,480.310714,753.34262
Organic_carbon,5998.0,14.307402,3.322211,2.2,12.079948,14.261743,16.576565,28.3
Trihalomethanes,5388.0,66.610871,15.963696,0.738,56.124024,66.693973,77.382841,124.0


In [43]:
# Summarise the STATE and Potability columns and transpose the result so
# that each column occupies one row.
selected_data2 = Dataset.loc[:, ['STATE', 'Potability']]
selected_data2.describe().transpose()

Unnamed: 0,count,unique,top,freq
STATE,6241,53,RAJASTHAN,585
Potability,6316,2,Not Drinkable,6277


In [None]:
# Histograms to understand the distribution
