### The purpose of this script is to assess the potability of water samples

### Data source: https://raw.githubusercontent.com/amankharwal/Website-data/master/water_potability.csv

### 1. Data preparation and exploration

In [57]:
# Importing relevant libraries

import pandas as pd


In [58]:
# Loading the dataset

dataset = pd.read_csv("C:/Users/ilung/Desktop/data_analysis_project/water quality/water_quality_data.csv")

dataset


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135
1,3.716080,129.422921,18630.05786,6.635246,,592.885359,15.180013,56.329076,4.500656
2,8.099124,224.236259,19909.54173,9.275884,,418.606213,16.868637,66.420093,3.055934
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771
4,9.092223,181.101509,17978.98634,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075
...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681736,47580.99160,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821
3272,7.808856,193.553212,17329.80216,8.061362,,392.449580,19.903225,,2.798243
3273,9.419510,175.762646,33155.57822,7.350233,,432.044783,11.039070,69.845400,3.298875
3274,5.126763,230.603758,11983.86938,6.303357,,402.883113,11.168946,77.488213,4.708658


In [59]:
# We can observe from the dataset that some rows contain null values. Before proceeding with our analysis, we will remove all these NaN rows.

dataset = dataset.dropna() 

dataset

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771
4,9.092223,181.101509,17978.98634,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075
5,5.584087,188.313324,28748.68774,7.544869,326.678363,280.467916,8.399735,54.917862,2.559708
6,10.223862,248.071735,28749.71654,7.513408,393.663395,283.651634,13.789695,84.603556,2.672989
7,8.635849,203.361523,13672.09176,4.563009,303.309771,474.607645,12.363817,62.798309,4.401425
...,...,...,...,...,...,...,...,...,...
3267,8.989900,215.047358,15921.41202,6.297312,312.931021,390.410231,9.899115,55.069304,4.613843
3268,6.702547,207.321086,17246.92035,7.708117,304.510230,329.266002,16.217303,28.878601,3.442983
3269,11.491011,94.812545,37188.82602,9.263166,258.930600,439.893618,16.172755,41.558501,4.369264
3270,6.069616,186.659040,26138.78019,7.747547,345.700257,415.886955,12.067620,60.419921,3.669712


### 2. Assessing the fitness of the water samples for consumption based on drinking water quality standards

ph: 6.0 - 9.0   
Hardness: 50 - 150 mg/r   
Solids-TDS: 0-450 mg/r   
Chloramines: 0-4 mg/l   
Sulfate: 0-200 mg/l      
Conductivity: 0-700 uS/m   
Organic_carbon: 0-5 mg/C   
Trihalomethanes: 0-100 ppm   
Turbidity: 0 - 1 NTU

In [60]:
# Creating a list of index from the sample table

index_list = dataset.index.to_list()

# We are creating two lists. One where the index of suitable water samples will stored and the one where the unsuitable one will be stored

suitable = []

not_suitable = []

In [63]:
# Based on drinking water quality standards, we will be assessing whether or not these samples are fit for consumption
# For water to be fit for consumption, all the parameters must meet the standards.  If at least one parameter does not meet the standard, the sample will be deamed unfit for consumption

for index in index_list:

    if (
        (dataset["ph"][index] >= 6.0 and dataset["ph"][index] <= 9.0) 
        and (dataset["Hardness"][index] >= 50.0 and dataset["Hardness"][index] <= 150.0)
        and (dataset["Solids"][index] >= 0 and dataset["Solids"][index] <= 450.0)
        and (dataset["Chloramines"][index] >= 0 and dataset["Chloramines"][index] <= 4.0)
        and (dataset["Sulfate"][index] >= 0 and dataset["Sulfate"][index] <= 200.0)
        and (dataset["Conductivity"][index] >= 0 and dataset["Conductivity"][index] <= 700.0)
        and (dataset["Organic_carbon"][index] >= 0 and dataset["Organic_carbon"][index] <= 5.0)
        and (dataset["Trihalomethanes"][index] >= 0 and dataset["Trihalomethanes"][index] <= 100.0)
        and (dataset["Turbidity"][index] >= 0 and dataset["Turbidity"][index] <= 1.0)
        ):

        suitable.append(index)

    else:

        not_suitable.append(index)

In [62]:
# After running the analysis, we will quickly check whether or not we have samples that meet the drinking water standards

if suitable == [] :

    print ("All the samples are unfit for consumption")

else:

    print ("At least one sample is fit for consumption")

All the samples are unfit for consumption


### 3. Further analysis

Since all the samples are unfit for consumption, we will now check for each parameter, the proportion of samples that meet the drinking standards

In [83]:
# Converting all columns of the dataset to individual lists

ph_values = dataset["ph"].to_list()

hardness_values = dataset["Hardness"].to_list()

solids_values = dataset["Solids"].to_list()

chloramines_values = dataset["Chloramines"].to_list()

sulfate_values = dataset["Sulfate"].to_list()

conductivity_values = dataset["Conductivity"].to_list()

organic_carbon_values = dataset["Organic_carbon"].to_list()

trihalomethanes_values = dataset["Trihalomethanes"].to_list()

turbidity_values = dataset["Turbidity"].to_list()


Counting the proportion of samples meeting the drinking water standards for each parameters

In [84]:
# for ph

count_ph = 0

for x in ph_values:

        if x  >= 6.0 and x <= 9.0: 
                
                count_ph += 1

percent_ph = round ((count_ph/2011)*100, 2)

In [94]:
# for hardness

count_hardness = 0

for x in hardness_values:

        if x  >= 50.0 and x <= 150.0: 
                
                count_hardness += 1

percent_hardness = round ((count_hardness/2011)*100, 2)

In [95]:
# for solids

count_solids = 0

for x in solids_values:

        if x  >= 0 and x <= 450.0: 
                
                count_solids += 1

percent_solids = round ((count_solids/2011)*100, 2)

In [96]:
# for chloramines

count_chloramines = 0

for x in chloramines_values:

        if x  >= 0 and x <= 4: 
                
                count_chloramines += 1

percent_chloramines = round ((count_chloramines/2011)*100, 2)

In [97]:
# for sulfate

count_sulfate = 0

for x in sulfate_values:

        if x  >= 0 and x <= 200: 
                
                count_sulfate += 1

percent_sulfate = round ((count_sulfate/2011)*100, 2)

In [99]:
# for conductivity

count_conductivity = 0

for x in conductivity_values:

        if x  >= 0 and x <= 700.0: 
                
                count_conductivity += 1

percent_conductivity = round ((count_conductivity/2011)*100, 2)

In [100]:
# for organic_carbon

count_organic_carbon = 0

for x in organic_carbon_values:

        if x  >= 0 and x <= 5.0: 
                
                count_organic_carbon += 1

percent_organic_carbon = round ((count_organic_carbon/2011)*100, 2)

In [101]:
# for trihalomethanes

count_trihalomethanes = 0

for x in trihalomethanes_values:

        if x  >= 0 and x <= 100.0: 
                
                count_trihalomethanes += 1

percent_trihalomethanes = round ((count_trihalomethanes/2011)*100, 2)

In [102]:
# for turbidity

count_turbidity = 0

for x in turbidity_values:

        if x  >= 0 and x <= 1.0: 
                
                count_turbidity += 1

percent_turbidity = round ((count_turbidity/2011)*100, 2)

In [103]:
print (count_ph, "samples out of 2011 (",percent_ph, " %)",  " meet drinking water standard for ph")

print (count_hardness, "samples out of 2011 (",percent_hardness, " %)",  " meet drinking water standard for Hardness")

print (count_solids, "samples out of 2011 (",percent_solids, " %)",  " meet drinking water standard for Solids")

print (count_chloramines, "samples out of 2011 (",percent_chloramines, " %)",  " meet drinking water standard for Chloramines")

print (count_sulfate, "samples out of 2011 (",percent_sulfate, " %)",  " meet drinking water standard for Sulfate")

print (count_conductivity, "samples out of 2011 (",percent_conductivity, " %)",  " meet drinking water standard for Conductivity")

print (count_trihalomethanes, "samples out of 2011 (",percent_trihalomethanes, " %)",  " meet drinking water standard for Trihalomethanes")

print (count_turbidity, "samples out of 2011 (",percent_turbidity, " %)",  " meet drinking water standard for Turbidity")

1319 samples out of 2011 ( 65.59  %)  meet drinking water standard for ph
152 samples out of 2011 ( 7.56  %)  meet drinking water standard for Hardness
1 samples out of 2011 ( 0.05  %)  meet drinking water standard for Solids
59 samples out of 2011 ( 2.93  %)  meet drinking water standard for Chloramines
6 samples out of 2011 ( 0.3  %)  meet drinking water standard for Sulfate
2009 samples out of 2011 ( 99.9  %)  meet drinking water standard for Conductivity
1972 samples out of 2011 ( 98.06  %)  meet drinking water standard for Trihalomethanes
0 samples out of 2011 ( 0.0  %)  meet drinking water standard for Turbidity


End of the analysis