In [44]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

subscription_id = '50fc1c3a-5966-4d15-9498-27be4c202b00'
resource_group = 'ml-group'
workspace_name = 'ml-workspace'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='water_potability')
dfo = dataset.to_pandas_dataframe()

## Peak into the data

In [45]:
print(dfo.shape)
dfo.head()

(3276, 10)


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [35]:
dfo.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


## Sort into numeric, categorical and target value

In [36]:
numerical = list(dfo.columns[:-1])
target = [dfo.columns[-1]]

## Checking missing values

In [37]:
dfo.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

## Checking duplicated values

In [38]:
dfo.loc[dfo.duplicated(subset=numerical, keep=False)]

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability


In [39]:
dfo.loc[dfo.duplicated(subset=(numerical + target), keep=False)]

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability


## Fixing missing datapoints

In [61]:
df = dfo.copy()
dfg = df.groupby('Potability')

df['ph'] = dfg.ph.transform(lambda x: x.fillna(x.median()))
df['Sulfate'] = dfg.Sulfate.transform(lambda x: x.fillna(x.median()))
df['Trihalomethanes'] = dfg.Trihalomethanes.transform(lambda x: x.fillna(x.median()))

df.isna().sum()


ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [78]:
X_train.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,7.035456,202.569467,22256.950365,6.913501,304.951498,434.198436,13.696893,64.205667,4.47807
1,5.319769,204.202072,22390.795566,8.053559,315.027505,399.119025,13.974085,83.157997,2.925732
2,7.350379,193.633367,26736.085567,10.416589,309.416883,557.495685,16.519722,61.077383,3.663922
3,7.612517,225.492793,14812.970645,7.982595,356.690174,341.074925,14.994662,55.893916,2.939593
4,7.122637,183.402648,31295.90241,4.425026,333.389426,411.61607,14.066057,77.185792,3.750447


In [96]:
from imblearn.over_sampling import SMOTE
samp = SMOTE()
X = df.drop(['Potability'], axis=1)
y = df['Potability']

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
#X_train, y_train = samp.fit_resample(X_train, y_train)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Train model

In [103]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

model_tree = DecisionTreeClassifier()
model_GBC = GradientBoostingClassifier()

cv_score_tree = cross_val_score(model_tree, X_train, y_train, scoring = 'accuracy', cv=5).mean()
cv_score_GBC = cross_val_score(model_GBC, X_train, y_train, scoring = 'accuracy', cv=5).mean()

print(f'Tree: {cv_score_tree}', f'GBC: {cv_score_GBC}', sep='\n')

Tree: 0.7268872220290431
GBC: 0.7851009222923186
