In [106]:
import pandas as pd
import numpy as np

from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.base import ClassifierMixin

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

In [107]:
testdf = pd.read_csv("data/water_quality_test.csv")
traindf = pd.read_csv("data/water_quality_train.csv")
testdf['Potability'] = testdf['Potability'].apply(lambda x: 1 if x == "Yes" else 0)
traindf['Potability'] = traindf['Potability'].apply(lambda x: 1 if x == "Yes" else 0 if x == "No" else x)

traindf
# traindfnotna = traindf.dropna()



Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,169.974849,23403.637304,8.519730,,475.573562,12.924107,50.861913,2.747313,1.0
1,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,
2,5.934279,223.858125,23249.653834,4.602850,,277.384537,11.366863,66.623944,5.217895,1.0
3,6.193170,225.683422,12883.726496,4.739360,,389.405978,14.560136,76.457816,3.466645,
4,,204.852187,13286.156575,9.093843,332.845544,365.687520,8.589896,76.306376,4.268525,
...,...,...,...,...,...,...,...,...,...,...
2759,8.347949,231.203101,13533.230033,7.623604,313.671127,527.380871,18.663901,58.052866,4.217861,
2760,6.836225,166.742962,24293.172663,6.861235,332.881582,390.804955,16.006991,87.101925,3.666614,
2761,6.385471,231.488458,26756.918000,8.774357,,422.067701,16.563116,73.260640,4.681717,
2762,6.490037,205.541517,19397.323889,7.894245,368.561073,479.786195,15.805085,56.982020,2.430883,


In [108]:
x = traindf.drop('Potability', axis=1)
x = x.fillna(x.median())
traindf = pd.concat((x, pd.DataFrame(traindf['Potability']).fillna(-1)), axis=1)

In [109]:
# Lista de clasificadores
classifierList: list[ClassifierMixin] = [
    RandomForestClassifier(),
    MLPClassifier(), 
    LogisticRegression()
]

In [110]:
sampleSize = traindf.shape[0] // len(classifierList)
trainSamples:list[tuple[pd.DataFrame, pd.DataFrame]] = []
for i in range(len(classifierList)):
    sample = traindf.sample(sampleSize)
    trainSamples.append((traindf.drop('Potability', axis=1), pd.DataFrame(traindf['Potability'])))

In [111]:
xTest = testdf.drop('Potability', axis=1)
yTest = pd.DataFrame(testdf['Potability'])

In [112]:
# Convertimos los clasificadores en semi-supervisados

classifierList: list[tuple[str, ClassifierMixin]] = [(str(c), SelfTrainingClassifier(c)) for c in classifierList]

In [113]:
for nc, sam in zip(classifierList, trainSamples):
    nc[1].fit(sam[0].to_numpy(), sam[1].to_numpy().transpose()[0])

In [114]:
scores = [c.score(xTest.to_numpy(), yTest.to_numpy()) for s, c in classifierList]
scores

[0.55078125, 0.50390625, 0.513671875]

In [119]:
predictions = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), 0, np.array([c.predict(xTest.to_numpy()) for s, c in classifierList]).astype(int))
predictions

array([1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,