# 1. Zbiór danych
<p>
Zbiór danych dotyczący zdatności wody do picia. Zawiera 20 cech, które przedstawiają zawartość poszczególnych związków chemicznych, pierwiastków i mikroorganizmów oraz cechę określającą zdatność do spożycia.
</p>
https://www.kaggle.com/mssmartypants/water-quality

## Wczytanie zbioru

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Load the water-quality CSV (path is relative to the notebook's working directory).
data = pd.read_csv('datasets/waterQuality1.csv', delimiter=',')

# Drop rows that hold the literal '#NUM!' marker in either column.
# The explicit .copy() makes the result an independent frame, so the column
# assignment below does not trigger SettingWithCopyWarning.
has_valid_values = (data['is_safe'] != '#NUM!') & (data['ammonia'] != '#NUM!')
data = data.loc[has_valid_values].copy()

# Only 'ammonia' is converted to a numeric dtype here; 'is_safe' keeps the dtype it was loaded with.
data['ammonia'] = pd.to_numeric(data['ammonia'])
data.head()

## Usuwanie losowych wartości

In [None]:
# Blank out one randomly chosen column in every row whose index label is a
# multiple of 10, producing the missing values that the imputation cells fill in.
# A seeded generator keeps the set of removed cells identical between runs.
rng = np.random.default_rng(42)
rows_to_blank = data.index[data.index % 10 == 0]
for index in rows_to_blank:
    random_col = rng.choice(data.columns)
    data.at[index, random_col] = np.nan

## Zastąpienie brakujących wartości średnią

In [None]:
# Per-column means used for imputation. numeric_only=True restricts the
# computation to numeric columns: 'is_safe' is not converted to a numeric dtype
# when the data is loaded (presumably it still holds strings), and pandas >= 2.0
# raises a TypeError for such columns instead of silently skipping them.
means = data.mean(axis=0, numeric_only=True)

# fillna with a Series fills each column with the value stored under its label.
# Assigning the result back replaces the chained
# `data[col].fillna(..., inplace=True)` pattern, which may operate on a
# temporary copy and leave `data` unchanged.
data = data.fillna(value=means)

## Zastąpienie brakujących wartości medianą

In [None]:
# Per-column medians used for imputation. numeric_only=True restricts the
# computation to numeric columns: 'is_safe' is not converted to a numeric dtype
# when the data is loaded (presumably it still holds strings), and pandas >= 2.0
# raises a TypeError for such columns instead of silently skipping them.
medians = data.median(axis=0, numeric_only=True)

# fillna with a Series fills each column with the value stored under its label.
# Assigning the result back replaces the chained
# `data[col].fillna(..., inplace=True)` pattern, which may operate on a
# temporary copy and leave `data` unchanged.
data = data.fillna(value=medians)

## Zastąpienie najczęściej występującą wartością

In [None]:
# Most frequent label in 'is_safe' (value_counts ignores NaN by default).
value = data['is_safe'].value_counts().idxmax()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column, which may modify a temporary copy rather than `data`.
data['is_safe'] = data['is_safe'].fillna(value=value)

# 2. Statystyki opisowe i podsumowujące

In [None]:
# Summary statistics of `data`, shown as the cell output.
data.describe()

In [None]:
# Print the column overview of `data` (dtypes and non-null counts).
data.info()

## Zależności między zmiennymi
Scatter ploty między każdą parą zmiennych. Na przekątnej wykres gęstości prawdopodobieństwa (rozkład) zmiennej.

In [None]:
import seaborn as sns

# Pairwise plot grid over the columns of `data`; with this many columns the
# figure is large and can take a while to render.
sns.pairplot(data)
plt.show()

#### Zmienne podejrzane o relacje

In [None]:
# Zoom in on the two columns suspected of being related.
sns.pairplot(data[['bacteria', 'viruses']])
plt.show()

## Tabela korelacji

In [None]:
from matplotlib.pyplot import figure

# Open a wide 16x9 canvas first so the annotated heatmap stays legible.
plt.figure(figsize=(16, 9), dpi=80)

# Pairwise correlation between columns, drawn with the coefficient written in each cell.
corrMatrix = data.corr()
sns.heatmap(data=corrMatrix, annot=True)
plt.show()

## Histogramy

In [None]:
# List the column labels, then draw one histogram per column of `data`.
print(data.columns)

data.hist(figsize=(30, 30))
plt.show()

Zmienne podejrzane o relacje

In [None]:
# Pair plot restricted to the 'bacteria' and 'viruses' columns.
sns.pairplot(data[['bacteria', 'viruses']])
plt.show()

## Boxploty

In [None]:
# Box plots of the columns of `data` on one shared axis.
data.boxplot(figsize=(30, 15))
plt.show()

#### Odfiltrowanie wartości odstających

In [None]:
from scipy import stats

# Apply the |z| < 3 rule one column at a time. The order matters: each z-score
# is computed on the rows that survived the previous column's filter.
filtered = data
for column in ('aluminium', 'arsenic', 'nitrites'):
    within_three_sigma = np.abs(stats.zscore(filtered[column])) < 3
    filtered = filtered[within_three_sigma]

# Box plots and column overview of the remaining rows.
filtered.boxplot(figsize=(30, 15))
plt.show()
filtered.info()

In [None]:
# Column overview of the outlier-filtered frame.
filtered.info()

# 3. Skalowanie cech

In [None]:
from sklearn import preprocessing

# Feature columns to standardise, listed once so that fitting, transforming and
# naming the result cannot drift apart. 'is_safe' is the label and is not scaled.
feature_columns = [
    'aluminium', 'ammonia', 'arsenic', 'barium', 'cadmium', 'chloramine',
    'chromium', 'copper', 'flouride', 'bacteria', 'viruses', 'lead',
    'nitrates', 'nitrites', 'mercury', 'perchlorate', 'radium', 'selenium',
    'silver', 'uranium',
]

# NOTE(review): the scaler is fitted on `data` (all rows) but applied to
# `filtered` (outliers removed), and this happens before the train/test split.
# Confirm that this is intended.
scaler = preprocessing.StandardScaler().fit(data[feature_columns])
data_scaled = scaler.transform(filtered[feature_columns])

# Re-attach the label as the last column; dtype=float converts every value to a number.
# Columns are named from the same list that selected them rather than positionally
# from data.columns.
data_scaled = pd.DataFrame(
    np.append(data_scaled, filtered[['is_safe']].to_numpy(), axis=1),
    columns=feature_columns + ['is_safe'],
    dtype=float,
)

data_scaled.boxplot(figsize=(30, 9))
plt.show()
data_scaled.describe()

In [None]:
# Column overview of `data` (the unscaled frame).
data.info()

# 4. Redukcja wymiarowości
### Podział train-test

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last one; the label is 'is_safe'.
X = data_scaled.iloc[:, :-1].to_numpy()
y = data_scaled['is_safe'].to_numpy()

# 80/20 split with a fixed seed, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Only the last expression of a cell is displayed, so the shapes are printed explicitly.
print(y_train.shape, y_test.shape)
np.unique(y_train, return_counts=True)

### Selekcja cech
#### Sequential Forward/Backward Selection

Możliwe metryki (jeżeli metryka nie zostanie podana, wybierana jest domyślna metryka podanego klasyfikatora; można też napisać własną funkcję score'ującą):

In [None]:
import sklearn.metrics

# Names accepted by the `scoring` parameter. sklearn.metrics.SCORERS was removed
# in scikit-learn 1.3; get_scorer_names() is its replacement. The fallback keeps
# the cell working on releases that predate get_scorer_names().
try:
    scorer_names = sklearn.metrics.get_scorer_names()
except AttributeError:
    scorer_names = list(sklearn.metrics.SCORERS.keys())
scorer_names

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

# Example classifier used by the selector to score candidate feature subsets.
knn = KNeighborsClassifier(n_neighbors=3)

# Keep 18 features, adding them one at a time ('forward'; 'backward' is the
# alternative). scoring=None leaves the choice of quality metric to the default.
sfs = SequentialFeatureSelector(
    estimator=knn,
    n_features_to_select=18,
    direction='forward',
    scoring=None,
)

# Fit on the training split only, then project that split onto the kept features.
reduced = sfs.fit(X_train, y_train).transform(X_train)
reduced.shape, sfs.get_support()

### Ekstrakcja cech
#### Algorytm PCA

In [None]:
from sklearn.decomposition import PCA

# Fit an 18-component PCA on the training split and project that split onto it.
pca = PCA(n_components=18)
principalComponents = pca.fit(X_train).transform(X_train)

# Shape of the projection and the total share of variance the components keep.
principalComponents.shape, pca.explained_variance_ratio_.sum()

# 5. Modele decyzyjne

## Generowanie próbek brakującej klasy

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only. A fixed random_state makes the generated
# samples, and therefore every "+ oversampling" result, repeatable between runs.
oversample = SMOTE(random_state=42)
X_train_OS, y_train_OS = oversample.fit_resample(X_train, y_train)

# Project the oversampled split with the already-fitted PCA and feature selector.
principalComponents_OS = pca.transform(X_train_OS)
reduced_OS = sfs.transform(X_train_OS)

In [None]:
# Distinct labels in the oversampled training labels and how often each occurs.
np.unique(y_train_OS, return_counts=True)

## Regresja logistyczna

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# The test split is projected once per reduction method and reused by every variant.
X_test_transformed = pca.transform(X_test)
X_test_reduced = sfs.transform(X_test)

# (label, training features, training labels, matching test features)
lr_variants = [
    ('PCA: ', principalComponents, y_train, X_test_transformed),
    ('PCA+oversampling: ', principalComponents_OS, y_train_OS, X_test_transformed),
    ('SFS: ', reduced, y_train, X_test_reduced),
    ('SFS+oversampling: ', reduced_OS, y_train_OS, X_test_reduced),
]

for label, X_fit, y_fit, X_eval in lr_variants:
    lr = LogisticRegression(random_state=0).fit(X_fit, y_fit)
    y_pred = lr.predict(X_eval)

    # (accuracy, F1) on the untouched test split.
    result = metrics.accuracy_score(y_test, y_pred), metrics.f1_score(y_test, y_pred)
    print(label, result)

    # metrics.plot_confusion_matrix was removed in scikit-learn 1.2, so the
    # matrix is computed explicitly and drawn with ConfusionMatrixDisplay.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=lr.classes_)
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr.classes_).plot()
    plt.show()
## Algorytm k-najbliższych sąsiadów

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# The test split is projected once per reduction method and reused by every variant.
X_test_transformed = pca.transform(X_test)
X_test_reduced = sfs.transform(X_test)

# (label, training features, training labels, matching test features)
knn_variants = [
    ("PCA:", principalComponents, y_train, X_test_transformed),
    ("PCA+oversampling:", principalComponents_OS, y_train_OS, X_test_transformed),
    ("SFS:", reduced, y_train, X_test_reduced),
    ("SFS + oversampling:", reduced_OS, y_train_OS, X_test_reduced),
]

for label, X_fit, y_fit, X_eval in knn_variants:
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(X_fit, y_fit)
    y_pred = neigh.predict(X_eval)

    # Accuracy and F1 on the untouched test split.
    print(label, metrics.accuracy_score(y_test, y_pred), metrics.f1_score(y_test, y_pred))

    # metrics.plot_confusion_matrix was removed in scikit-learn 1.2, so the
    # matrix is computed explicitly and drawn with ConfusionMatrixDisplay.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=neigh.classes_)
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=neigh.classes_).plot()
    plt.show()
## Maszyna wektorów nośnych

In [None]:
from sklearn.svm import LinearSVC

# The test split is projected once per reduction method and reused by every variant.
X_test_transformed = pca.transform(X_test)
X_test_reduced = sfs.transform(X_test)

# (label, class_weight, training features, training labels, matching test features)
# The oversampled variants use no class weighting, consistently for PCA and SFS:
# their training set has already been resampled by SMOTE.
svc_variants = [
    ("PCA:", None, principalComponents, y_train, X_test_transformed),
    ("PCA + balanced:", 'balanced', principalComponents, y_train, X_test_transformed),
    ("PCA + oversampling:", None, principalComponents_OS, y_train_OS, X_test_transformed),
    ("SFS:", None, reduced, y_train, X_test_reduced),
    ("SFS + balanced:", 'balanced', reduced, y_train, X_test_reduced),
    ("SFS + oversampling:", None, reduced_OS, y_train_OS, X_test_reduced),
]

for label, class_weight, X_fit, y_fit, X_eval in svc_variants:
    svc = LinearSVC(class_weight=class_weight, max_iter=5000)
    svc.fit(X_fit, y_fit)
    y_pred = svc.predict(X_eval)

    # Accuracy and F1 on the untouched test split.
    print(label, metrics.accuracy_score(y_test, y_pred), metrics.f1_score(y_test, y_pred))

    # metrics.plot_confusion_matrix was removed in scikit-learn 1.2, so the
    # matrix is computed explicitly and drawn with ConfusionMatrixDisplay.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=svc.classes_)
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=svc.classes_).plot()
    plt.show()

## Drzewo decyzyjne

In [None]:
from sklearn import tree

# The test split is projected once per reduction method and reused by every variant.
X_test_transformed = pca.transform(X_test)
X_test_reduced = sfs.transform(X_test)

# (label, training features, training labels, matching test features)
# The "+ oversampling" variants are trained on the SMOTE-resampled data
# (principalComponents_OS / reduced_OS with y_train_OS); training them on the
# plain training split would only repeat the non-oversampled experiment.
dt_variants = [
    ("PCA:", principalComponents, y_train, X_test_transformed),
    ("PCA + oversampling:", principalComponents_OS, y_train_OS, X_test_transformed),
    ("SFS:", reduced, y_train, X_test_reduced),
    ("SFS + oversampling:", reduced_OS, y_train_OS, X_test_reduced),
]

for label, X_fit, y_fit, X_eval in dt_variants:
    # Fixed seed so that tie-breaking between equally good splits is repeatable.
    dt = tree.DecisionTreeClassifier(random_state=0)
    dt.fit(X_fit, y_fit)
    y_pred = dt.predict(X_eval)

    # Accuracy and F1 on the untouched test split.
    print(label, metrics.accuracy_score(y_test, y_pred), metrics.f1_score(y_test, y_pred))

    # metrics.plot_confusion_matrix was removed in scikit-learn 1.2, so the
    # matrix is computed explicitly and drawn with ConfusionMatrixDisplay.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=dt.classes_)
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dt.classes_).plot()
    plt.show()

### Podsumowanie
Najlepszy model pod względem accuracy score i F1 score: Decision tree + SFS + oversampling
