## PRZEGLĄD RÓŻNYCH KLASYFIKACJI

In [0]:
# k-NN - k nearest neighbours
# kernel k-NN - k-NN whose neighbour votes are weighted by distance
# MLP - multilayer perceptron
names = ["k-NN", "kernel k-NN", "MLP"]

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [0]:
# Classifiers listed in the same order as the labels in `names`:
# plain k-NN, distance-weighted k-NN and a small multilayer perceptron.
classifiers = [KNeighborsClassifier(),
               KNeighborsClassifier(weights='distance'),
               MLPClassifier(solver='lbfgs', alpha=1e-5,
                             hidden_layer_sizes=(5, 2), random_state=1)]

In [0]:
# KOD POTRZEBNY DO ŁADNEJ WIZUALIZACJI WYNIKÓW RAZEM Z PRZYKŁADOWYMI DANYMI
from matplotlib.colors import ListedColormap
from sklearn.datasets import *
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

def _plot_samples(ax, X_train, X_test, y_train, y_test, cmap, xx, yy):
    """Scatter the train/test points on *ax* and clamp the axes to the mesh extent."""
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap,
               edgecolors='k')
    # Test points are semi-transparent so they can be told apart from training points.
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())


def draw_graph(names, classifiers):
    """Plot the decision surfaces of several classifiers on three toy datasets.

    The figure has one row per dataset (moons, circles, a noisy two-class
    problem). The first column shows the raw data; each following column shows
    one classifier fitted on the training part, its decision surface, and its
    accuracy on the held-out part in the lower-right corner.

    Args:
        names: Titles for the classifier columns, in the same order as
            ``classifiers``.
        classifiers: Estimators exposing ``fit``, ``score`` and either
            ``decision_function`` or ``predict_proba``. They are fitted in
            place, once per dataset.
    """
    h = .02  # step size of the mesh the decision surface is evaluated on

    # random_state=None: this dataset is drawn from the global NumPy RNG.
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=None, n_clusters_per_class=1)
    # A private generator keeps the noise and the moons/circles datasets
    # reproducible without reseeding the global RNG as a side effect
    # (np.random.seed() returns None, so it cannot serve as a generator).
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)

    datasets = [make_moons(noise=0.3, random_state=rng),
                make_circles(noise=0.2, factor=0.5, random_state=rng),
                linearly_separable
                ]

    n_rows = len(datasets)
    n_cols = len(classifiers) + 1  # extra leading column for the raw input data
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])

    plt.figure(figsize=(27, 9))
    plot_index = 1
    for ds_cnt, (X, y) in enumerate(datasets):
        # Standardize the features, then split into training and test parts.
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=.4, random_state=42)

        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        # First column: the dataset on its own.
        ax = plt.subplot(n_rows, n_cols, plot_index)
        if ds_cnt == 0:
            ax.set_title("Input data")
        _plot_samples(ax, X_train, X_test, y_train, y_test, cm_bright, xx, yy)
        plot_index += 1

        # Remaining columns: one per classifier.
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(n_rows, n_cols, plot_index)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Evaluate the classifier on every mesh point to colour the
            # region [x_min, x_max] x [y_min, y_max].
            mesh_points = np.c_[xx.ravel(), yy.ravel()]
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(mesh_points)
            else:
                Z = clf.predict_proba(mesh_points)[:, 1]

            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

            _plot_samples(ax, X_train, X_test, y_train, y_test, cm_bright, xx, yy)
            if ds_cnt == 0:
                ax.set_title(name)
            # Test accuracy, printed without the leading zero (e.g. ".93").
            ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            plot_index += 1

    plt.tight_layout()
    plt.show()

In [0]:
# Compare the classifiers defined above on the toy datasets
draw_graph(names, classifiers)

## PODSTAWOWE TRENOWANIE MODELU

In [0]:
import pandas
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn import datasets

In [0]:
# Iris data set (4 features), one of scikit-learn's bundled training sets (so-called "toy datasets"),
#   described here: http://archive.ics.uci.edu/ml/datasets/Iris?ref=datanews.io
iris = datasets.load_iris()

In [0]:
# "Toy datasets" expose the features and the target labels as separate fields
iris_parameters = iris.data # features - sepal and petal lengths and widths
iris_values = iris.target # targets - iris species (3 different classes)

In [0]:
# Prepare a random train/test split: shuffle the row indices with a fixed seed
# and compute how many rows (`test_percent` percent of the data) go to the test set
np.random.seed(0)
indices = np.random.permutation(len(iris_parameters))
test_percent = 15.0
test_set_size = int(len(iris_parameters) * test_percent / 100.0)

In [0]:
# Build the training and test sets: the last `test_set_size` shuffled indices
# form the test set, all the others the training set
iris_parameters_train = iris_parameters[indices[:-test_set_size]]
iris_values_train = iris_values[indices[:-test_set_size]]
iris_parameters_test = iris_parameters[indices[-test_set_size:]]
iris_values_test = iris_values[indices[-test_set_size:]]

In [0]:
# Set up the nearest-neighbours classifier (5 neighbours, Euclidean distance)
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')

In [0]:
# Train the model on the training set
knn.fit(iris_parameters_train, iris_values_train)

In [0]:
# Apply the model to the test set
predicted_values = knn.predict(iris_parameters_test)

In [0]:
# First (up to 30) values predicted by the model on the test set
predicted_values[:30]

In [0]:
# First (up to 30) true values in the test set
iris_values_test[:30]

In [0]:
# Classification quality: the fraction of test samples predicted correctly
correct = sum(
    1
    for guess, truth in zip(predicted_values, iris_values_test)
    if guess == truth
)
correct_percent = correct / len(predicted_values)
print(correct_percent)

## TRENOWANIE MODELU NA DANYCH WCZYTANYCH Z PLIKU
## ZNOWU UŻYJEMY DANYCH O IRYSACH

In [4]:
# Load the data from a file with pandas
iris_from_file = pandas.read_csv('iris_data')
print(type(iris_from_file))
iris_from_file

FileNotFoundError: ignored

In [0]:
# Convert the pandas DataFrame into a NumPy array.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
iris_matrix = iris_from_file.to_numpy()
print(type(iris_matrix))
iris_matrix

In [0]:
# Split each row into the attributes (first four columns) and the value
# (fifth column), both collected as plain lists
iris_file_data = [flower[:4] for flower in iris_matrix]
iris_file_values = [flower[4] for flower in iris_matrix]
print(type(iris_file_data))
print(type(iris_file_values))
print(iris_file_data[:10])
print(iris_file_values[:10])

In [0]:
# Turn those lists into NumPy arrays
iris_parameters = np.array(iris_file_data)
not_mapped_iris_values = np.array(iris_file_values)

In [0]:
# Replace the array of label strings with an array of matching integer ids:
# every distinct label gets its position in the sorted list of unique labels
unique_flowers = np.unique(not_mapped_iris_values)
indices_map = {flower: position for position, flower in enumerate(unique_flowers)}

iris_values = np.array([indices_map[name] for name in not_mapped_iris_values])
print(iris_values)

In [0]:
# The data now has the same form as before, so the classification can be repeated
# (this time holding out 20 percent of the rows for testing)
np.random.seed(0)
indices = np.random.permutation(len(iris_parameters))
test_percent = 20.0
test_set_size = int(len(iris_parameters) * test_percent / 100.0)
iris_parameters_train = iris_parameters[indices[:-test_set_size]]
iris_values_train = iris_values[indices[:-test_set_size]]
iris_parameters_test = iris_parameters[indices[-test_set_size:]]
iris_values_test = iris_values[indices[-test_set_size:]]
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(iris_parameters_train, iris_values_train)
predicted_values = knn.predict(iris_parameters_test)

In [0]:
# Classification quality: the fraction of test samples predicted correctly
correct = sum(
    1
    for guess, truth in zip(predicted_values, iris_values_test)
    if guess == truth
)
correct_percent = correct / len(predicted_values)
print(correct_percent)

## WCZYTANIE DANYCH TEKSTOWYCH I WYTRENOWANIE MODELU NA NICH

In [0]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

In [0]:
# For simplicity, restrict the corpus to these four categories
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [0]:
# Use the data shipped through sklearn.datasets (training subset, shuffled with a fixed seed)
dataset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

In [0]:
# Show one raw document from the corpus
print(dataset.data[1])

In [0]:
# Show the target entries of the first 30 documents
print(dataset.target[:30])

In [0]:
# Build the data sets: shuffle the documents and hold out `test_percent` percent for testing
set_data = np.array(dataset.data)
set_target = np.array(dataset.target)
# Seed the RNG before shuffling, like the other train/test splits in this
# notebook, so that the split (and the reported accuracy) is reproducible
np.random.seed(0)
indices = np.random.permutation(len(set_data))
test_percent = 15.0
test_set_size = int(len(set_data) * test_percent / 100.0)
parameters_train = set_data[indices[:-test_set_size]]
values_train = set_target[indices[:-test_set_size]]
parameters_test = set_data[indices[-test_set_size:]]
values_test = set_target[indices[-test_set_size:]]

In [0]:
# Vectorizer that turns raw documents into token-count vectors
count_vect = CountVectorizer()

In [0]:
# Fit the vectorizer on the training documents and build their token-count matrix
arguments = count_vect.fit_transform(parameters_train)

In [0]:
# Convert the raw counts into TF-IDF features
tfidf_transformer = TfidfTransformer()
tfidf_arguments = tfidf_transformer.fit_transform(arguments)

In [0]:
# Fit a multinomial naive Bayes classifier on the TF-IDF features
clf = MultinomialNB().fit(tfidf_arguments, values_train)

In [0]:
# Transform the test documents with the vectorizer and TF-IDF transformer
# that were already fitted on the training set (transform only, no re-fitting)
X_new_counts = count_vect.transform(parameters_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [0]:
# Predict the category of every test document; keep the true labels for comparison
predicted = clf.predict(X_new_tfidf)
actual = values_test

In [0]:
# Compare the first 30 predictions with the first 30 true labels
print(predicted[:30])
print(actual[:30])

In [0]:
# Fraction of test documents whose predicted category matches the true one
correct = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
correct_percent = correct / len(predicted)
print(correct_percent)

## BEZ NORMALIZACJI - ZBIÓR 147-WYMIAROWY

In [0]:
# Load the data set: column `arg_number` holds the class label, every other
# column is a feature
pandas_data_from_file = pandas.read_csv('testing.csv')
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
matrix_data = pandas_data_from_file.to_numpy()
single_node_length = len(matrix_data[0])
arg_number = 0

# Only the first 10000 rows are used
sample = matrix_data[:10000]
arguments = np.delete(sample, arg_number, axis=1)
values2 = np.array([node[arg_number] for node in sample])

# Map every distinct class label to a consecutive integer id
unique_values = np.unique(values2)
indices_map = {label: label_id for label_id, label in enumerate(unique_values)}
values = np.array([indices_map[name] for name in values2])

data_len = len(arguments)

# Reproducible shuffle; the last `test_set_size` shuffled rows form the test set
np.random.seed(0)
indices = np.random.permutation(data_len)
test_percent = 15.0
test_set_size = int(data_len * test_percent / 100.0)

parameters_train = arguments[indices[:-test_set_size]]
values_train = values[indices[:-test_set_size]]
parameters_test = arguments[indices[-test_set_size:]]
values_test = values[indices[-test_set_size:]]

knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(parameters_train, values_train)

predicted_values = knn.predict(parameters_test)
print("\nNUMBER OF DIMENSIONS: {}".format(single_node_length - 1))
# Accuracy: fraction of test rows whose predicted class equals the true class
correct = sum(1 for guess, truth in zip(predicted_values, values_test) if guess == truth)
correct_percent = correct / len(predicted_values)
print(correct_percent)

## BEZ NORMALIZACJI - ZBIÓR 34-WYMIAROWY

In [0]:
# Load the data set: column `arg_number` holds the class label, every other
# column is a feature
pandas_data_from_file = pandas.read_csv('dimcurs.data')
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
matrix_data = pandas_data_from_file.to_numpy()
single_node_length = len(matrix_data[0])
arg_number = 4

# Only the first 10000 rows are used
sample = matrix_data[:10000]
# Remove the label column from the features. The previous slicing
# (node[:arg_number] + node[arg_number:]) kept it, which leaked the target
# into the classifier's input.
arguments = np.delete(sample, arg_number, axis=1)
values = np.array([node[arg_number] for node in sample])

data_len = len(arguments)

# Reproducible shuffle; the last `test_set_size` shuffled rows form the test set
np.random.seed(0)
indices = np.random.permutation(data_len)
test_percent = 15.0
test_set_size = int(data_len * test_percent / 100.0)

parameters_train = arguments[indices[:-test_set_size]]
values_train = values[indices[:-test_set_size]]
parameters_test = arguments[indices[-test_set_size:]]
values_test = values[indices[-test_set_size:]]

knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
knn.fit(parameters_train, values_train)

predicted_values = knn.predict(parameters_test)
print("\nNUMBER OF DIMENSIONS: {}".format(single_node_length - 1))
# Accuracy: fraction of test rows whose predicted class equals the true class
correct = sum(1 for guess, truth in zip(predicted_values, values_test) if guess == truth)
correct_percent = correct / len(predicted_values)
print(correct_percent)

In [0]:
def normalize(array):
    """Min-max scale every column of a 2-D table into the range [0, 1].

    Each value ``v`` of a column becomes ``(v - min) / (max - min)``, where
    ``min`` and ``max`` are taken over that column. A constant column
    (``max == min``) is mapped to zeros instead of dividing by zero.

    The per-column extrema are printed as a list of ``{'max': ..., 'min': ...}``
    dictionaries, one per column.

    Args:
        array: 2-D sequence or array of numbers (rows are samples, columns
            are features).

    Returns:
        A new float ``numpy.ndarray`` with the shape of the input; an empty
        input yields an empty array.

    Raises:
        ValueError: If a non-empty input is not two-dimensional.
    """
    data = np.asarray(array, dtype=float)
    if data.size == 0:
        # Nothing to scale; return a copy so the caller's data is never aliased.
        return data.copy()
    if data.ndim != 2:
        raise ValueError("normalize() expects a 2-D array of shape (rows, columns)")

    minimum = data.min(axis=0)
    maximum = data.max(axis=0)
    maxmins = [{'max': hi, 'min': lo}
               for hi, lo in zip(maximum.tolist(), minimum.tolist())]
    print(maxmins)

    size = maximum - minimum
    # Constant columns have zero range: divide by 1 so they come out as 0.0
    # rather than NaN or a ZeroDivisionError.
    safe_size = np.where(size == 0, 1.0, size)
    return (data - minimum) / safe_size

## NORMALIZACJA - ZBIÓR 147-WYMIAROWY

In [0]:
# Load the data set: column `arg_number` holds the class label, every other
# column is a feature
pandas_data_from_file = pandas.read_csv('testing.csv')
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
matrix_data = pandas_data_from_file.to_numpy()
single_node_length = len(matrix_data[0])
arg_number = 0

# Only the first 10000 rows are used
sample = matrix_data[:10000]
arguments = np.delete(sample, arg_number, axis=1)
values2 = np.array([node[arg_number] for node in sample])

# Map every distinct class label to a consecutive integer id
unique_values = np.unique(values2)
indices_map = {label: label_id for label_id, label in enumerate(unique_values)}
values = np.array([indices_map[name] for name in values2])

# NORMALIZATION: rescale the feature columns with normalize()
arguments = normalize(arguments)

data_len = len(arguments)

# Reproducible shuffle; the last `test_set_size` shuffled rows form the test set
np.random.seed(0)
indices = np.random.permutation(data_len)
test_percent = 15.0
test_set_size = int(data_len * test_percent / 100.0)

parameters_train = arguments[indices[:-test_set_size]]
values_train = values[indices[:-test_set_size]]
parameters_test = arguments[indices[-test_set_size:]]
values_test = values[indices[-test_set_size:]]

knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(parameters_train, values_train)

predicted_values = knn.predict(parameters_test)
print("\nNUMBER OF DIMENSIONS: {}".format(single_node_length - 1))
# Accuracy: fraction of test rows whose predicted class equals the true class
correct = sum(1 for guess, truth in zip(predicted_values, values_test) if guess == truth)
correct_percent = correct / len(predicted_values)
print(correct_percent)

## NORMALIZACJA - ZBIÓR 34-WYMIAROWY

In [0]:
# Load the data set: column `arg_number` holds the class label, every other
# column is a feature
pandas_data_from_file = pandas.read_csv('dimcurs.data')
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
matrix_data = pandas_data_from_file.to_numpy()
single_node_length = len(matrix_data[0])
arg_number = 4

# Only the first 10000 rows are used
sample = matrix_data[:10000]
# Remove the label column from the features. The previous slicing
# (node[:arg_number] + node[arg_number:]) kept it, which leaked the target
# into the classifier's input.
arguments = np.delete(sample, arg_number, axis=1)
values = np.array([node[arg_number] for node in sample])

# NORMALIZATION: rescale the feature columns with normalize()
arguments = normalize(arguments)

data_len = len(arguments)

# Reproducible shuffle; the last `test_set_size` shuffled rows form the test set
np.random.seed(0)
indices = np.random.permutation(data_len)
test_percent = 15.0
test_set_size = int(data_len * test_percent / 100.0)

parameters_train = arguments[indices[:-test_set_size]]
values_train = values[indices[:-test_set_size]]
parameters_test = arguments[indices[-test_set_size:]]
values_test = values[indices[-test_set_size:]]

knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
knn.fit(parameters_train, values_train)

predicted_values = knn.predict(parameters_test)
print("\nNUMBER OF DIMENSIONS: {}".format(single_node_length - 1))
# Accuracy: fraction of test rows whose predicted class equals the true class
correct = sum(1 for guess, truth in zip(predicted_values, values_test) if guess == truth)
correct_percent = correct / len(predicted_values)
print(correct_percent)

## PRZEGLĄD RÓŻNYCH METRYK LICZENIA ODLEGŁOŚCI

In [0]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [0]:
# Euclidean metric: sqrt(sum((x - y)^2))
# Minkowski metric: sum(|x - y|^p)^(1/p)
# Manhattan metric: sum(|x - y|)
# Chebyshev metric: max(|x - y|)
names = ["k-NN euclidean", "k-NN minkowski", "k-NN manhattan", "k-NN chebyshev"]

In [0]:
# k-NN classification with different distance metrics (same order as `names`);
# the Minkowski variant uses p=3
classifiers = [KNeighborsClassifier(metric='euclidean'),
               KNeighborsClassifier(metric='minkowski', p=3),
               KNeighborsClassifier(metric='manhattan'),
               KNeighborsClassifier(metric='chebyshev')]

In [0]:
# Compare the k-NN variants defined above on the toy datasets
draw_graph(names, classifiers)

## RYSOWANIE HISTOGRAMU ODLEGŁOŚCI DLA IRYSÓW

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas
import seaborn as sns
from sklearn.metrics.pairwise import euclidean_distances


iris_from_file = pandas.read_csv('iris_data')

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
iris_matrix = iris_from_file.to_numpy()
# The first four columns are the features the distances are computed on
iris_file_data = [[flower[0], flower[1], flower[2], flower[3]] for flower in iris_matrix]

# One call produces the whole pairwise distance matrix (instead of one call per sample)
distance_matrix = euclidean_distances(iris_file_data)
# Leave out only the diagonal, i.e. the distance of each sample to itself.
# Filtering on the value `!= 0.` would also discard distinct samples that
# happen to coincide and relies on self-distances being computed as exactly 0.
off_diagonal = ~np.eye(len(iris_file_data), dtype=bool)
distances = distance_matrix[off_diagonal].tolist()

num_bins = 20
sns.set_style("whitegrid")
n, bins, patches = plt.hist(distances, num_bins, facecolor='blue', alpha=0.5)
plt.title('Histogram')
plt.ylabel('Number of elements')
plt.xlabel('Distance')
plt.show()

## RYSOWANIE HISTOGRAMU ODLEGŁOŚCI DLA ZBIORU 147-WYMIAROWEGO

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas
import seaborn as sns
from sklearn.metrics.pairwise import euclidean_distances


data_from_file = pandas.read_csv('testing.csv')

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
data_matrix = data_from_file.to_numpy()
# Drop the first column (the class label) by position. The previous filter
# `feature != elem[0]` compared by value and would also have removed any
# feature that happened to equal the label.
data_file_data = data_matrix[:, 1:].astype(float)

# One call produces the whole pairwise distance matrix (instead of one call per sample)
distance_matrix = euclidean_distances(data_file_data)
# Leave out only the diagonal, i.e. the distance of each sample to itself.
# Filtering on the value `!= 0.` would also discard distinct samples that
# happen to coincide and relies on self-distances being computed as exactly 0.
off_diagonal = ~np.eye(len(data_file_data), dtype=bool)
distances = distance_matrix[off_diagonal].tolist()

num_bins = 40
sns.set_style("whitegrid")
n, bins, patches = plt.hist(distances, num_bins, facecolor='blue', alpha=0.5)
plt.title('Histogram')
plt.ylabel('Number of elements')
plt.xlabel('Distance')
plt.show()