## Loading dataset

In [None]:
from pkg.datasets import thin_sections

# Load the thin-sections dataset; later cells read its `data`, `target`,
# `feature_names` and `target_names` attributes.
# NOTE(review): `reduced_data=True` presumably selects a reduced variant of the
# dataset — confirm against `pkg.datasets.thin_sections`.
dataset = thin_sections(reduced_data=True)

## Weighting

In [None]:
import copy

# Multiplicative weights to apply to selected feature columns.
weights = [
    {'feature': 'Porosity', 'weight': 10},
    {'feature': 'Main/single size mode(mm):', 'weight': 10},
]

# Work on a deep copy so the original `dataset` is left untouched.
weightened_dataset = copy.deepcopy(dataset)

for weight in weights:
    # Column position of the feature being weighted.
    index_weight = dataset.feature_names.index(weight['feature'])
    factor = weight['weight']

    # Scale that column in place, one sample row at a time.
    for row in weightened_dataset.data:
        row[index_weight] = row[index_weight] * factor

    

## Analysis Per Feature

In [None]:
from matplotlib import pyplot as plt

# %matplotlib inline
# Draw one figure per feature: a box plot with one box per target class, so the
# distribution of each feature can be compared across classes.
#
# enumerate() supplies the column index directly. Looking it up again with
# feature_names.index(feature_name) costs an extra scan per feature and always
# returns the first match, which would plot the wrong column if two features
# shared the same name.
for feature_index, feature_name in enumerate(dataset.feature_names):
    # One bucket of values per class; `dataset.target[i]` is used as the
    # bucket index, so targets must be integer positions into `target_names`.
    values_by_class = [[] for _ in dataset.target_names]

    for i, point in enumerate(dataset.data):
        values_by_class[dataset.target[i]].append(point[feature_index])

    plt.figure(figsize=(12, 6))
    plt.title(feature_name)
    plt.boxplot(values_by_class, labels=dataset.target_names)
    plt.show()

## Plotting results
Hierarchical-clustering dendrograms are plotted from the dataset for each combination of linkage method and distance metric.

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram
from matplotlib import pyplot as plt

# Candidate distance metrics and linkage methods for the SciPy comparison loop
# that is commented out further down.
distances = [
    'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
    'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski',
    'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
    'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule',
]
methods = [
    'single', 'complete', 'average', 'weighted',
    'centroid', 'median', 'ward',
]

# One class name per sample; the commented-out dendrogram calls pass this
# as `labels=`.
labels = [
    dataset.target_names[class_index]
    for class_index in dataset.target
]

# NOTE(review): the loop below is disabled. Before re-enabling it:
#   * `weightened_thin_sections` is not defined in the visible cells; the
#     weighted copy built earlier is named `weightened_dataset`.
#   * 'euclidian' never equals the 'euclidean' entry in `distances`, so the
#     guard would skip every centroid/median/ward combination.
# for distance in distances:
#     for method in methods:
#         if (method == 'centroid' or method == 'median' or method == 'ward') and distance != 'euclidian':
#             continue

#         fig = plt.figure(figsize=(18, 6))
#         plt.subplot(1, 2, 1)
#         Z = linkage(weightened_thin_sections.data, method=method, metric=distance)
#         plt.title('WEIGHTENED Method: ' + method + '; Distance: ' + distance)
#         dendrogram(Z, labels=labels, leaf_font_size=12)
#         plt.subplot(1, 2, 2)
#         Z = linkage(dataset.data, method=method, metric=distance)
#         plt.title('Method: ' + method + '; Distance: ' + distance)
#         dendrogram(Z, labels=labels, leaf_font_size=12)
#         plt.show()

### R section

In [None]:
# Values handed to the R cell below through `%%R -i data,data_labels`.
data = dataset.data
# One class name per sample; the R cell assigns these as the row names of `data`.
# NOTE(review): indexing `target_names` with the whole `target` sequence assumes
# `target_names` is a NumPy array (a plain list would raise TypeError) — confirm.
data_labels = dataset.target_names[dataset.target]

In [None]:
# Load rpy2's IPython extension so the `%%R` cell magic used below is available.
%load_ext rpy2.ipython

In [None]:
%%R -i data,data_labels -w 1000 -h 600

library(dendextend)

# Label every observation (row of `data`) with its class name.
rownames(data) <- data_labels

# Distance measures and agglomeration methods to compare. The full candidate
# sets are kept as comments; only a subset is currently enabled.
# DISTANCE=c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski")
DISTANCE <- c("euclidean", "manhattan")
# METHOD=c("single", "complete", "average", "mcquitty", "ward.D", "ward.D2", "centroid", "median")
METHOD <- c("complete")

# One row per (distance, method) combination.
scenarios <- expand.grid(DISTANCE = DISTANCE, METHOD = METHOD)

# Plot the hierarchical-clustering dendrogram of the global `data` for one
# scenario.
#   distance: distance measure forwarded to dist()
#   method:   agglomeration method forwarded to hclust()
# Branches and leaf labels are coloured with k = 10; the plot title is
# "<distance> X <method>".
clustering <- function(distance, method) {
    pairwise <- dist(data, method = distance)
    tree <- hclust(pairwise, method = method, members = NULL)

    dend <- as.dendrogram(tree)
    dend <- color_branches(dend, k = 10)
    dend <- color_labels(dend, k = 10)

    plot(dend)
    title(main = paste(distance, method, sep = " X "),
          ylab = "distance", xlab = "label")
}

# Draw one dendrogram per scenario row.
mapply(clustering, scenarios$DISTANCE, scenarios$METHOD)
print("END")