In [226]:
import numpy as np
import pandas as pd
import os
from PIL import Image
import tqdm
from global_kmeans_pp import global_clustering
from sklearn.decomposition import FastICA
from scipy.spatial import distance
from time import time
from seaborn import heatmap, color_palette
import warnings
import pickle
from scipy.stats import spearmanr, kendalltau, chi2_contingency

In [9]:
# Theme sub-directories of "themed_npys". os.listdir() returns entries in
# arbitrary order, and the order of themes fixes the order of the feature
# columns built in preprocessing(), so sort to make it deterministic.
themes = sorted(os.listdir("themed_npys"))

In [10]:
# Display the theme directory names that were found.
themes

['Лексика', 'Морфология', 'Морфонология', 'Синтаксис', 'Фонетика']

In [11]:
# Both containers are filled as a side effect of preprocessing():
# features[k] is a name derived from the k-th loaded file name, and
# themes_features[k] is the theme directory that file came from.
features = []
themes_features = {}

In [16]:
def preprocessing():
    """Load all areal masks and build the pixel-by-feature matrix.

    For every directory name in the global ``themes`` the files under
    ``themed_npys/<theme>/`` are loaded in (volume, map, areal) order; the
    numbers are parsed from file names of the form
    ``"Том <volume>, карта <map>, ареал <areal>.npy"``. The loaded arrays are
    stacked along a new last axis, pixels where ``area.png`` equals 0 are
    zeroed, and the first two axes are flattened into one.

    Side effects:
        Clears and refills the globals ``features`` (file names without the
        ``.npy`` extension, in stacking order) and ``themes_features``
        (feature index -> theme), so calling the function again does not
        append duplicate entries.

    Returns:
        tuple: ``(stck, positions, X)`` where ``stck`` is the flattened stack
        with one row per pixel, ``positions`` holds the indices of the rows
        of ``stck`` that contain at least one non-zero value, and ``X`` is
        ``stck[positions]``.
    """
    extension = ".npy"
    volume_prefix = "Том"

    def strip_extension(file_name):
        # str.rstrip(".npy") removes any trailing '.', 'n', 'p', 'y'
        # characters rather than the extension, so cut the suffix explicitly.
        if file_name.endswith(extension):
            return file_name[:-len(extension)]
        return file_name

    def sort_key(file_name):
        # (volume, map, areal) parsed from the file name.
        stem = strip_extension(file_name)
        volume = stem.split(",")[0]
        if volume.startswith(volume_prefix):
            volume = volume[len(volume_prefix):]
        return (int(volume),
                float(stem.split("карта ")[1].split(",")[0]),
                float(stem.split("ареал ")[1]))

    # Start from a clean slate so that re-running the cell is idempotent.
    features.clear()
    themes_features.clear()

    layers = []
    for theme in themes:
        for file_name in sorted(os.listdir(f"themed_npys/{theme}"), key=sort_key):
            themes_features[len(features)] = theme
            features.append(strip_extension(file_name))
            with open(f"themed_npys/{theme}/{file_name}", 'rb') as f:
                layers.append(np.load(f))

    stck = np.stack(layers, axis=2)

    # Zero out every feature at the pixels where the area mask is 0.
    total_area = np.array(Image.open('area.png'))
    zero_indices = np.where(total_area == 0)

    zeros = np.zeros(stck.shape[2])
    stck[zero_indices] = zeros

    # One row per pixel, one column per feature.
    stck = stck.reshape(stck.shape[0] * stck.shape[1], -1)

    # Keep only the pixels that carry at least one non-zero feature.
    positions = np.where(~np.all(stck == zeros, axis=1))[0]

    X = stck[positions]

    return stck, positions, X

In [17]:
# stck: flattened per-pixel stack, positions: indices of its non-empty rows,
# X: those non-empty rows (the data matrix used by everything below).
stck, positions, X = preprocessing()

In [21]:
# Cast the data matrix to integers (later cells sum its columns as counts).
X = X.astype("int")

Градиент

In [22]:
# Reduce every pixel's feature vector to 3 independent components, used below
# as the R, G and B channels. FastICA starts from a random unmixing matrix, so
# a fixed random_state is needed for the colours to be reproducible.
ica = FastICA(n_components=3, random_state=0)
ica.fit(X)
data = ica.transform(X)

In [23]:
# Min-max scale each component column to the 0..255 range and truncate to
# integers so the columns can serve as colour channels.
component_min = data.min(axis=0)
component_range = data.max(axis=0) - component_min
scaled_data = (data - component_min) / component_range * 255
rgb = scaled_data.astype("int")

In [24]:
# Append a constant fourth column of 255 (fully opaque alpha) to the colours.
alpha = np.full((X.shape[0], 1), 255)
rgb = np.concatenate((rgb, alpha), axis=1)

In [25]:
# Write the colours back to their pixel rows; rows not listed in `positions`
# stay all-zero. Then restore the 984 x 969 image grid and save it.
new_image = np.zeros((stck.shape[0], 4))
new_image[positions] = rgb
res = Image.fromarray(new_image.reshape((984, 969, 4)).astype(np.uint8))
res.save("gradient.png")

Кластеризация

In [211]:
# Number of clusters requested from the model; also the upper bound of the
# per-k image loops below.
n_clusters = 36

In [27]:
# Fit the global k-means++ model on X and print the elapsed wall-clock seconds.
# The recorded run below took about 11520 s, so re-run this cell deliberately.
s = time()
model = global_clustering.GlobalKMeansPP(n_clusters=n_clusters, verbose=2)
model.fit(X)
print(time() - s)

Solving 2-means
Solving 3-means
Solving 4-means
Solving 5-means
Solving 6-means
Solving 7-means
Solving 8-means
Solving 9-means
Solving 10-means
Solving 11-means
Solving 12-means
Solving 13-means
Solving 14-means
Solving 15-means
Solving 16-means
Solving 17-means
Solving 18-means
Solving 19-means
Solving 20-means
Solving 21-means
Solving 22-means
Solving 23-means
Solving 24-means
Solving 25-means
Solving 26-means
Solving 27-means
Solving 28-means
Solving 29-means
Solving 30-means
Solving 31-means
Solving 32-means
Solving 33-means
Solving 34-means
Solving 35-means
Solving 36-means
11520.10983133316


In [28]:
# Keep the fitted results under plain names so they can be pickled.
# NOTE(review): later cells index clusters[k] / cluster_centers[k] by the
# number of clusters k and call cluster_centers.values(), so these look like
# mappings keyed by k - confirm against GlobalKMeansPP.
clusters = model.labels_
cluster_centers = model.cluster_centers_
inertia = model.inertia_

In [29]:
# Persist the clustering results so the long fit does not have to be repeated.
for pickle_path, payload in (('clusters.pkl', clusters),
                             ('cluster_centers.pkl', cluster_centers),
                             ('inertia.pkl', inertia)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [10]:
# Reload the clustering results written by the cell above.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this notebook, never files from an untrusted source.
with open('clusters.pkl', 'rb') as f:
    clusters = pickle.load(f)

with open('cluster_centers.pkl', 'rb') as f:
    cluster_centers = pickle.load(f)

with open('inertia.pkl', 'rb') as f:
    inertia = pickle.load(f)

In [31]:
# Suppress every UserWarning from here on.
warnings.simplefilter("ignore", UserWarning)

In [32]:
# Stack the centre arrays of all stored solutions into one array, in the
# iteration order of cluster_centers.values().
# NOTE(review): the slicing in the per-k image loop assumes this order is
# k = 1, 2, 3, ... with k rows each - confirm the mapping is ordered that way.
all_cluster_centers = np.concatenate(list(cluster_centers.values()))

In [33]:
# Sanity check: (total number of centres, number of features).
all_cluster_centers.shape

(666, 446)

In [34]:
# Project the cluster centres with the already fitted ICA model.
# Note: this rebinds `data`, which previously held the projection of X.
data = ica.transform(all_cluster_centers)

In [35]:
# Min-max scale each component column of the projected centres to 0..255 and
# truncate to integers to obtain colour channels.
component_min = data.min(axis=0)
component_range = data.max(axis=0) - component_min
scaled_data = (data - component_min) / component_range * 255
rgb = scaled_data.astype("int")

In [36]:
# Append a constant fourth column of 255 (fully opaque alpha) to the colours.
alpha = np.full((all_cluster_centers.shape[0], 1), 255)
rgb = np.concatenate((rgb, alpha), axis=1)

In [37]:
# Render one image per solution: every pixel gets the colour of the centre of
# the cluster it belongs to in the i-cluster solution.
for i in tqdm.tqdm(range(1, n_clusters + 1)):
    nth_clusters = clusters[i]
    # `rgb` holds the colours of the 1-, 2-, ..., k-cluster solutions back to
    # back, so the i-cluster block starts after 1 + 2 + ... + (i - 1) rows.
    offset = i * (i - 1) // 2
    nth_rgb = rgb[offset:offset + i]

    # One fancy-indexing lookup instead of a Python-level loop over all
    # pixels (the old comprehension also shadowed the loop variable `i`).
    nth_clusters_rgb = nth_rgb[np.asarray(nth_clusters)]

    new_image = np.zeros((stck.shape[0], 4))
    new_image[positions] = nth_clusters_rgb
    res = new_image.reshape((984, 969, 4))
    res = Image.fromarray(res.astype(np.uint8))
    res.save(f"{i}_clusters.png")

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:18<00:00,  1.94it/s]


Близость кластеров

In [212]:
# Select the labels and centres of the solution with n_clusters clusters.
nth_clusters = clusters[n_clusters]
nth_cluster_centers = cluster_centers[n_clusters]

In [213]:
# For every cluster i, colour each pixel by how far the centre of the pixel's
# own cluster lies from the centre of cluster i.
# The colour map and the array views do not depend on i, so build them once.
cmap = color_palette("inferno_r", as_cmap=True)
centres = np.asarray(nth_cluster_centers)
labels = np.asarray(nth_clusters)

for i in tqdm.tqdm(range(n_clusters)):
    # Euclidean distance from centre i to every centre, computed row-wise.
    distances = np.linalg.norm(centres - centres[i], axis=1)
    norm_distances = (distances - distances.min()) / (distances.max() - distances.min())
    heat_rgb = cmap(norm_distances) * 255

    # Per-pixel colour lookup by cluster label (vectorised; the previous list
    # comprehension shadowed the loop variable `i`).
    heat_clusters_rgb = heat_rgb[labels]

    new_image = np.zeros((stck.shape[0], 4))
    new_image[positions] = heat_clusters_rgb
    res = new_image.reshape((984, 969, 4))
    res = Image.fromarray(res.astype(np.uint8))
    res.save(f"proximity_to_{i}_out_of_{n_clusters}.png")

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:12<00:00,  2.87it/s]


Близость к прототипам

In [214]:
# Euclidean distance of every row of X to the centre of its own cluster,
# min-max normalised and mapped through the reversed "inferno" palette.
distances = np.array([
    distance.euclidean(X[row], nth_cluster_centers[nth_clusters[row]])
    for row in range(X.shape[0])
])
lowest, highest = distances.min(), distances.max()
norm_distances = (distances - lowest) / (highest - lowest)
cmap = color_palette("inferno_r", as_cmap=True)
heat_rgb = cmap(norm_distances) * 255

In [215]:
# Write the heat colours back to their pixel rows (other rows stay all-zero),
# restore the 984 x 969 image grid and save it.
new_image = np.zeros((stck.shape[0], 4))
new_image[positions] = heat_rgb
res = Image.fromarray(new_image.reshape((984, 969, 4)).astype(np.uint8))
res.save("proximity_to_centres_of_clusters.png")

Границы кластеров

In [276]:
# Use the 25-cluster solution for the border map (rebinds n_clusters).
n_clusters = 25

In [277]:
# Labels of the solution with n_clusters clusters.
nth_clusters = clusters[n_clusters]

In [278]:
# Build a 984 x 969 grid of cluster labels shifted by one, so that 0 is left
# free to mean "no data at this pixel".
label_column = np.zeros(stck.shape[0])
label_column[positions] = nth_clusters + 1
arr = label_column.reshape(984, 969)

In [279]:
def _cluster_borders(label_map):
    """Return a boolean mask of the pixels lying on a border between clusters.

    A pixel is a border pixel when its own label is non-zero and at least one
    of its four direct neighbours carries a different non-zero label (0 means
    "no data"). The map is padded with zeros instead of rolled, so pixels on
    opposite edges of the image are never treated as neighbours.
    """
    padded = np.pad(label_map, 1, mode="constant", constant_values=0)
    centre = padded[1:-1, 1:-1]
    neighbours = (
        padded[1:-1, :-2],  # left
        padded[1:-1, 2:],   # right
        padded[:-2, 1:-1],  # up
        padded[2:, 1:-1],   # down
    )

    differs = np.zeros(label_map.shape, dtype=bool)
    for neighbour in neighbours:
        differs |= (neighbour != centre) & (neighbour != 0)
    return differs & (centre != 0)


bool_arr = _cluster_borders(arr)

In [280]:
# Border pixels become (255, 255, 255, 255); every other pixel stays all-zero.
border_rgba = np.zeros((*bool_arr.shape, 4))
border_rgba[bool_arr] = 255

res = Image.fromarray(border_rgba.astype(np.uint8))
res.save("borders.png")

Важные признаки для кластеров

In [221]:
# One entry per line of the file, with newline characters trimmed from both ends.
with open("areals_data.txt", "r", encoding="UTF-8") as file:
    areals_data = [line.strip("\n") for line in file]

In [265]:
# Number of clusters of the solution analysed in the next cell.
I = 25

In [267]:
# For every cluster of the I-cluster solution: compute the intersection over
# union (IoU) between the cluster and each areal, print the ten best matching
# areals, and save a map of the IoU-weighted feature sum per pixel.
nth_clusters = clusters[I]
nth_cluster_centers = cluster_centers[I]

info = np.zeros((I, X.shape[-1]))

# Loop invariants: per-areal pixel counts and the colour map.
areal_sizes = X.sum(axis=0)  # areal sizes
cmap = color_palette("inferno", as_cmap=True)

for n in range(I):
    wh = np.where(nth_clusters == n)
    cluster_size = len(wh[0])  # cluster size (scalar, broadcast below)
    print(cluster_size)
    intersection = X[wh].sum(axis=0)  # intersection sizes
    union = areal_sizes + cluster_size - intersection  # union sizes

    IoU = intersection / union
    info[n] = IoU

    top = sorted(enumerate(IoU), reverse=True, key=lambda x: x[1])
    for index, t in top[:10]:
        print(features[index], areals_data[index], t)
    print()

    # Per-pixel score: sum of the IoU weights of the features present there,
    # min-max normalised to 0..1 for the colour map.
    distances = (X * IoU).sum(axis=1)
    distances = (distances - distances.min()) / (distances.max() - distances.min())

    heat_rgb = cmap(distances) * 255

    new_image = np.zeros((stck.shape[0], 4))
    new_image[positions] = heat_rgb
    res = new_image.reshape((984, 969, 4))
    res = Image.fromarray(res.astype(np.uint8))
    # NOTE(review): the file name is keyed by cluster size, so two clusters
    # of equal size would overwrite each other's image.
    res.save(f"proximity_{I}_{cluster_size}.png")

13384
Том 3, карта 7.2, ареал 5 [jе́т]ый ([jе]ный) Волог. Вытегор. 0.44039681902160815
Том 1, карта 11, ареал 8 употребление конструкций типа за водой 0.31258870281010503
Том 2, карта 18.1, ареал 2 пранник 0.2894927122578681
Том 3, карта 2.1, ареал 2 ры, лы, ли 0.276055524616876
Том 2, карта 14, ареал 3 бел. дзяжа́, дзёжка, дёжка (неполнота материала, см. комментарий) 0.27604413736207073
Том 1, карта 3, ареал 11 /и/: п[и], б[и] и т.п. или л\п[и], б[и] и т.п. и п[и]й, б[и]й и т.п. 0.2639467533787217
Том 1, карта 12, ареал 4 паха́ть в значении ‘бороновать после сева' 0.23936685689673737
Том 3, карта 12, ареал 1 употребляются формы вин.-род. п. типа бел. кароў, валоў, укр. коней, корiў, гостей 0.21786552533480602
Том 3, карта 7.1, ареал 3 тэй, тый 0.21777487733546033
Том 1, карта 3, ареал 5 /е/: молод[е́]й, глу[х'е́]й, глух[е́]й, и т.п. 0.21112132665917527

18126
Том 4, карта 7.3, ареал 11 рус. тепи́нка, цепи́нка; 0.5487286683572065
Том 4, карта 7.2, ареал 4 рус. держа́лень, держа́лка, де

Близость к набору признаков

In [227]:
# Feature vector of the single pixel at row 649, column 618 of the
# 984 x 969 grid.
Rog_cart = stck.reshape((984, 969, stck.shape[-1]))[649, 618]

In [228]:
# Hand-entered 0/1 feature vector, one character per feature, converted to an
# integer array. NOTE(review): presumably aligned with the feature order of
# X / Rog_cart - confirm length and order.
Rog_corp = np.array(list("01000000000000001000001000000110000000000000000100000000000000000000000000000000000000000000000000010000000100000000000000010000000000001100010000010000100011111111101000000000000110000010000000000100000000000100010011000000000000001001000010000011101001001010001000110000010000000001010000000001000000000001001000000110001000000000010001000100000000000010000000010000010000000011001000000000000100000001000000100100100000010000010000001000010001")).astype("int")

In [259]:
# For every row of X: Euclidean distance to Rog_cart (column 0) and to
# Rog_corp (column 1).
distances = np.array([
    [distance.euclidean(X[row], Rog_cart), distance.euclidean(X[row], Rog_corp)]
    for row in range(X.shape[0])
])

In [260]:
# Min-max normalise using the global minimum and maximum over BOTH columns,
# so the two resulting maps share a single scale.
norm_distances = (distances - distances.min()) / (distances.max() - distances.min())

In [261]:
# One heat map per reference vector: column 0 -> "dot", column 1 -> "corp".
cmap = color_palette("inferno_r", as_cmap=True)
for ind, comp in enumerate(("dot", "corp")):
    heat_rgb = cmap(norm_distances[:, ind]) * 255

    new_image = np.zeros((stck.shape[0], 4))
    new_image[positions] = heat_rgb
    res = Image.fromarray(new_image.reshape((984, 969, 4)).astype(np.uint8))
    res.save(f"proximity_to_{comp}.png")

In [189]:
# Spearman rank correlation between the two feature vectors.
spearmanr(Rog_cart, Rog_corp)

SignificanceResult(statistic=0.6578012169312458, pvalue=1.2725211865789325e-56)

In [190]:
# Kendall's tau between the two feature vectors.
kendalltau(Rog_cart, Rog_corp)

SignificanceResult(statistic=0.6578012169312457, pvalue=8.81405271273025e-44)

In [193]:
# Cross-tabulate the values of the two feature vectors.
contingency_table = pd.crosstab(Rog_cart, Rog_corp)

In [194]:
# Chi-square test of independence on the contingency table.
chi2, p, dof, expected = chi2_contingency(contingency_table)

In [195]:
# Effect size sqrt(chi2 / n), where n is the number of cross-tabulated
# observations, i.e. the length of the vectors passed to pd.crosstab.
# (`Bud_cart` was never defined in this notebook; the table was built from
# Rog_cart and Rog_corp.)
np.sqrt(chi2 / Rog_cart.shape[0])

0.6498246170739501