 ## Initialize

In [None]:
!pip install --upgrade numpy
!pip install hdbscan
!pip install gensim
!pip install umap-learn

In [1]:
import gensim # for operations related to word embeddings
import hdbscan # clustering
import umap # dim reduction
import numpy as np
import pandas as pd
import sklearn


import sys; sys.path.insert(0, "../../../data_management/tools/") # To load functions from files in data_management/tools
from textlist_file import write_list, load_list # For saving and loading text lists to/from file


In [2]:
# Define dictionary file paths:
culture_path = "../../Dictionary Mapping/Dictionaries/Culture.csv"
relational_path = "../../Dictionary Mapping/Dictionaries/Relational.csv"
demographic_path = "../../Dictionary Mapping/Dictionaries/Demographic.csv"

# Define model paths
wem_path = "../../../models_storage/word_embeddings_data/word2vec_phrased_filtered_300d_aug14.bin"

## Load word2vec models

In [3]:
model = gensim.models.KeyedVectors.load(wem_path)

## Load and Clean Dictionaries

In [4]:
culture = []
relational = []
demographic = []

model_vocab = set(model.wv.vocab)

# Only include entries that are also in the current model
for item in load_list(culture_path):
    item = item.strip("\n").replace(",", " ")
    if item in model_vocab:
        culture.append(item.strip("\n").replace(",", " "))

for item in load_list(relational_path):
    item = item.strip("\n").replace(",", " ")
    if item in model_vocab:
        relational.append(item.strip("\n").replace(",", " "))

for item in load_list(demographic_path):
    item = item.strip("\n").replace(",", " ")
    if item in model_vocab:
        demographic.append(item.strip("\n").replace(",", " "))

culture = pd.DataFrame(culture, columns=["item"])
relational = pd.DataFrame(relational, columns=["item"])
demographic = pd.DataFrame(demographic, columns=["item"])

perspectives = [culture, relational, demographic]
perspective_names = ["culture", "relational", "demographic"]

print("""
Final dictionary lengths
Culture: {}
Relational: {}
Demographic: {}
""".format(len(culture), len(relational), len(demographic)))


Final dictionary lengths
Culture: 56
Relational: 108
Demographic: 57



In [5]:
# Look up embeddings from model
for perspective in perspectives:
    perspective["wem"] = perspective["item"].apply(lambda item: model.wv[item])

culture.head()

Unnamed: 0,item,wem
0,ambiguity,"[-0.040271938, -0.050914586, -0.08996192, 0.07..."
1,ambiguous,"[-0.04765881, 0.029175187, 0.14534585, 0.24998..."
2,appropriate,"[0.048566714, -0.003296685, -0.0069924053, 0.0..."
3,bureaucratization,"[0.26042837, -0.099836364, 0.009189517, 0.3966..."
4,ceremonially,"[0.112578005, -0.044181783, -0.33859155, 0.091..."


## Compute Cosine Distances
Cosine distance is a more appropriate measure of distance for word embeddings than Euclidian distances.

In [6]:
from sklearn.metrics.pairwise import pairwise_distances

# issue in hdbscan library requires cast to float64: https://github.com/scikit-learn-contrib/hdbscan/issues/71 
distance_matrices = [pairwise_distances(np.stack(p['wem']).astype(np.float64), metric='cosine') for p in perspectives]

Also, distances for combined dictionaries (sanity check on process)

In [7]:
combined = np.vstack([np.stack(p['wem']) for p in perspectives])
# redundant calculations, but these are fast
combined_distance_matrix = pairwise_distances(combined, metric='cosine').astype(np.float64)

## Fit Clusters
Using HDBScan for unsupervised clustering. This algorithm performs well with noisy data and clusters of varying shapes and densities.

In [8]:
clusterer = hdbscan.HDBSCAN(metric='precomputed')
for name, X, df in zip(perspective_names, distance_matrices, perspectives):
    clusterer.fit(X)
    df['label'] = clusterer.labels_
    print("Computed clusters for %s" % name)

Computed clusters for culture
Computed clusters for relational
Computed clusters for demographic


In [9]:
print("Word counts per cluster")
for name, df in zip(perspective_names, perspectives):
    print(name)
    print(df['label'].value_counts())
    print()

Word counts per cluster
culture
-1    41
 1     8
 0     7
Name: label, dtype: int64

relational
-1    81
 1    22
 0     5
Name: label, dtype: int64

demographic
-1    39
 1    10
 0     8
Name: label, dtype: int64



In [10]:
#TO DO: clean this up
i = 0
df = perspectives[i]
print("Clusters for the %s dictionary" % perspective_names[i])
print(df.groupby('label').get_group(0)["item"])
print(df.groupby('label').get_group(1)["item"])
print(df.groupby('label').get_group(-1)["item"])

Clusters for the culture dictionary
13         diffusion
17    homogenization
21         imitation
24     institutional
29       isomorphism
36           mimetic
38           mimicry
Name: item, dtype: object
25     institutionalize
26    institutionalized
30           legitimacy
31           legitimate
32          legitimated
33         legitimating
34         legitimation
45            normative
Name: item, dtype: object
0               ambiguity
1               ambiguous
2             appropriate
3       bureaucratization
4            ceremonially
5                coercion
6                coercive
7              conformist
8              conformity
9               decoupled
10             decoupling
11                diffuse
12               diffused
14                diverse
15              diversity
16            homogeneity
18             homogenize
19            homogenized
20                imitate
22             innovation
23            innovations
27             isomorphic
2

In [11]:
i = 1
df = perspectives[i]
print("Clusters for the %s dictionary" % perspective_names[i])
print(df.groupby('label').get_group(0)["item"])
print(df.groupby('label').get_group(1)["item"])

Clusters for the relational dictionary
1        acquisitions
42    diversification
46       diversifying
84             merger
85            mergers
Name: item, dtype: object
16                  comply
21             constraints
23                 control
27               cooperate
28              cooperated
31             cooperation
38              dependence
39            dependencies
59               influence
60              influenced
61              influences
62             influencing
64         interdependence
65       interdependencies
70              interlocks
71     interorganizational
72                 lobbied
73                lobbying
90                networks
94               pressured
95               pressures
103              sanctions
Name: item, dtype: object


In [12]:
i = 2
df = perspectives[i]
print("Clusters for the %s dictionary" % perspective_names[i])
print(df.groupby('label').get_group(0)["item"])
print(df.groupby('label').get_group(1)["item"])

Clusters for the demographic dictionary
23    institutionalization
24       institutionalized
25              legitimacy
26              legitimate
27             legitimated
28             legitimates
29            legitimating
30            legitimation
Name: item, dtype: object
6         dynamic
8         ecology
10      evolution
16     generalism
17     generalist
18    generalists
32          niche
33         niches
50     specialism
51     specialist
Name: item, dtype: object


In [13]:
clusterer = hdbscan.HDBSCAN(metric='precomputed')
clusterer.fit(combined_distance_matrix)
combined_labels = clusterer.labels_

In [14]:
for p in perspectives:
    dict_len = len(p)
    labels, combined_labels = combined_labels[:dict_len], combined_labels[dict_len:]
    p["combined_label"] = labels

In [15]:
print("Word counts per cluster")
for name, df in zip(perspective_names, perspectives):
    print(name)
    print(df['combined_label'].value_counts())
    print()

Word counts per cluster
culture
-1    43
 1    13
Name: combined_label, dtype: int64

relational
-1    75
 1    20
 2     7
 0     6
Name: combined_label, dtype: int64

demographic
-1    47
 1     9
 2     1
Name: combined_label, dtype: int64



## Clustering on Core Terms

In [30]:
df = pd.read_excel("core_terms.xlsx")
df.head()

Unnamed: 0,Demographic,Unnamed: 1,Relational,Unnamed: 3,Cultural
0,age dependence,,board directors,,ceremonial
1,birth rate,,buffer,,coercion
2,carrying capacity,,coalition,,coercive
3,chance survival,,constrain,,conform
4,competition,,constraint,,conformity


In [31]:
# Only include entries that are also in the current model
demographic_core = []
relational_core = []
cultural_core = []
for item in df["Demographic"]:
    if item in model_vocab:
        demographic_core.append(item)
for item in df["Relational"]:
    if item in model_vocab:
        relational_core.append(item)
for item in df["Cultural"]:
    if item in model_vocab:
        cultural_core.append(item)
        
cultural_core = pd.DataFrame(cultural_core, columns=["item"])
relational_core = pd.DataFrame(relational_core, columns=["item"])
demographic_core = pd.DataFrame(demographic_core, columns=["item"])

perspectives_core = [cultural_core, relational_core, demographic_core]
perspective_names_core = ["cultural_core", "relational_core", "demographic_core"]
        
print("""
Final core list lengths
Culture: {}
Relational: {}
Demographic: {}
""".format(len(cultural_core), len(relational_core), len(demographic_core)))


Final core list lengths
Culture: 31
Relational: 31
Demographic: 21



In [51]:
relational_core["item"].array

<PandasArray>
[         'buffer',       'coalition',       'constrain',      'constraint',
         'control',       'cooperate',     'cooperation',           'coopt',
      'cooptation',      'dependence',       'dependent', 'diversification',
       'diversify',       'dominance',        'exchange',        'external',
      'horizontal',       'influence', 'interdependence',  'interdependent',
       'interlock',    'interlocking',           'merge',          'merged',
          'merger',         'network',         'network',           'power',
        'pressure',        'sanction',        'vertical']
Length: 31, dtype: object

In [32]:
# Look up embeddings from model
for perspective in perspectives_core:
    perspective["wem"] = perspective["item"].apply(lambda item: model.wv[item])

cultural_core.head()

Unnamed: 0,item,wem
0,ceremonial,"[0.017785886, 0.17060874, -0.34285247, 0.24763..."
1,coercion,"[-0.10194491, -0.004256736, -0.10866293, 0.187..."
2,coercive,"[-0.09407789, 0.17365104, -0.0471853, 0.290502..."
3,conform,"[-0.09841556, 0.03873162, -0.0737745, -0.01913..."
4,conformity,"[-0.1256471, 0.09199774, -0.19100225, 0.051814..."


In [33]:
distance_matrices_core = [pairwise_distances(np.stack(p['wem']).astype(np.float64), metric='cosine') for p in perspectives_core]
combined_core = np.vstack([np.stack(p['wem']) for p in perspectives_core])
# redundant calculations, but these are fast
combined_distance_matrix_core = pairwise_distances(combined_core, metric='cosine').astype(np.float64)

In [20]:
# min_cluster_size of 3 is the largest value that generates results
clusterer = hdbscan.HDBSCAN(metric='precomputed', min_cluster_size=3)
for name, X, df in zip(perspective_names_core, distance_matrices_core, perspectives_core):
    clusterer.fit(X)
    df['label'] = clusterer.labels_
    print("Computed clusters for %s" % name)

Computed clusters for cultural_core
Computed clusters for relational_core
Computed clusters for demographic_core


In [21]:
print("Word counts per cluster")
for name, df in zip(perspective_names_core, perspectives_core):
    print(name)
    print(df['label'].value_counts())
    print()

Word counts per cluster
cultural_core
-1    18
 1     6
 2     4
 0     3
Name: label, dtype: int64

relational_core
-1    15
 1     8
 0     5
 2     3
Name: label, dtype: int64

demographic_core
-1    21
Name: label, dtype: int64



In [22]:
df = perspectives_core[0]
print("Clusters for the %s dictionary" % perspective_names_core[0])
print(df.groupby('label').get_group(0)["item"])
print(df.groupby('label').get_group(1)["item"])
print(df.groupby('label').get_group(2)["item"])

Clusters for the cultural_core dictionary
11      imitation
17    isomorphism
21        mimetic
Name: item, dtype: object
14    institutionalize
15    institutionalize
18          legitimacy
19          legitimate
20        legitimation
27         rationalize
Name: item, dtype: object
4     conformity
22          norm
23     normative
24         norms
Name: item, dtype: object


In [23]:
df = perspectives_core[1]
print("Clusters for the %s dictionary" % perspective_names_core[1])
print(df.groupby('label').get_group(0)["item"])
print(df.groupby('label').get_group(1)["item"])
print(df.groupby('label').get_group(2)["item"])

Clusters for the relational_core dictionary
2      constrain
4        control
8     cooptation
17     influence
27         power
Name: item, dtype: object
6         cooperation
14           exchange
18    interdependence
19     interdependent
20          interlock
21       interlocking
25            network
26            network
Name: item, dtype: object
5     cooperate
22        merge
24       merger
Name: item, dtype: object


In [34]:
clusterer = hdbscan.HDBSCAN(metric='precomputed')
clusterer.fit(combined_distance_matrix_core)
combined_labels_core = clusterer.labels_

In [35]:
for p in perspectives_core:
    dict_len = len(p)
    labels, combined_labels = combined_labels_core[:dict_len], combined_labels_core[dict_len:]
    p["combined_label"] = labels

In [44]:
for p, name in zip(perspectives_core, perspective_names_core):
    print("Included in single cluster from %s" % name)
    print(p.groupby('combined_label').get_group(0)["item"])

Included in single cluster from cultural_core
1             coercion
2             coercive
3              conform
4           conformity
9            diffusion
11           imitation
12          innovation
13       institutional
14    institutionalize
15    institutionalize
16          isomorphic
17         isomorphism
18          legitimacy
19          legitimate
20        legitimation
21             mimetic
23           normative
24               norms
27         rationalize
28        rationalized
Name: item, dtype: object
Included in single cluster from relational_core
1           coalition
2           constrain
3          constraint
4             control
9          dependence
11    diversification
12          diversify
13          dominance
14           exchange
15           external
16         horizontal
17          influence
18    interdependence
19     interdependent
20          interlock
21       interlocking
23             merged
24             merger
27              power
28

## Dimensionality Reduction
From hdbscan documentation: "In general HDBSCAN can do well on up to around 50 or 100 dimensional data, but performance can see significant decreases beyond that." 
Our word vectors contain 300 features, 6-7 times more features than dictionary entries for each perspective. With full vectors, HDBScan classifies everything as noise.