# Clustering

In this notebook, we assign a burstiness score to each term in the papers dataset, 2006-2020, then select the 1000 burstiest terms and cluster them. These clusters are then copied into the cluster_choice.xlsx spreadsheet.



In [1]:
import os
import random
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from scipy.spatial.distance import squareform
from scipy.cluster import hierarchy
import pickle
import time
import csv
import sys
sys.path.append("../../tools")

import burst_detection
import tools
import my_parameters
import logletlab

import my_stopwords3
from matplotlib.ticker import FormatStrFormatter
from matplotlib.ticker import StrMethodFormatter, NullFormatter
import matplotlib.ticker as mticker
from tqdm import tqdm

from IPython.display import HTML, display
import tabulate

def reverse_cumsum(ls):
    '''
    Undo a cumulative sum: turn a cumulative series into per-step increments.

    Element 0 of the result is ls[0]; element i (i >= 1) is ls[i] - ls[i-1].
    Afterwards, if the first increment is larger than the second one, it is
    replaced by the second.
    NOTE(review): presumably this clamp stops the first point from carrying
    everything accumulated before the series starts -- confirm the intent.

    ls: positionally indexable sequence (e.g. a numpy array).
    Returns: an array created with np.zeros_like(ls), so it has the same
             shape and dtype as ls.

    Inputs with fewer than two elements are handled explicitly: an empty
    input yields an empty result and a single element is returned as-is
    (there is no second increment to clamp against).
    '''
    reverse = np.zeros_like(ls)
    n = len(ls)
    if n == 0:
        return reverse

    reverse[0] = ls[0]
    for i in range(1, n):
        reverse[i] = ls[i]-ls[i-1]

    # Only clamp when a second increment exists; indexing reverse[1]
    # unconditionally raised IndexError for one-element inputs.
    if n > 1 and reverse[0]>reverse[1]:
        reverse[0]=reverse[1]

    return reverse

def detransform_fit(ypc, F):
    '''
    Convert a fitted curve back into raw (non-cumulative) frequency.

    The Gompertz and Logistic curves model *cumulative* frequency over time and
    were fitted to series scaled so that the minimum frequency was zero and the
    maximum was one (which keeps errors comparable between rare and frequent
    terms). This function reverses both steps: it rescales F to the range of
    ypc and then differences the result with reverse_cumsum, because raw
    frequency is more intuitive to plot.

    ypc: the cumulative series whose min and max define the original scale.
    F:   the fitted curve on the scaled axis.
    Returns: the per-step series produced by reverse_cumsum.
    '''
    lowest = min(ypc)
    span = max(ypc) - lowest
    rescaled = F*span + lowest
    return reverse_cumsum(rescaled)

# Start from the project stopword collection and extend it with a few extra
# words that this notebook also treats as stopwords.
stop = my_stopwords3.get_stopwords()
for extra_word in ('using', 'use', 'uses', 'used',
                   'model', 'method', 'approach', 'based'):
    stop.add(extra_word)

In [2]:
years = list(range(2006, 2021))

# NOTE: pickle.load can execute arbitrary code while loading; only use it on
# files produced by this project. The with-blocks make sure each file handle is
# closed once its object has been loaded.
with open("../vocabulary.p", "rb") as f:
    vocabulary = pickle.load(f)
with open("../stacked_vectors/semantic_scholar.p", "rb") as f:
    stacked_vectors = pickle.load(f)
with open("../stacked_vectors/semantic_scholar_document_count.p", "rb") as f:
    document_count_per_year = pickle.load(f)

# Divide every row of term counts by the matching entry of the 'documents'
# column (aligned on the index), then keep only the rows for the study years.
prevalence = stacked_vectors.divide(document_count_per_year['documents'], axis=0).loc[years]
parameters = my_parameters.set_parameters()

print('significance threshold:', parameters['significance_threshold'])
print('years above significance:', parameters['years_above_significance'])
print('significance ma length:', parameters['significance_ma_length'])


significance threshold: 0.0015
years above significance: 3
significance ma length: 3


### Apply burst detection

In [None]:
bd_dataset = burst_detection.Dataset(
    name="semantic_scholar",
    years=years,
    stacked_vectors=prevalence,
)

# Keep only the terms that pass the significance filter, and report how many remain.
bd_dataset.get_sig_stacked_vectors(parameters["significance_threshold"], parameters["years_above_significance"])
print(bd_dataset.sig_stacked_vectors.shape)

# Compute burstiness once.
# NOTE(review): this assumes get_burstiness only depends on its arguments and
# the dataset built above, so a second identical call would add nothing --
# confirm against burst_detection.Dataset.
bd_dataset.get_burstiness(parameters["short_ma_length"], parameters["long_ma_length"], parameters["significance_ma_length"], parameters["signal_line_ma"])

# The 1000 burstiest terms are the ones that get clustered below.
bursts = tools.get_top_n_bursts(bd_dataset.burstiness, 1000)

### Calculate co-occurrence of bursts

There are two ways to do this:
1. Calculate based on co-occurrence of bursty terms only.
2. Calculate based on co-occurrence of all terms.

In this case, I think the correct choice is option 1: we want tight clusters of terms that are highly correlated with each other. The clustering also has a manual step, because my domain knowledge of this field lets me collapse trivial clusters into each other.

In [38]:
# One vectorizer is enough for every year: its configuration does not depend on
# the year, and the vocabulary is fixed to the bursty terms.
#  * binary=True records presence/absence (every non-zero count becomes 1), so
#    no post-hoc clipping of the sparse matrix is needed.
#  * stop_words is passed as a list because current scikit-learn validates this
#    parameter as 'english', a list or None; `stop` is extended with .add()
#    earlier in the notebook, i.e. it is set-like.
vectorizer = CountVectorizer(strip_accents='unicode',
                             ngram_range=(1,4),
                             vocabulary=bursts,
                             stop_words=list(stop),
                             binary=True
                            )

# vectors[k] is the document-by-term presence matrix for years[k]
vectors = []
for year in years:
    t0 = time.time()
    with open("../../Data/semantic_scholar_cleaned_langdetect/"+str(year)+".txt", "r") as f:
        # one document per line
        documents = [d.strip() for d in f]

    vector = vectorizer.fit_transform(documents)

    # free the raw text before moving on to the next year
    del documents

    vectors.append(vector)

    del vector

    # per-year wall-clock time in seconds
    print(year, time.time()-t0)
    


2006 42.46871376037598
2007 48.72326636314392
2008 46.577659130096436
2009 52.749733448028564
2010 55.083566665649414
2011 61.355138063430786
2012 65.35193872451782
2013 69.37733364105225
2014 73.18735694885254
2015 77.63297581672668
2016 100.17408037185669
2017 126.26783609390259
2018 97.63837885856628
2019 115.65487456321716
2020 106.81793999671936


In [39]:
if not vectors:
    raise ValueError("vectors is empty: run the per-year vectorisation cell first")

# Accumulate the term-by-term co-occurrence counts over all years.
cooccurrence = None
for v in vectors:
    # v is a documents x terms presence matrix, so v.T*v (a matrix product for
    # scipy sparse matrices) counts, for every pair of terms, the documents
    # that contain both.
    c = v.T*v
    # a term trivially co-occurs with itself; zero that out
    c.setdiag(0)
    c = c.todense()
    if cooccurrence is None:
        cooccurrence = c
    else:
        cooccurrence += c

# Persist the matrix together with the vocabulary that labels its rows/columns.
# The with-blocks guarantee the files are flushed and closed.
with open('semantic_scholar_cooccurrence_matrix_2006.p', "wb") as f:
    pickle.dump(cooccurrence, f)
with open('semantic_scholar_cooccurrence_vocabulary_2006.p', "wb") as f:
    pickle.dump(bursts, f)



  self._set_arrayXarray(i, j, x)


In [4]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('semantic_scholar_cooccurrence_matrix_2006.p', "rb") as f:
    cooccurrence = pickle.load(f)
with open('semantic_scholar_cooccurrence_vocabulary_2006.p', "rb") as f:
    bursts = pickle.load(f)


# Translate co-occurrence into a distance: the more often two terms co-occur,
# the smaller the distance (the most frequent pair gets distance zero).
log_cooccurrence = np.log(cooccurrence+1)
dists = log_cooccurrence.max() - log_cooccurrence

# Remove the diagonal (squareform requires diagonals be zero)
dists -= np.diag(np.diagonal(dists))

# Put the distance matrix into the format required by hierarchy.linkage
flat_dists = squareform(dists)

# Get the linkage matrix
linkage_matrix = hierarchy.linkage(flat_dists, "ward")

# Cut the dendrogram at a cophenetic distance of 7
assignments = hierarchy.fcluster(linkage_matrix, 7, 'distance')

# cluster id -> list of terms assigned to it
clusters = defaultdict(list)
for term, assign in zip(bursts, assignments):
    clusters[assign].append(term)

# Counters must start at zero here; otherwise the cell only works when they
# happen to be left over in the kernel from an earlier run.
total = 0   # number of terms across all clusters
tally = 0   # number of clusters that are printed

for key in sorted(clusters.keys()):
    terms = clusters[key]
    total += len(terms)
    n2006 = [stacked_vectors[t][2006] for t in terms]
    peak = [prevalence[t].idxmax() for t in terms]

    if min(n2006) > 20:
        # Skip clusters in which every term already had a 2006 value above 20
        continue
    if max(peak) < 2008:
        # Skip clusters in which every term peaked before 2008
        continue

    tally += 1
    # One line per cluster: terms | 2006 values | peak years
    print( ', '.join(terms)+'|'+
          ', '.join([str(t) for t in n2006])+'|'+
          ', '.join([str(t) for t in peak]))
print(total, len(clusters), tally)

github, github com, http github com, http github, com|0, 0, 0, 0, 454|2020, 2020, 2020, 2020, 2020
availability implementation, contact, edu, gene, protein, sequencing, sequence|6, 1033, 412, 2139, 1889, 335, 7874|2016, 2017, 2011, 2007, 2006, 2016, 2006
web, linked data, linked, rdf, ontology, semantic, semantic web|9139, 17, 787, 282, 2816, 4644, 1208|2006, 2014, 2014, 2014, 2008, 2008, 2006
knowledge graph|9|2020
embeddings, embedding, word embeddings|106, 1025, 0|2020, 2020, 2019
learn, representation learning, learns, learned, jointly, feature representation|1857, 14, 387, 1339, 743, 75|2020, 2020, 2020, 2020, 2020, 2020
lstm, long short term, long short, long short term memory, short term memory, term memory, short term, memory lstm, term memory lstm, short term memory lstm, short, long|5, 12, 37, 7, 39, 98, 470, 3, 3, 3, 3532, 4570|2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020
achieves state, achieves state ofdashthe, achieves state ofdashthe art|7, 6, 6

In [9]:
# Hand-picked clusters. Each inner list holds the terms that are treated
# together as one cluster.
new_clusters = [
    ['knowledge graph'],
    ['word embeddings'],
    ['lstm', 'long short term memory'],
    ['semantic segmentation'],
    ['deep neural', 'dnn'],
    ['generative adversarial', 'gan'],
    ['autoencoder'],
    ['compressive sensing', 'compressed sensing'],
    ['sdn', 'software defined networking'],
    ['big data'],
    ['mapreduce', 'hadoop'],
    ['cloud computing', 'cloud service', 'cloud environment'],
    ['blockchain'],
    ['smart grid', 'smart city'],
    ['cyber physical'],
    ['internet thing', 'internet ofdashthings', 'iot'],
    ['kinect'],
    ['crowdsourcing'],
    ['facebook', 'twitter', 'social medium', 'tweet'],
    ['energy harvesting'],
    ['device todashdevice', 'd2d'],
    ['massive mimo'],
    ['noma', 'orthogonal multiple'],
    ['edge computing'],
    ['5g', 'fifth generation'],
    ['deep reinforcement learning'],
]

In [11]:
# One output line per cluster, with three '|'-separated fields that each list
# one value per term: largest value in stacked_vectors | burstiness 'max'
# (times 10, rounded to 2 decimals) | value for 2006.
for cluster in new_clusters:
    max_freq = []
    burstiness = []
    n2006 = []
    for t in cluster:
        series = stacked_vectors[t]
        max_freq.append(series.max())
        n2006.append(series[2006])
        burstiness.append(np.round(10*bd_dataset.burstiness['max'][t],2))

    fields = (max_freq, burstiness, n2006)
    print('|'.join(', '.join(str(f) for f in field) for field in fields))


1457|0.54|9
1050|0.6|0
3264, 2717|1.04, 0.9|5, 7
1617|0.79|5
8471, 2263|1.06, 0.62|0, 6
2918, 2137|1.15, 0.83|0, 41
1999|0.6|0
598, 654|0.47, 0.51|3, 13
1658, 977|1.08, 0.8|4, 0
4939|1.72|5
887, 772|0.66, 0.58|4, 1
3617, 1247, 799|1.9, 0.98, 0.71|7, 1, 2
3508|1.35|0
1220, 1799|1.2, 0.57|2, 2
1844|0.17|1
7812, 867, 9343|0.87, 0.33, 1.16|10, 0, 5
796|1.05|0
1066|0.75|0
1213, 2142, 3595, 1346|0.67, 1.04, 0.76, 0.81|5, 1, 5, 1
1199|0.35|18
785, 818|0.74, 0.83|3, 4
1094|0.64|0
1061, 1029|0.59, 0.56|0, 4
2499|1.1|6
3895, 887|0.96, 0.71|2, 2
2012|0.98|0


### For each cluster, create a time series of mentions in abstracts over time

We now need to search for the clusters to pull out the frequency of appearance in abstracts over time. For the cluster ["Internet of things", "IoT"], all abstracts that mention **either** term are included (i.e. an abstract that uses "Internet of things" without the abbreviation "IoT" still counts towards the total for that year). We take document frequency, not term frequency: the number of times the terms are mentioned in a document does not matter, as long as at least one of them is mentioned once.

In [65]:
clusters = pd.read_csv('clusters2.csv')
# The 'terms' column holds each cluster's terms as a ', '-separated string
cluster_list = [c.split(', ') for c in clusters['terms']]

# List all the cluster terms. This will be more than the total number of clusters.
all_cluster_terms = [term for cluster in cluster_list for term in cluster]

# Column of each term in the vectorizer output, which follows the order of the
# vocabulary list (first occurrence wins, matching list.index).
term_to_column = {}
for column, term in enumerate(all_cluster_terms):
    term_to_column.setdefault(term, column)

# Get the cluster titles. These label the columns of the saved DataFrame.
cluster_titles = list(clusters['title'])

years = list(range(2006,2021))

# This is where we will store the data. The columns correspond to clusters, the rows to years
prevalence_array = np.zeros([len(years), len(cluster_list)])

# The vectorizer does not depend on the year, so build it once. stop_words is
# passed as a list because current scikit-learn validates this parameter as
# 'english', a list or None; `stop` is extended with .add() earlier in the
# notebook, i.e. it is set-like.
vectorizer = CountVectorizer(strip_accents='unicode',
                             ngram_range=(1,4),
                             vocabulary=all_cluster_terms,
                             stop_words=list(stop)
                            )

for i, year in enumerate(tqdm(years)):
    with open("../../Data/semantic_scholar_cleaned_langdetect/"+str(year)+".txt", "r") as f:
        # one document per line
        documents = [d.strip() for d in f]

    vector = vectorizer.fit_transform(documents)

    del documents

    for j, cluster in enumerate(cluster_list):
        indices = [term_to_column[term] for term in cluster]

        # If there are multiple terms in a cluster, sum the cluster columns together
        summed_column = np.asarray(vector[:,indices].sum(axis=1)).ravel()

        # We're only counting documents here, not total occurrences: a document
        # counts once if it mentions any term of the cluster at least once.
        prevalence_array[i, j] = (summed_column != 0).sum()


# Save the data
df = pd.DataFrame(data=prevalence_array, index=years, columns=cluster_titles)
with open('../cluster_prevalence/papers.p', 'wb') as f:
    pickle.dump(df, f)


  0%|                                                                                           | 0/15 [00:00<?, ?it/s][A
  7%|█████▌                                                                             | 1/15 [00:40<09:29, 40.70s/it][A
 13%|███████████                                                                        | 2/15 [01:25<09:06, 42.08s/it][A
 20%|████████████████▌                                                                  | 3/15 [02:14<08:48, 44.07s/it][A
 27%|██████████████████████▏                                                            | 4/15 [03:09<08:39, 47.23s/it][A
 33%|███████████████████████████▋                                                       | 5/15 [04:09<08:31, 51.11s/it][A
 40%|█████████████████████████████████▏                                                 | 6/15 [05:13<08:13, 54.87s/it][A
 47%|██████████████████████████████████████▋                                            | 7/15 [06:21<07:50, 58.82s/it][A
 53%|██████████