In [1]:
import numpy as np

# Be sure to restart the notebook kernel if you make changes to parseTandA
# Re-running this cell does not re-load the module otherwise
from helper import *

# We use matplotlib for plotting. You can basically get any plot layout/style
# etc you want with this module. I'm setting it up for basics here, meaning
# that I want it to parse LaTeX and use the LaTeX font family for all text.
# !! If you don't have a LaTeX distribution installed, this notebook may
#    throw errors when it tries to create the plots. If that happens, 
#    either install a LaTeX distribution or remove/comment the 
#    matplotlib.rcParams.update(...) line.
#    In both cases, restart the kernel of this notebook afterwards.
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

rcparams = {                      
    "pgf.texsystem": "pdflatex",        # change this if using xetex or lautex
    "text.usetex": True,                # use LaTeX to write all text
    "font.family": "lmodern",
    "font.serif": [],                   # blank entries should cause plots to inherit fonts from the document
    "font.sans-serif": [],
    "font.monospace": [],          
    "font.size": 12,
    "legend.fontsize": 12,         
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "pgf.preamble": [
        r"\usepackage[utf8x]{inputenc}",    # use utf8 fonts becasue your computer can handle it :)
        r"\usepackage[T1]{fontenc}",        # plots will be generated using this preamble
        ]
}
matplotlib.rcParams.update(rcparams)

# Load the title dataset

In [3]:
re_parse = False
if re_parse:
    all_titles = load_and_parse_all_titles('alltitles.txt')
    # Save to a file, so we can load it much faster than having
    # to re-parse the raw data.
    np.save("alltitles.npy", all_titles)
else:
    # Load the titles from the file.
    # The atleast_2d is a hack for correctly loading the dictionary...
    all_titles = np.atleast_2d(np.load("alltitles.npy"))[0][0]

In [4]:
# Check the available years
all_years = sorted(list(all_titles.keys()))
print(all_years)

[1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]


## Phrase detection

In [7]:
titles = get_titles_for_years(all_titles, all_years)
ngram_titles, bigrams, ngrams = get_ngram_titles(titles)

In [8]:
# train word2vec 
model = gensim.models.Word2Vec(ngram_titles, window=10, min_count=1, size=128)

In [9]:
print("Majorana + Braiding = %s"%(model[size].most_similar(positive=['majorana', 'braiding'], topn=2)))
print("2D + electrons + magnetic field = %s"%(model[size].most_similar(positive=['two_dimensional', 'electron', 'magnetic_field'], topn=3)))
print("Electron + Hole = %s"%(model[size].most_similar(positive=['electron', 'hole'], topn=3)))
print("Electron - charge = %s"%(model[size].most_similar(positive=['electron'], negative=['charge'], topn=3)))
print("Superconductor + Topological = %s"%(model[size].most_similar(positive=['superconductor', 'topological'], topn=2)))
print("Lattice + Force = %s"%(model[size].most_similar(positive=['lattice', 'force'], topn=2)))
print("Spin + Magnetic Field = %s"%(model[size].most_similar(positive=['spin', 'magnetic_field'], topn=2)))
print("Electron + spin = %s"%(model[size].most_similar(positive=['electron', 'spin'], topn=2)))
print("particle + charge = %s"%(model[size].most_similar(positive=['particle', 'charge'], topn=2)))
print("fermion - mass - charge = %s"%(model[size].most_similar(positive=['fermion'], negative=['mass', 'charge'], topn=2)))
print("\n")
print("superconductor = %s"%(model[size].most_similar(positive=['superconductor'], topn=10)))
print("majorana = %s"%(model[size].most_similar(positive=['majorana'], topn=10)))
print("topological = %s"%(model[size].most_similar(positive=['topological'], topn=10)))

Majorana + Braiding = [('non_abelian', 0.8771545886993408), ('majorana_mode', 0.8684729933738708)]
2D + electrons + magnetic field = [('landau_level', 0.6424900889396667), ('rashba', 0.6295487284660339), ('hole', 0.614865243434906)]
Electron + Hole = [('carrier', 0.7396241426467896), ('gaa', 0.6730054020881653), ('electron_ga', 0.6618585586547852)]
Electron - charge = [('electron_ga', 0.3689453899860382), ('layer_black', 0.34713971614837646), ('digging', 0.33736151456832886)]
Superconductor + Topological = [('weyl_semimetal', 0.7662055492401123), ('topological_insulator', 0.7345409989356995)]
Lattice + Force = [('potential', 0.5878475904464722), ('interparticle_interaction', 0.5842046737670898)]
Spin + Magnetic Field = [('antiferromagnetic', 0.6968852877616882), ('magnetization', 0.6883454322814941)]
Electron + spin = [('magnon', 0.6827082633972168), ('orbital', 0.6752756834030151)]
particle + charge = [('electron', 0.5515502095222473), ('quasiparticle', 0.5027873516082764)]
fermion - 

## Clustering

In [10]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=1000, random_state=0).fit(model.wv.syn0)

In [11]:
sets = {}
for l in np.unique(kmeans.labels_):
    sets[l] = []
for idx,l in enumerate(sorted(kmeans.labels_)):
    sets[l].append(model.wv.index2word[idx])

In [12]:
print(sets.keys())
for k in sets.keys():
    print(k, len(sets[k]))

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,

In [15]:
print(sets[11])

['infrared', 'electrode', 'are', 'anderson_localization', 'acoustic']
