**Imports**

In [None]:
from clean import complete_metadata_code
from clean import complete_abstracts_code
from clean import complete_vocab_code
from the_data import code_the_data
from clustering import from_the_data_to_dendogram
from time_series import doc_and_year
from time_series import plot_one_topic
from getting_data import source_to_xml
from implement_lda import from_abstracts_to_topics
from implement_lda import strip_abstract
from topic_interpreter import find_nt
from topic_interpreter import topic_words
from plotter import plot_average

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from nltk import word_tokenize
from gensim import corpora, models, similarities
from scipy.cluster.hierarchy import dendrogram

**Getting data**

In [None]:
# Location of the raw XML files handed to source_to_xml. `source` has to be
# assigned here: with the assignment commented out, a fresh kernel fails on the
# next line with NameError.
# NOTE(review): this is a machine-specific absolute path — point it at your local copy.
source = '/Users/elise/Documents/Capstone/Data/all_xml_jstor'
metadata = source_to_xml(source)

**Cleaning data**

In [None]:
# Run the project's metadata cleaning helper on the loaded records. The result is
# what the journal selection, year extraction and abstract extraction below read from.
complete_metadata = complete_metadata_code(metadata)

- Variables: `journals` contains the journal of every record unless a narrower list (such as `physics_journals`) is assigned to it below.

In [None]:
# Default selection: the value at [3][0] of every metadata record (compared against
# journal titles further down), collected while showing a progress bar.
journals = []
for record in tqdm_notebook(complete_metadata):
    journals.append(record[3][0])

# Journal titles that make up the physics subset.
physics_journals = [
    'Proceedings: Mathematical, Physical and Engineering Sciences',
    'Proceedings: Mathematical and Physical Sciences',
    'Proceedings of the Royal Society of London. Series A, Mathematical and Physical Sciences',
    'Proceedings of the Royal Irish Academy. Section A: Mathematical and Physical Sciences',
    'Philosophical Transactions: Physical Sciences and Engineering',
    'Philosophical Transactions: Mathematical, Physical and Engineering Sciences',
    'Philosophical Transactions of the Royal Society of London. Series A, Mathematical and Physical Sciences',
]

# Restrict the analysis to the physics subset; drop this line to keep every journal.
journals = physics_journals

In [None]:
# For every record whose [3][0] entry is one of the selected journals, keep the
# value at [2][0] as a string (the year, going by the variable name).
all_years = [
    str(record[2][0])
    for record in complete_metadata
    if record[3][0] in journals
]

In [None]:
# Build the abstracts for the selected journals from the cleaned metadata, then
# build the vocabulary from those abstracts. Both helpers come from the project's
# `clean` module; presumably they also drop incomplete entries — TODO confirm.
complete_abstracts = complete_abstracts_code(journals, complete_metadata)
complete_vocab = complete_vocab_code(complete_abstracts)

In [None]:
# Report how many abstracts ended up in the selected subset.
print 'The amount of complete entries in this subset of the data: ', len(complete_abstracts)

**Finding Number of Topics**

In [None]:
# Set version of the vocabulary, passed to strip_abstract for every abstract.
complete_vocab_set = set(complete_vocab)

# One stripped abstract per input abstract
# (presumably a token list restricted to the vocabulary — TODO confirm).
stripped_abstracts = [
    strip_abstract(abstract, complete_vocab_set) for abstract in complete_abstracts
]

# gensim dictionary over all stripped abstracts.
dictionary = corpora.Dictionary(stripped_abstracts)

# Bag-of-words representation of each stripped abstract.
corpus = [dictionary.doc2bow(text) for text in stripped_abstracts]

In [None]:
# Call find_nt once for each candidate value 1..99 (with a progress bar).
# The result for value i ends up at index i - 1 of the list.
all_topic_coherence = [
    find_nt(num_topics, corpus, dictionary)
    for num_topics in tqdm_notebook(range(1, 100))
]

In [None]:
def convert_to_mean_and_std(one_topic_coherence):
    """Summarise one coherence result as ``[mean, standard deviation]``.

    The first element of ``one_topic_coherence`` is a sequence of strings; its
    last three entries are ignored. From each remaining string the text before
    the first ``]`` is taken, its leading character (the opening bracket) is
    dropped, and the rest is parsed as a float.
    """
    raw_entries = one_topic_coherence[0][:-3]

    scores = []
    for entry in raw_entries:
        bracketed = entry.split(']')[0]
        scores.append(float(bracketed[1:]))

    return [np.mean(scores), np.std(scores)]

# One [mean, std] pair per entry of all_topic_coherence, in the same order.
mean_and_std_for_all = list(map(convert_to_mean_and_std, all_topic_coherence))

In [None]:
def plot_topic_coherence(minrange, maxrange, dataset=None):
    """Plot mean topic coherence, with standard-deviation error bars, per topic count.

    Parameters
    ----------
    minrange, maxrange : number
        Lower and upper limits of the x-axis.
    dataset : sequence of [mean, std] pairs, optional
        Entry k is plotted at k + 1 topics, matching a sweep that starts at one
        topic. When omitted, the notebook-level ``mean_and_std_for_all`` is used,
        so two-argument calls keep working.
    """
    if dataset is None:
        dataset = mean_and_std_for_all

    y = [pair[0] for pair in dataset]
    std = [pair[1] for pair in dataset]
    # The first entry belongs to 1 topic, not 0, so the axis starts at 1.
    x = list(range(1, len(y) + 1))

    plt.errorbar(x, y, std, color='black', linestyle='-', marker='o', markersize=4, 
                 markerfacecolor='black', elinewidth=0.5, markeredgecolor='black')

    plt.title('All - Number of Topics', fontsize=12)
    plt.ylabel('Average Topic Coherence', fontsize=10)
    plt.xlabel('Number of Topics', fontsize=10)
    plt.ylim([0, 0.04])
    plt.xlim([minrange, maxrange])
    plt.grid(True)
    plt.yticks(fontsize = 7, weight='light')
    plt.xticks(fontsize = 7, weight='light')

    #plt.savefig('/Users/elise/Documents/Capstone/Graphs/TC-physics-' + str(minrange) + '-' + str(maxrange) + '.png')
    plt.show()

In [None]:
# Pass the data explicitly: plot_topic_coherence takes the dataset as its third
# argument. First the full range, then a zoom on 40-60 topics.
plot_topic_coherence(0, 100, mean_and_std_for_all)
plot_topic_coherence(40, 60, mean_and_std_for_all)

- The number of topics is chosen based on the coherence graphs above.

In [None]:
# Number of topics used for the rest of the notebook (LDA, time series, clustering).
# Chosen by hand from the coherence plots; revisit it if the data subset changes.
n = 54

In [None]:
for x in all_topic_coherence[n][1]:
    print topic_words(x)

**Implementing LDA**

In [None]:
# Run the project's LDA pipeline for the chosen n. Note that this rebinds
# stripped_abstracts, corpus and dictionary, replacing the objects built in the
# "Finding Number of Topics" section.
stripped_abstracts, lda_model, corpus, model, dictionary = from_abstracts_to_topics(complete_abstracts, complete_vocab, n)

**Constructing time series**

In [None]:
# Combine the corpus, the per-record years and the model via the time_series helper.
# Presumably yields per-document topic weights tagged with document number and year —
# TODO confirm against time_series.doc_and_year.
with_doc_n_and_year = doc_and_year(corpus, all_years, model)

In [None]:
# Build the per-topic data set that the clustering and plot_average cells consume.
# Presumably one time series per topic (n in total) — TODO confirm against the_data.code_the_data.
the_data = code_the_data(with_doc_n_and_year, all_years, n)

**Clustering**

In [None]:
# Z is what gets handed to scipy's dendrogram() in the cells below
# (presumably a hierarchical-clustering linkage matrix — TODO confirm).
Z = from_the_data_to_dendogram(n, the_data)

In [None]:
# Create the figure and its single axes explicitly and draw the dendrogram onto
# that axes. Looking the figure up again via plt.figure(1) and calling
# add_subplot(111) afterwards depends on the figure number and, in newer
# Matplotlib releases, can stack a fresh empty axes on top of the plot.
fig_x, ax_x = plt.subplots(figsize=(20, 8))
ax_x.set_title('Hierarchical Clustering Dendrogram')
ax_x.set_xlabel('sample index')
ax_x.set_ylabel('distance')
dendrogram(Z, color_threshold=0.6, ax=ax_x)

# Tick label styling is applied after drawing so it is not overridden by dendrogram().
plt.yticks(fontsize = 10, weight='light')
plt.xticks(fontsize = 15, weight='light')

#plt.savefig('/Users/elise/Documents/Capstone/Graphs/dendrogram-physics.png')
plt.show()

In [None]:
print dendrogram(Z, color_threshold=0.6)['leaves']

cluster1 = [25, 35]
cluster3 = [23, 28, 32, 39, 10, 20, 36, 7, 50]
cluster4 = [47, 8, 5, 53, 27, 13, 33, 41, 49, 38, 9, 44, 15, 1, 16, 31, 24, 29, 12, 43, 21, 3, 51, 22, 45] 
cluster5 = [19, 18, 37, 52]
cluster6 = [0, 14, 26, 6, 48, 11, 4, 17, 34, 30, 42, 40, 2, 46]

In [None]:
# Plot each cluster in turn; the third argument is the cluster's running number 1..5.
ordered_clusters = [cluster1, cluster3, cluster4, cluster5, cluster6]
for plot_number, cluster in enumerate(ordered_clusters, 1):
    plot_average(the_data, cluster, plot_number, 'physics-')