In [1]:
import json
import codecs
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation #sklearn.decomposition.NMF

'''
Topic modelling for currencies.

Prereq for data: /Users/dylan/_git/drh-data-scraper/src/0_run_all.sh

Run me:
python /Users/dylan/_git/drh-cryptopulse/crawlers/coinmarketcap/wf7_temp_currency_topic_modelling.py

Ref: 
    https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    https://de.dariah.eu/tatom/topic_model_python.html
'''


'\nTopic modelling for currencies.\n\nPrereq for data: /Users/dylan/_git/drh-data-scraper/src/0_run_all.sh\n\nRun me:\npython /Users/dylan/_git/drh-cryptopulse/crawlers/coinmarketcap/wf7_temp_currency_topic_modelling.py\n\nRef: \n    https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730\n    https://de.dariah.eu/tatom/topic_model_python.html\n'

In [2]:
def get_docs(infolder='/Users/dylan/_git/drh-cryptopulse/crawlers/coinmarketcap/data_coinmarketcap/'):
    '''
    Load the scraped website records and build one text document per website.

    Reads '3-generic-website.json' from `infolder`. For every record that has a
    non-empty 'content_lines' list of (tag, text) items, the texts are filtered
    and joined into a single string.

    Args:
        infolder: Folder containing '3-generic-website.json'. The file name is
            appended directly, so the value must end with a path separator.
            Defaults to the original hard-coded data folder.

    Returns:
        (documents_names, documents): two parallel lists. Each name is
        '<_url_desc> - <_url_website>'; each document is the joined text.
        A currency can appear more than once (e.g. bitcoin.com and bitcoin.org).

    Raises:
        KeyError: if a record with content lacks '_url_desc' or '_url_website'.
    '''
    infile_generic_websites = infolder + '3-generic-website.json'

    # Filtering settings are loop-invariant, so they are defined once up front.
    exclude_tags = ('a', 'option')   # Text inside these tags is dropped
    min_content_length = 2           # Keep only texts strictly longer than this
    join_string = '. '

    # Read as UTF-8 explicitly: the outputs of this workflow are written as
    # UTF-8, and the locale default encoding is not guaranteed to match.
    with open(infile_generic_websites, encoding='utf-8') as f:
        websites = json.load(f)

    documents = []
    documents_names = []

    # Website(s) rows (can be more than 1 for a currency)
    for website in websites:
        content_lines = website.get('content_lines', None)
        if not content_lines:
            continue

        # Each item is indexed as (tag, text): drop excluded tags, then short texts.
        texts = [line[1] for line in content_lines if line[0] not in exclude_tags]
        texts = [text for text in texts if len(text) > min_content_length]

        # A record whose lines were all filtered out still yields an (empty)
        # document, so names and documents stay aligned with the input records.
        documents.append(join_string.join(texts))
        documents_names.append('{} - {}'.format(website['_url_desc'], website['_url_website']))

    return documents_names, documents
    

def get_topics(model, feature_names, no_top_words):
    '''
    Print and return the highest-weighted words of every topic in a model.

    Args:
        model: Object exposing `components_`, iterated one topic at a time;
            each topic must support `.argsort()`.
        feature_names: Sequence mapping a feature index to its word.
        no_top_words: How many words to list per topic.

    Returns:
        A list with one comma-separated string of words per topic, ordered
        from the highest weight downwards.
    '''
    summaries = []

    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort reversed gives descending weight; then truncate.
        top_indices = weights.argsort()[::-1][:no_top_words]
        summary = ", ".join(feature_names[i] for i in top_indices)
        print("Topic %d: \t%s" % (topic_idx, summary))
        summaries.append(summary)

    return summaries


def get_doctopic_relations(method, model, tf, documents_names, feature_names, header=False):
    '''
    Build tab-separated rows giving each document's topic distribution.

    Every row has the columns: method, currency, website, then one integer
    percentage per topic. The optional header row names exactly these columns,
    so header and data rows always have the same number of fields.

    Args:
        method: Label written in the first column (e.g. 'lda' or 'nmf').
        model: Fitted model exposing `transform` and `components_`.
        tf: Document-term matrix passed to `model.transform`.
        documents_names: Names formatted as '<currency> - <website>'.
        feature_names: Unused; kept so existing callers keep working.
        header: When True, the first returned row is the column header.

    Returns:
        A list of tab-separated strings, one per document (plus the header).
    '''
    doc_topic_distrib = model.transform(tf)  # https://stackoverflow.com/questions/34429635/topic-modelling-assign-a-document-with-top-2-topics-as-category-label-sklear

    topic_labels = ['Topic ' + str(x) for x in range(len(model.components_))]

    result = []

    if header:
        result.append('Method\tCurrency\tWebsite\t' + '\t'.join(topic_labels))

    for i, raw_name in enumerate(documents_names):
        # Strip the URL scheme and all slashes so only the bare host/path remains.
        document_name = raw_name.replace('http:', '').replace('https:', '').replace('/', '')

        # Split into the two columns the header promises. Splitting on the LAST
        # ' - ' keeps a currency description that itself contains ' - ' intact.
        currency, separator, website = document_name.rpartition(' - ')
        if not separator:
            # No separator: treat the whole name as the currency.
            currency, website = document_name, ''

        # Percentages are truncated towards zero, not rounded.
        topic_dist_pc = [str(int(x * 100)) for x in doc_topic_distrib[i]]
        result.append('{}\t{}\t{}\t{}'.format(method, currency, website, '\t'.join(topic_dist_pc)))

    return result


'''
Calculate cosine similarities between different topic distributions.
'''
def get_doctopic_similarities(model, tf, documents_names):
    '''
    Compute pairwise cosine similarities between documents' topic distributions.

    Args:
        model: Fitted model exposing `transform`.
        tf: Document-term matrix passed to `model.transform`.
        documents_names: Document names; only their count is used, to select
            how many leading rows of the topic distribution are compared.

    Returns:
        The square matrix returned by `cosine_similarity`, where entry [i][j]
        compares document i with document j.
    '''
    doc_topic_distrib = model.transform(tf)  # https://stackoverflow.com/questions/34429635/topic-modelling-assign-a-document-with-top-2-topics-as-category-label-sklear

    # One row per named document. The topic labels and sanitized names that
    # used to be built here were never used, so they are no longer computed.
    matrix = np.asarray(doc_topic_distrib)[:len(documents_names)]

    return cosine_similarity(matrix)

'''
Append column names to lefthand column of existing matrix outputted file.
'''
def append_lefthand_column_to_matrix(filename, column_names):
    '''
    Prefix every line of an existing matrix file with a tab-separated label.

    The file is rewritten in place. Its first line is assumed to be a header
    that already holds the names, so that line gets an empty label.

    Args:
        filename: Path of the file to rewrite.
        column_names: List of labels, one per line after the first. The list
            itself is not modified.

    Raises:
        AssertionError: if the label count (plus the blank header label) does
            not equal the number of lines; the file is left untouched then.
    '''
    with open(filename, 'r') as source:
        rows = source.readlines()

    # Work on a copy; the leading blank label pairs with the header line.
    row_labels = column_names.copy()
    row_labels.insert(0, '')

    assert len(row_labels) == len(rows), 'Column names count did not match row/line count'

    # TODO: just get column names from 1st row! Don't need to supply them.

    with open(filename, 'w') as target:
        for label, row in zip(row_labels, rows):
            target.write(label + '\t' + row)


'''
Write out d3 graph formatted connections json file.

https://bl.ocks.org/mbostock/3750558 (Sticky Force Layout)
https://bl.ocks.org/mbostock/950642 (Labeled Force Layout)

'''
def write_d3_graph_json(filename, cossim_nmf, documents_names, threshold=0.7):
    '''
    Write a d3 force-layout graph ({"nodes": [...], "links": [...]}) as JSON.

    Every document becomes a node. A link is written for each pair (n, m) with
    m < n whose similarity is strictly greater than `threshold`; source and
    target are node indices.

    Args:
        filename: Path of the JSON file to (over)write.
        cossim_nmf: Square similarity matrix indexable as cossim_nmf[n][m].
        documents_names: Node names, in the same order as the matrix rows.
        threshold: Minimum (exclusive) similarity for a link.
    '''
    # json.dumps quotes and escapes each name, so names containing quotes,
    # backslashes or control characters still produce a valid JSON file.
    # Non-ASCII characters are written as \\uXXXX escapes.
    node_rows = [
        '{{"name": {}, "group": 1}}\n'.format(json.dumps(str(name)))
        for name in documents_names
    ]

    # Only the lower triangle is scanned: the matrix is treated as symmetric
    # and self-links are skipped.
    link_rows = [
        '{{"source":{},"target":{},"value":1}}\n'.format(n, m)
        for n in range(len(cossim_nmf))
        for m in range(n)
        if cossim_nmf[n][m] > threshold
    ]

    with open(filename, 'w') as f:
        f.write('{ "nodes": [\n')
        f.write(','.join(node_rows))
        f.write('],\n')
        f.write('"links": [\n')
        f.write(','.join(link_rows))
        f.write(']}\n')


In [4]:
'''
Main
'''
def main():
    '''
    Fit LDA and NMF topic models on the scraped website texts and write reports.

    Steps, in order:
      1. Load documents via get_docs() and keep at most `max_docs` of them.
      2. LDA on raw term counts: print topics, write the document/topic table
         and the document cosine-similarity matrix.
      3. NMF on tf-idf values: same outputs, plus a d3 graph JSON file.
      4. Write a combined table (NMF rows with header, then LDA rows).

    All files are written into `outfolder`.
    '''
    print('Started...')

    num_features = 1000
    display_num_top_words = 15
    n_components = 10 # Topic count

    max_docs = 50

    outfolder = '/Users/dylan/_git/drh-cryptopulse/crawlers/coinmarketcap/data_coinmarketcap/'

    documents_names, documents = get_docs()

    documents_names = documents_names[0:max_docs]
    documents = documents[0:max_docs]

    print('num docs:', len(documents))

    # TODO: check out: https://markroxor.github.io/gensim/static/notebooks/gensim_news_classification.html

    # ######################################################################
    # TODO: LSA/LSI
    # ######################################################################

    # See: gensim.models.LsiModel (https://markroxor.github.io/gensim/static/notebooks/gensim_news_classification.html#topic=2&lambda=0.53&term=)

    # ######################################################################
    # TODO: PLSA
    # ######################################################################


    # ######################################################################
    # LDA (Latent Dirichlet Allocation) using term counts.
    # ######################################################################

    # LDA can only use raw term counts because it is a probabilistic graphical model
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(documents)
    tf_feature_names = _feature_names(tf_vectorizer)

    # Run LDA
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

    print('\n\nLDA')
    get_topics(lda, tf_feature_names, display_num_top_words)  # Called for its printed output

    relations_lda = get_doctopic_relations('lda', lda, tf, documents_names, tf_feature_names)
    _write_relations(outfolder + '_topic_modeling_LDA.tsv', relations_lda)

    # Doc similarities
    cossim_lda = get_doctopic_similarities(lda, tf, documents_names)
    _write_similarity_matrix(outfolder + '_doc_cossim_LDA.tsv', cossim_lda, documents_names)


    # ######################################################################
    # NMF (Non-negative Matrix Factorization) using TFIDF values.
    # ######################################################################

    nmf_min_df = 10 #2

    # NMF is able to use tf-idf
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=nmf_min_df, max_features=num_features, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(documents)
    tfidf_feature_names = _feature_names(tfidf_vectorizer)

    # Run NMF
    # NOTE(review): recent scikit-learn releases replace `alpha` with
    # `alpha_W`/`alpha_H`, which scale the regularization differently. The call
    # is left unchanged so results stay comparable - confirm against the
    # installed version before upgrading.
    nmf = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

    print('NMF')
    get_topics(nmf, tfidf_feature_names, display_num_top_words)  # Called for its printed output

    relations_nmf = get_doctopic_relations('nmf', nmf, tfidf, documents_names, tfidf_feature_names, True)
    _write_relations(outfolder + '_topic_modeling_NMF.tsv', relations_nmf)

    # Doc similarities
    cossim_nmf = get_doctopic_similarities(nmf, tfidf, documents_names)
    _write_similarity_matrix(outfolder + '_doc_cossim_NMF.tsv', cossim_nmf, documents_names)

    write_d3_graph_json(outfolder + '_graph_NMF.json', cossim_nmf, documents_names)


    # ######################################################################
    # ALL
    # ######################################################################

    # TEMP: ??

    # NMF rows first (they carry the header), then the LDA rows, in one pass.
    _write_relations(outfolder + '_topic_modeling_ALL.tsv', relations_nmf, relations_lda)

    print('Finished.')


def _feature_names(vectorizer):
    '''Return a fitted vectorizer's terms as a list on old and new scikit-learn.'''
    # Newer releases expose get_feature_names_out(); older ones only have
    # get_feature_names(). Prefer the new name and fall back to the old one.
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def _write_relations(path, *relation_groups):
    '''Write each group of relation rows to `path` as UTF-8, one row per line.'''
    with codecs.open(path, 'w', 'utf-8') as f:
        for rows in relation_groups:
            f.write('\n'.join(rows))
            f.write('\n')


def _write_similarity_matrix(path, cossim, documents_names):
    '''Save a similarity matrix as TSV with document names as header and row labels.'''
    np.savetxt(path, cossim, delimiter='\t', fmt='%1.2f', header='\t'.join(documents_names), comments='')
    append_lefthand_column_to_matrix(path, documents_names)



In [5]:
# Run the whole workflow when this code is executed directly (as a script or
# as a notebook cell), but not when it is imported as a module.
if __name__ == '__main__':
    main()

Started...
num docs: 50


LDA
Topic 0: 	tokens, blockchain, platform, buyer, token, waves, ethereum, agreement, parties, assets, company, person, new, applications, use
Topic 1: 	buyer, tokens, agreement, company, software, token, distribution, io, blockchain, contract, block, parties, platform, including, 2017
Topic 2: 	ethereum, 2016, 2017, platform, blockchain, august, july, new, bitcoin, community, eth, wallet, contract, tokens, user
Topic 3: 	blockchain, bitcoin, network, platform, wallet, technology, decentralized, use, new, transactions, digital, development, community, 2017, world
Topic 4: 	blockchain, developer, bitcoin, end, buyer, network, tokens, 2017, new, technology, applications, wallet, world, market, decentralized
Topic 5: 	litecoin, days, pending, hours, btc, exchange, nov, site, core, 10, ltc, exchanges, 12, users, using
Topic 6: 	assets, digital, ethereum, daa, platform, 2017, 2016, blockchain, manage, various, performance, risk, asset, investors, index
Topic 7: 	ri