In [37]:
# -*- coding: utf-8 -*-
import json
import io, os, shutil
import subprocess
import folium
    

#function to create topic maps based on JSON files by the HMDP topic model
def map_from_JSON(base_folder, runs, color='auto', marker_size=10, tiles='Stamen Toner'):
    """Create one folium topic map per ``*_geojson`` folder of an HMDP run.

    The function searches ``<base_folder>/output_HMDP/<runs>/`` recursively
    for directories whose name ends in ``_geojson``. For each such directory
    it builds a folium map with one feature group (layer) per ``.geojson``
    file and one circle marker per GeoJSON feature, and saves the result as
    ``topic_map.htm`` inside that directory.

    Parameters
    ----------
    base_folder : str
        Folder that contains the ``output_HMDP`` directory.
    runs : int or str
        Name of the run sub-folder to read (converted with ``str``).
    color : str, optional
        ``'auto'`` takes each marker's fill colour from the feature's
        ``fillColor`` property (prefixed with ``#``); any other value is used
        as the fill colour of every marker.
    marker_size : int, optional
        Radius passed to every circle marker.
    tiles : str, optional
        Tile set passed to ``folium.Map``.
        NOTE(review): recent folium releases may no longer bundle the Stamen
        tiles -- confirm against the installed folium version.

    Returns
    -------
    None
        If no ``*_geojson`` folder is found, nothing is written.
    """
    # Only the folder of the given run is searched; maps for other runs
    # require further calls with a different `runs` value.
    final_run_folder = base_folder + "/output_HMDP/" + str(runs) + "/"

    # Collect every directory below the run folder that holds GeoJSON files.
    folders = [path for path, _, _ in os.walk(final_run_folder)
               if path.endswith("_geojson")]
    for folder in folders:
        print("opening folder "+folder+":")

        f_map = folium.Map(location=[50, 6], tiles=tiles, zoom_start=1)

        # One layer per GeoJSON file, in sorted file-name order.
        files = sorted(
            name for name in os.listdir(folder)
            if name.endswith(".geojson")
            and os.path.isfile(os.path.join(folder, name))
        )
        for file in files:
            print("processing "+file+" ...")

            # JSON text is UTF-8; do not depend on the locale's default encoding.
            with open(os.path.join(folder, file), encoding="utf-8") as f:
                geojson = json.load(f)

            # The layer is named after the file name up to its first dot.
            feature_group = folium.FeatureGroup(file.split(".")[0])
            for feature in geojson['features']:
                # Position, colour and transparency come from the JSON.
                # NOTE(review): the pair is read as (lat, lon) although
                # standard GeoJSON stores [lon, lat] -- confirm the order
                # used by the HMDP writer.
                lat, lon = feature['geometry']['coordinates']
                if color == 'auto':
                    fill_color = "#" + feature['properties']['fillColor']
                else:
                    fill_color = color
                marker = folium.CircleMarker(
                    [lat, lon],
                    fill_color=fill_color,
                    fill_opacity=feature['properties']['fillOpacity'],
                    color="none",
                    radius=marker_size,
                )
                feature_group.add_child(marker)

            f_map.add_child(feature_group)

        # Layer control lets the user switch individual topics on and off.
        folium.LayerControl().add_to(f_map)
        f_map.save(folder+'/topic_map.htm')
        print('created map in: '+folder+'/topic_map.htm')
        
#function to call the HMDP topic model. Manual:
#https://github.com/gesiscss/promoss
def HMDP_topicmodel(
    directory,
    meta_params,
    T=100,
    RUNS=200,
    SAVE_STEP=10,
    TRAINING_SHARE=1.0,
    BATCHSIZE=128,
    BATCHSIZE_GROUPS=128,
    BURNIN=0,
    BURNIN_DOCUMENTS=0,
    INIT_RAND=0,
    SAMPLE_ALPHA=1,
    BATCHSIZE_ALPHA=1000,
    MIN_DICT_WORDS=100,
    alpha_0=1,
    alpha_1=1,
    epsilon="none",
    delta_fix="none",
    rhokappa=0.5,
    rhotau=64,
    rhos=1,
    rhokappa_document=0.5,
    rhotau_document=64,
    rhos_document=1,
    rhokappa_group=0.5,
    rhotau_group=64,
    rhos_group=1,
    processed=True,
    stemming=False,
    stopwords=False,
    language="en",
    store_empty=True,
    topk=100,
    jar_path="../promoss.jar"):
    """Run the HMDP topic model (promoss.jar) and build the topic maps.

    Steps, in order:

    1. Remove stale ``cluster_desc``, ``groups`` and ``groups.txt`` entries
       from ``directory`` (``output_HMDP`` and the other files are left
       in place).
    2. Return early, with a message, if ``jar_path`` does not exist.
    3. Start ``java -jar <jar_path>`` and echo every line it writes until the
       process ends.
    4. Call ``map_from_JSON(directory, RUNS)`` to create the maps.

    Parameters
    ----------
    directory : str
        Working directory handed to the jar via ``-directory``; also the base
        folder in which the maps are created.
    meta_params : str
        Metadata description forwarded via ``-meta_params``
        (for example ``"G(100)"``).
    jar_path : str, optional
        Location of ``promoss.jar``.
    T, RUNS, SAVE_STEP, ..., topk
        Each value is converted with ``str`` and forwarded to the jar as the
        command-line option of the same name; see the PROMOSS manual
        (https://github.com/gesiscss/promoss) for their meaning. ``RUNS`` also
        selects the run folder that is mapped afterwards.

    Returns
    -------
    None
    """
    print("Running HMDP topic model... (please wait)")

    # Remove leftovers of a previous run. Only these three entries are
    # cleaned up; everything else in `directory` is kept.
    if os.path.isdir(directory+"/cluster_desc"):
        shutil.rmtree(directory+"/cluster_desc")
    if os.path.isfile(directory+"/groups"):
        os.remove(directory+"/groups")
    if os.path.isfile(directory+"/groups.txt"):
        os.remove(directory+"/groups.txt")

    if not os.path.isfile(jar_path):
        print("Could not find " + jar_path + ". Exit")
        return

    # (option name, value) pairs in the order they are put on the command line.
    options = [
        ('-directory', directory),
        # meta_params was previously accepted but never passed to the jar.
        ('-meta_params', meta_params),
        ('-T', T),
        ('-RUNS', RUNS),
        ('-SAVE_STEP', SAVE_STEP),
        ('-TRAINING_SHARE', TRAINING_SHARE),
        ('-BATCHSIZE', BATCHSIZE),
        ('-BATCHSIZE_GROUPS', BATCHSIZE_GROUPS),
        ('-BURNIN', BURNIN),
        ('-BURNIN_DOCUMENTS', BURNIN_DOCUMENTS),
        ('-INIT_RAND', INIT_RAND),
        ('-SAMPLE_ALPHA', SAMPLE_ALPHA),
        ('-BATCHSIZE_ALPHA', BATCHSIZE_ALPHA),
        ('-MIN_DICT_WORDS', MIN_DICT_WORDS),
        ('-alpha_0', alpha_0),
        ('-alpha_1', alpha_1),
        ('-epsilon', epsilon),
        ('-delta_fix', delta_fix),
        ('-rhokappa', rhokappa),
        ('-rhotau', rhotau),
        ('-rhos', rhos),
        ('-rhokappa_document', rhokappa_document),
        ('-rhotau_document', rhotau_document),
        ('-rhos_document', rhos_document),
        ('-rhokappa_group', rhokappa_group),
        ('-rhotau_group', rhotau_group),
        ('-rhos_group', rhos_group),
        ('-processed', processed),
        ('-stemming', stemming),
        ('-stopwords', stopwords),
        ('-language', language),
        ('-store_empty', store_empty),
        ('-topk', topk),
    ]
    command = ['java', '-jar', jar_path]
    for flag, value in options:
        command += [flag, str(value)]

    # Text-mode pipe: lines arrive already decoded. stderr is merged into
    # stdout so that Java error messages show up in the notebook as well.
    with subprocess.Popen(command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          universal_newlines=True,
                          encoding='utf-8',
                          errors='replace') as process:
        # Iterating to EOF guarantees that the final lines are printed even
        # when the process has already exited.
        for line in process.stdout:
            print(line.rstrip('\r\n'))
        rc = process.wait()

    if rc != 0:
        print("Warning: " + jar_path + " exited with code " + str(rc)
              + "; the output may be incomplete.")

    print("...creating maps...")
    map_from_JSON(directory, RUNS)
    print("...done.")

In [None]:
# Folder that holds meta.txt and corpus.txt.
directory = "/home/c/work/topicmodels/geo_test/"
# The first value in meta.txt is a pair of geographical coordinates ("G"),
# which we want grouped into 100 clusters.
meta_params = "G(100)"
# Number of topics: 10.
T = 10
# Keep only words that occur at least 2 times.
MIN_DICT_WORDS = 2
# Run the model.
HMDP_topicmodel(directory, meta_params, T=T, MIN_DICT_WORDS=MIN_DICT_WORDS)

Running HMDP topic model... (please wait)
Clustering metadata...
Geographical clustering step 0 (Likelihood: 1.0)
Geographical clustering step 1 (Likelihood: -3476497.185869844)
Geographical clustering step 2 (Likelihood: -2.10056534254786355E18)
Geographical clustering step 3 (Likelihood: -2.03048703021529728E18)
Creating dictionary...
Initialising parameters...
Reading groups...
Processing documents...
.....................................................
Estimating topics...
/home/c/work/topicmodels/geo_test/ run 0 (Topics 0 alpha_0 1.0 alpha_1 1.0 beta_0 0.01 gamma 1.0 delta 1.0 epsilon 1.0
.....................................................
/home/c/work/topicmodels/geo_test/ run 1 (Topics 10 alpha_0 1.0 alpha_1 1.0 beta_0 0.01 gamma 1.0 delta 1.0 epsilon 1.0
.....................................................
/home/c/work/topicmodels/geo_test/ run 2 (Topics 10 alpha_0 5.120616947348738 alpha_1 1.0 beta_0 0.06995314512172907 gamma 1.0 delta 74.7200570359124 epsilon 1.0
........