In [21]:
# -*- coding: utf-8 -*-
import json
import io, os, shutil
import subprocess
import folium
from IPython.core.display import HTML
from IPython.display import IFrame, display
import matplotlib.pyplot as plt


#function to create topic maps based on JSON files by the HMDP topic model
def map_from_JSON(base_folder, runs, color='auto', marker_size=10, show_map=False):
    """Build one folium map per ``*_geojson`` folder of an HMDP run.

    Searches below ``<base_folder>/output_HMDP/<runs>`` for directories whose
    name ends in ``_geojson``. Every ``*.geojson`` file in such a directory
    becomes one toggleable layer of circle markers, ordered by the number in
    the file name. The map is written to ``topic_map.htm`` inside that
    directory and a link to it is displayed.

    Parameters
    ----------
    base_folder : str
        Directory containing the ``output_HMDP`` folder.
    runs : int or str
        Name of the run sub-folder to read; only this run is mapped.
    color : str
        ``'auto'`` takes each marker colour from the feature property
        ``fillColor`` (prefixed with ``#``); any other value is used as the
        fill colour of every marker.
    marker_size : int
        Radius passed to ``folium.CircleMarker``.
    show_map : bool
        If true, also save a copy under ``tmp/`` and embed it in an IFrame.
        Can consume quite some memory.

    Returns
    -------
    None
    """

    def topic_order(file_name):
        #order by the number embedded in the name so that topic_10 comes
        #after topic_9 (plain string sorting puts it right after topic_1);
        #names without any digit are placed last
        digits = "".join(ch for ch in file_name.split(".")[0] if ch.isdigit())
        if digits:
            return (0, int(digits), file_name)
        return (1, 0, file_name)

    #we only create a map for the given run folder
    final_run_folder = os.path.join(base_folder, "output_HMDP", str(runs))

    #traverse folders containing geojson files
    folders = [root for root, _, _ in os.walk(final_run_folder) if root.endswith("_geojson")]
    for folder in folders:
        print("opening folder "+folder+":")
        folder_name = os.path.basename(folder)

        #create new folium map
        f_map = folium.Map(location=[50, 6], tiles='Stamen Toner', zoom_start=1)

        #traverse geoJSON files in topic order
        files = [name for name in os.listdir(folder)
                 if name.endswith(".geojson") and os.path.isfile(os.path.join(folder, name))]
        files.sort(key=topic_order)
        for file in files:
            print("processing "+file+" ...")

            #the with-block closes the file, no explicit close() needed
            with open(os.path.join(folder, file), encoding="utf-8") as f:
                geojson = json.load(f)

            #one layer per file, named after the file without extension
            feature_group = folium.FeatureGroup(file.split(".")[0])
            for feature in geojson['features']:
                #we get position, colour, transparency from JSON
                #NOTE(review): coordinates are read as (lat, lon) exactly as
                #before; standard GeoJSON stores [lon, lat] - confirm what
                #promoss writes
                lat, lon = feature['geometry']['coordinates']
                if color == 'auto':
                    fillColor = "#"+feature['properties']['fillColor']
                else:
                    fillColor = color
                fillOpacity = feature['properties']['fillOpacity']
                marker = folium.CircleMarker([lat, lon],
                                             fill_color=fillColor,
                                             fill_opacity=fillOpacity,
                                             color="none",
                                             radius=marker_size)
                feature_group.add_child(marker)

            f_map.add_child(feature_group)

        #add layer control to activate/deactivate topics
        folium.LayerControl().add_to(f_map)
        #save map
        out_file = os.path.join(folder, 'topic_map.htm')
        f_map.save(out_file)
        print('created map in: '+out_file)

        #show map only if wanted, can consume quite some memory
        if show_map:
            if not os.path.exists("tmp"):
                os.makedirs("tmp")

            tmp_file = "tmp/"+folder_name+"_map.html"
            f_map.save(tmp_file)
            #the IFrame already shows the map; the former extra call to
            #f_map._repr_png() was dropped because folium maps do not
            #provide a method of that name
            display(IFrame(tmp_file, width=400, height=400))
        display(HTML('<a href="'+out_file+'">Link to map of '+folder_name.replace("_geojson","")+'</a>'))

#plot the feature weights (zeta) written by a run
def plot_zeta(directory, RUNS):
    """Show a bar chart of the values stored in a run's ``zeta`` file.

    Reads ``<directory>/output_HMDP/<RUNS>/zeta`` and draws one bar per value,
    with the x axis labelled "Features" and the y axis "Feature weight".

    NOTE(review): the file is assumed to contain plain numbers separated by
    whitespace and/or commas - confirm against the promoss output format.

    Parameters
    ----------
    directory : str
        Directory containing the ``output_HMDP`` folder.
    RUNS : int or str
        Name of the run sub-folder whose ``zeta`` file is plotted.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the file contains a token that is not a number.
    """
    zeta_path = os.path.join(directory, "output_HMDP", str(RUNS), "zeta")
    if not os.path.isfile(zeta_path):
        print("Could not find "+zeta_path)
        return

    with open(zeta_path) as zeta_file:
        #treat commas like whitespace so both separators are accepted
        weights = [float(token) for token in zeta_file.read().replace(",", " ").split()]

    plt.bar(range(len(weights)), weights)
    plt.xlabel("Features")
    plt.ylabel("Feature weight")
    plt.show()
    
    
#function to call the HMDP topic model. Manual:
#https://github.com/gesiscss/promoss
def HMDP_topicmodel(
    directory,
    meta_params,
    T=100,
    RUNS=200,
    SAVE_STEP=10,
    TRAINING_SHARE=1.0,
    BATCHSIZE=128,
    BATCHSIZE_GROUPS=128,
    BURNIN=0,
    BURNIN_DOCUMENTS=0,
    INIT_RAND=0,
    SAMPLE_ALPHA=1,
    BATCHSIZE_ALPHA=1000,
    MIN_DICT_WORDS=100,
    alpha_0=1,
    alpha_1=1,
    epsilon="none",
    delta_fix="none",
    rhokappa=0.5,
    rhotau=64,
    rhos=1,
    rhokappa_document=0.5,
    rhotau_document=64,
    rhos_document=1,
    rhokappa_group=0.5,
    rhotau_group=64,
    rhos_group=1,
    processed=True,
    stemming=False,
    stopwords=False,
    language="en",
    store_empty=True,
    topk=100,
    jar_path="../promoss.jar"):
    """Run promoss.jar on ``directory`` and create the topic maps afterwards.

    Steps:
      1. Delete ``cluster_desc/``, ``groups`` and ``groups.txt`` from
         ``directory`` if present (``output_HMDP``, ``texts.txt``,
         ``words.txt`` and ``wordsets`` are left in place).
      2. Start ``java -jar <jar_path>`` with every model parameter passed as
         ``-NAME value`` and echo its output line by line.
      3. Call ``map_from_JSON(directory, RUNS)``.

    Parameters
    ----------
    directory : str
        Working directory handed to promoss via ``-directory``.
    meta_params : str
        Metadata description handed to promoss via ``-meta_params``
        (e.g. ``"G(1000)"``).
    T, RUNS, ..., topk
        Forwarded unchanged (converted with ``str``) as command-line options
        of the same name; see the promoss manual for their meaning.
    jar_path : str
        Location of the promoss jar file. Defaults to ``../promoss.jar``.

    Returns
    -------
    None
        Returns early, without creating maps, when the jar file is missing
        or the java process cannot be started.
    """
    print("Running HMDP topic model... (please wait)")

    #remove the cluster description folder and the groups files
    stale_dir = os.path.join(directory, "cluster_desc")
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)
    for stale_name in ("groups", "groups.txt"):
        stale_file = os.path.join(directory, stale_name)
        if os.path.isfile(stale_file):
            os.remove(stale_file)

    if not os.path.isfile(jar_path):
        print("Could not find "+jar_path+". Exit")
        return

    #every entry becomes "-NAME value" on the command line, in this order
    options = [
        ('directory', directory),
        #meta_params was previously accepted but never passed to promoss
        ('meta_params', meta_params),
        ('T', T),
        ('RUNS', RUNS),
        ('SAVE_STEP', SAVE_STEP),
        ('TRAINING_SHARE', TRAINING_SHARE),
        ('BATCHSIZE', BATCHSIZE),
        ('BATCHSIZE_GROUPS', BATCHSIZE_GROUPS),
        ('BURNIN', BURNIN),
        ('BURNIN_DOCUMENTS', BURNIN_DOCUMENTS),
        ('INIT_RAND', INIT_RAND),
        ('SAMPLE_ALPHA', SAMPLE_ALPHA),
        ('BATCHSIZE_ALPHA', BATCHSIZE_ALPHA),
        ('MIN_DICT_WORDS', MIN_DICT_WORDS),
        ('alpha_0', alpha_0),
        ('alpha_1', alpha_1),
        ('epsilon', epsilon),
        ('delta_fix', delta_fix),
        ('rhokappa', rhokappa),
        ('rhotau', rhotau),
        ('rhos', rhos),
        ('rhokappa_document', rhokappa_document),
        ('rhotau_document', rhotau_document),
        ('rhos_document', rhos_document),
        ('rhokappa_group', rhokappa_group),
        ('rhotau_group', rhotau_group),
        ('rhos_group', rhos_group),
        ('processed', processed),
        ('stemming', stemming),
        ('stopwords', stopwords),
        ('language', language),
        ('store_empty', store_empty),
        ('topk', topk),
    ]
    command = ['java', '-jar', jar_path]
    for option_name, option_value in options:
        command.extend(['-'+option_name, str(option_value)])

    try:
        #stderr is merged into stdout so that Java errors show up in the notebook
        process = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
    except OSError as error:
        #raised e.g. when no java executable is on the PATH
        print("Could not start java: "+str(error))
        return

    #readline() returns b'' only at end of stream, so no output line is lost
    for raw_line in iter(process.stdout.readline, b''):
        print(raw_line.decode('utf-8', errors='replace').rstrip('\r\n'))
    process.stdout.close()
    rc = process.wait()
    if rc != 0:
        print("promoss exited with return code "+str(rc))

    print("...creating maps...")
    map_from_JSON(directory, RUNS)
    print("...done.")

In [22]:
# Folder that holds meta.txt and corpus.txt.
directory = "/home/c/work/topicmodels/food_test/"
# The first value in meta.txt is a geographical coordinate;
# G(1000) asks for 1000 geographical clusters.
meta_params = "G(1000)"
# Number of topics.
T = 25
# Keep only words that appear at least 2 times.
MIN_DICT_WORDS = 2
# Fit the model for 10 runs.
HMDP_topicmodel(
    directory,
    meta_params,
    T=T,
    MIN_DICT_WORDS=MIN_DICT_WORDS,
    RUNS=10,
)

Running HMDP topic model... (please wait)
Clustering metadata...
Geographical clustering step 0 (Likelihood: 1.0)


KeyboardInterrupt: 

In [59]:
def map_from_JSON(base_folder, runs, color='auto', marker_size=10):
    """Build one folium map per ``*_geojson`` folder of an HMDP run.

    Searches below ``<base_folder>/output_HMDP/<runs>`` for directories whose
    name ends in ``_geojson``. Every ``*.geojson`` file in such a directory
    becomes one toggleable layer of circle markers, ordered by the topic
    number in the file name. The map is written to ``topic_map.htm`` inside
    that directory, a copy is saved under ``tmp/`` and a link to the map is
    displayed.

    Parameters
    ----------
    base_folder : str
        Directory containing the ``output_HMDP`` folder.
    runs : int or str
        Name of the run sub-folder to read; only this run is mapped.
    color : str
        ``'auto'`` takes each marker colour from the feature property
        ``fillColor`` (prefixed with ``#``); any other value is used as the
        fill colour of every marker.
    marker_size : int
        Radius passed to ``folium.CircleMarker``.

    Returns
    -------
    None
    """

    def topic_order(file_name):
        #order by the number embedded in the name so that topic_10 comes
        #after topic_9; names without any digit are placed last
        digits = "".join(ch for ch in file_name.split(".")[0] if ch.isdigit())
        if digits:
            return (0, int(digits), file_name)
        return (1, 0, file_name)

    #we only create a map for the given run folder
    final_run_folder = os.path.join(base_folder, "output_HMDP", str(runs))

    #traverse folders containing geojson files
    folders = [root for root, _, _ in os.walk(final_run_folder) if root.endswith("_geojson")]
    for folder in folders:
        print("Processing folder "+folder+"...")
        folder_name = os.path.basename(folder)

        #create new folium map
        f_map = folium.Map(location=[50, 6], tiles='Stamen Toner', zoom_start=1)

        #traverse geoJSON files in topic order; the list used to be replaced
        #by an empty one here, which left every map without layers
        files = [name for name in os.listdir(folder)
                 if name.endswith(".geojson") and os.path.isfile(os.path.join(folder, name))]
        files.sort(key=topic_order)
        for file in files:
            #the with-block closes the file, no explicit close() needed
            with open(os.path.join(folder, file), encoding="utf-8") as f:
                geojson = json.load(f)

            #one layer per file, named after the file without extension
            feature_group = folium.FeatureGroup(file.split(".")[0])
            for feature in geojson['features']:
                #we get position, colour, transparency from JSON
                #NOTE(review): coordinates are read as (lat, lon) exactly as
                #before; standard GeoJSON stores [lon, lat] - confirm what
                #promoss writes
                lat, lon = feature['geometry']['coordinates']
                if color == 'auto':
                    fillColor = "#"+feature['properties']['fillColor']
                else:
                    fillColor = color
                fillOpacity = feature['properties']['fillOpacity']
                marker = folium.CircleMarker([lat, lon],
                                             fill_color=fillColor,
                                             fill_opacity=fillOpacity,
                                             color="none",
                                             radius=marker_size)
                feature_group.add_child(marker)

            f_map.add_child(feature_group)

        #add layer control to activate/deactivate topics
        folium.LayerControl().add_to(f_map)
        #save map
        out_file = os.path.join(folder, 'topic_map.htm')
        f_map.save(out_file)
        print('created map in: '+out_file)

        #keep a second copy in ./tmp, named after the geojson folder
        if not os.path.exists("tmp"):
            os.makedirs("tmp")
        f_map.save("tmp/"+folder_name+"_map.html")

        display(HTML('<a href="'+out_file+'">Link to map of '+folder_name.replace("_geojson","")+'</a>'))
        
# Create the maps for run folder 10 of `directory`, colours taken from the geoJSON files.
map_from_JSON(directory, runs=10, color="auto", marker_size=10);

In [70]:
# Renders a placeholder anchor as rich HTML cell output (href="#" points nowhere).
HTML('<a href="#">Link to map</a>')


In [16]:
%run 'hmdp.ipynb'

#directory of meta.txt and corpus.txt
directory = "/home/c/work/topicmodels/food_test/";
#the first value in meta.txt are Geographical coordinates
#and we want to detect 1000 clusters
meta_params = "G(1000)";
#10 topics
T = 25;
#we only keep words which appear at least 2 times
MIN_DICT_WORDS = 2;
#run model

hmdp = HMDP(directory, meta_params);
hmdp.T = T;
hmdp.MIN_DICT_WORDS = 2;
hmdp.RUNS = 10;

#hmdp.run();

hmdp.map_from_JSON();

SyntaxError: invalid syntax (<ipython-input-16-6cf3c8e3be14>, line 389)

opening folder /home/c/work/topicmodels/food_test//output_HMDP/10/clusters_0_geojson:
processing topic_0.geojson ...
processing topic_1.geojson ...
processing topic_2.geojson ...
processing topic_3.geojson ...
processing topic_4.geojson ...
processing topic_5.geojson ...
processing topic_6.geojson ...
processing topic_7.geojson ...
processing topic_8.geojson ...
processing topic_9.geojson ...
processing topic_10.geojson ...
processing topic_11.geojson ...
processing topic_12.geojson ...
processing topic_13.geojson ...
processing topic_14.geojson ...
processing topic_15.geojson ...
processing topic_16.geojson ...
processing topic_17.geojson ...
processing topic_18.geojson ...
processing topic_19.geojson ...
processing topic_20.geojson ...
processing topic_21.geojson ...
processing topic_22.geojson ...
processing topic_23.geojson ...
processing topic_24.geojson ...
created map in: /home/c/work/topicmodels/food_test//output_HMDP/10/clusters_0_geojson/topic_map.htm
