In [1]:
#!/usr/bin/env python
import logging
from logging.handlers import TimedRotatingFileHandler
from logging import StreamHandler
import argparse
import traceback
from typing import *

from configuration.configuration import Configuration
from crawler.crawler import AbstractCrawler
from visualizer.visualizer import AbstractVisualizer
from crawler.parliament_members_crawler import ParliamentMembersCrawler
from visualizer.plotly_visualizer import PlotlyVisualizer
from pandas_manager.pandas_manager import PandasManager

logger = logging.getLogger('MAIN')


def __setup_log__(log_path: str, debug=False) -> None:
    from os import sep
    if log_path is None:
        raise Exception('No log path was provided!')

    log_path = log_path.split(sep)
    if len(log_path) > 1:
        from os import makedirs
        try:
            makedirs((sep.join(log_path[:-1])))
        except FileExistsError:
            pass
    log_filename = sep.join(log_path)
    time_rotating_handler = TimedRotatingFileHandler(log_filename, when='midnight', interval=1)
    stream_handler = StreamHandler()
    logging.basicConfig(level=logging.INFO if debug is not True else logging.DEBUG,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S'
                        )
    logger.addHandler(time_rotating_handler)
    logger.addHandler(stream_handler)


def __setup_classes__(config_file: str) -> Tuple[AbstractCrawler, AbstractVisualizer, str]:
    config = Configuration(config_file)
    if config.get_source_type() == 'ParliamentMembersCrawler':
        crawler = ParliamentMembersCrawler(config=config.get_source())
    else:
        raise Exception('Unknown source type!')
    if config.get_target_type() == 'plotly':
        visualizer = PlotlyVisualizer(config=config.get_target())
    else:
        raise Exception('Unknown source type!')

    return crawler, visualizer, config.get_plot_name()


In [2]:
# Setup
config_file = '../confs/greek_members.yml'
log_file = '../logs/greek_members.log'
debug = True

log_fn = log_file
__setup_log__(log_fn, debug)
crawler, visualizer, plot_name = __setup_classes__(config_file)
# Crawl wikipedia and retrieve the requested tables
df_generator = crawler.get_tables()
# Merge the retrieved tables and a create nodes, edges DataFrames
pandas_manager = PandasManager(df_generator=df_generator)
merged_df, plot_cols, name_col = pandas_manager.df_from_generator()
nodes_df = pandas_manager.create_nodes_df(merged_df, plot_cols)
edges_df = pandas_manager.create_edges_df(merged_df, plot_cols, name_col)
print(nodes_df)
print(edges_df)


calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



       Full Name Kommata 2007               Kommata 2015               Kommata 2019
202  Al papariga          Kke  Communist party of greece  Communist party of greece
                               Node  Count
0                New democracy_2007    153
1                        Pasok_2007    102
2                         Laos_2007     10
3                       Syriza_2007     14
4                          Kke_2007     22
5                New democracy_2015     77
6   Pasok-democratic alignment_2015     13
7                       Syriza_2015    149
8           Independent greeks_2015     13
9    Communist party of greece_2015     15
10                   The river_2015     17
11                 Golden dawn_2015     17
12               New democracy_2019    158
13                      Syriza_2019     86
14   Communist party of greece_2019     15
15         Movement for change_2019     22
16                      Mera25_2019      9
17              Greek solution_2019     10
               

In [3]:
import plotly
import seaborn as sns


In [4]:
# add index for source-target pair
nodes_list = nodes_df['Node'].tolist()
nodes_count_list = nodes_df['Count'].tolist()
node_types = {node.split('_')[-1]: [] for node in nodes_list}
color_palette = list(sns.color_palette(None, len(node_types.keys())).as_hex())
for ind, node in enumerate(nodes_list):
    node_types[node.split('_')[-1]].append(ind)
print(node_types)
x_positions = [0 for _ in range(len(nodes_list))]
y_positions = [0 for _ in range(len(nodes_list))]
node_color_list = [0 for _ in range(len(nodes_list))]
x_position = 0.0
for ind_1, key in enumerate(sorted(node_types.keys())):
    y_position = 1.0
    for ind_2, node in sorted(enumerate(node_types[key]), key=lambda row: nodes_count_list[row[1]], reverse=False):
        node_color_list[node] = color_palette[ind_1]
        x_positions[node] = round(x_position, 3)
        y_positions[node] = round(y_position, 3)
        y_position -= 1.0/len(node_types[key])
    x_position += 1.0/len(node_types.keys())
print(len(nodes_list), nodes_list)
print(len(node_color_list), node_color_list)
print(len(x_positions), x_positions)
print(len(y_positions), y_positions)

{'2007': [0, 1, 2, 3, 4], '2015': [5, 6, 7, 8, 9, 10, 11], '2019': [12, 13, 14, 15, 16, 17]}
18 ['New democracy_2007', 'Pasok_2007', 'Laos_2007', 'Syriza_2007', 'Kke_2007', 'New democracy_2015', 'Pasok-democratic alignment_2015', 'Syriza_2015', 'Independent greeks_2015', 'Communist party of greece_2015', 'The river_2015', 'Golden dawn_2015', 'New democracy_2019', 'Syriza_2019', 'Communist party of greece_2019', 'Movement for change_2019', 'Mera25_2019', 'Greek solution_2019']
18 ['#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#ff7f0e', '#ff7f0e', '#ff7f0e', '#ff7f0e', '#ff7f0e', '#ff7f0e', '#ff7f0e', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c']
18 [0.0, 0.0, 0.0, 0.0, 0.0, 0.333, 0.333, 0.333, 0.333, 0.333, 0.333, 0.333, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667]
18 [0.2, 0.4, 1.0, 0.8, 0.6, 0.286, 1.0, 0.143, 0.857, 0.714, 0.571, 0.429, 0.167, 0.333, 0.667, 0.5, 1.0, 0.833]


In [5]:
edges_df['SourceID'] = edges_df['Source'].apply(lambda x: nodes_list.index(x))
edges_df['TargetID'] = edges_df['Target'].apply(lambda x: nodes_list.index(x))
# print(edges_df)
edge_years = set([node.split('_')[-1] for node in nodes_list])
edge_types = dict(zip(sorted(edge_years), color_palette))
print(edge_types)
source_from_edges_list = edges_df['Source'].to_list()
edge_color_list = [edge_types[node.split('_')[-1]] for node in source_from_edges_list]


{'2007': '#1f77b4', '2015': '#ff7f0e', '2019': '#2ca02c'}


In [6]:
# creating the sankey diagram
data = dict(
    type='sankey',
    node=dict(
        hoverinfo="all",
        pad=15,
        thickness=20,
        line=dict(
            color="black",
            width=0.5
        ),
        label=nodes_list,
        color=node_color_list,
        x=x_positions,
        y=y_positions,
        # groups=list(node_types.values())
    ),
    link=dict(
        source=edges_df['SourceID'],
        target=edges_df['TargetID'],
        value=edges_df['Count'],
        label=edges_df['Count'],
        color = edge_color_list
    ),
    arrangement='freeform'
)

layout = dict(
    title='Test Sankey',
    font=dict(
        size=10
    )
)

fig = dict(data=[data], layout=layout)

filename = "sankey"

import chart_studio.plotly as py
py.sign_in('drkostas', 'nMT8XHbUvMWFXOR9LZ7s')
# py.iplot([data], filename=filename)
plotly.offline.plot(fig, validate=True, filename=filename)

2019-12-25 22:03:39 urllib3.connectionpool DEBUG    Starting new HTTPS connection (1): api.plot.ly:443
2019-12-25 22:03:42 urllib3.connectionpool DEBUG    https://api.plot.ly:443 "GET /v2/users/current HTTP/1.1" 200 None

Your filename `sankey` didn't end with .html. Adding .html to the end of your file.



'sankey.html'