# load all the required packages

In [1]:
import logging
import time
import datetime
import pandas as pd
import ast
from plotly.offline import init_notebook_mode
from common.functions import find_curve_elbow_idx_based_on_max_dist, normalize_dict
from socialtree_functions import visualize_trees_and_graphs
from config import config
# set plotly up for inline rendering in the notebook; connected=True loads plotly.js from the online CDN
init_notebook_mode(connected=True)

In [2]:
# NOTE(review): disabled alternative that would raise the root logger's single handler to DEBUG.
# The %config workaround in the following cell appears to be used instead — confirm before deleting.
# _logger = logging.getLogger()
# assert len(_logger.handlers) == 1
# handler = _logger.handlers[0]
# handler.setLevel(logging.DEBUG)

configure the logger to propagate the messages to the notebook using a workaround shown on https://stackoverflow.com/a/50670775/2262424

In [3]:
# workaround via specifying an invalid value first
%config Application.log_level='WORKAROUND'  # this will throw an ERROR, don't worry, it's part of the plan
%config Application.log_level='DEBUG'
# grab the root logger once, keep it for the cells below and let DEBUG records through
_logger = logging.getLogger()
_logger.setLevel(logging.DEBUG)

ERROR:root:The 'log_level' trait of an IPKernelApp instance must be any of (0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL'), but a value of 'WORKAROUND' <class 'str'> was specified.


# retrieve articles

This part is coming soon

# load the retrieved articles and get the relevant articles to be summarized

define the query

In [4]:
# an arbitrary label for the story; in this notebook it also selects the input file
# "../data/<story_id>_ranked_articles.csv", so that file has to exist for the chosen id
story_id = "independence"  # this is just an id... you can use any random string

load (or retrieve) the articles

In [5]:
# the ranked articles of a story live in a CSV file named after the story id
ranked_articles_csv = "../data/%s_ranked_articles.csv" % str(story_id)
tagged_articles_df = pd.read_csv(ranked_articles_csv)
del ranked_articles_csv

In [6]:
# sanity check: (rows, columns) of the loaded articles followed by a preview of the first rows
print(tagged_articles_df.shape)
tagged_articles_df.head()

(3251, 6)


Unnamed: 0,url,t,epoch,hashtag_profile,tag_profile,score
0,http://www.irishtimes.com/news/world/uk/sturge...,2016-05-09T13:50:50.253590+00:00,1462802000.0,{'#scotland': 1.0},"{'#scotland': 0.75, 'holyrood': 0.05, 'nicola_...",0.002277
1,http://www.irishtimes.com/news/ireland/irish-n...,2016-08-14T16:31:23.749013+00:00,1471192000.0,{},"{'cork': 0.05, 'independence_museum': 0.05, 'k...",0.002154
2,http://www.irishtimes.com/news/world/europe/pr...,2015-09-27T21:55:00+00:00,1443391000.0,"{'#catalonia': 0.5196211096075778, '#spain': 0...","{'#catalonia': 0.38971583220568334, '#spain': ...",0.002107
3,http://www.irishtimes.com/news/world/uk/scotti...,2015-11-23T14:06:00+00:00,1448288000.0,{},"{'group': 0.05, 'independence': 0.05, 'scottis...",0.002039
4,http://www.irishtimes.com/news/world/uk/scotti...,2015-11-23T13:54:00+00:00,1448287000.0,{},"{'group': 0.05, 'independence': 0.05, 'scottis...",0.001993


As we don't have an annotated ground truth set of articles relevant to each query, we will filter out the articles with negligibly small relevance to the query and will assume that all the remaining articles are relevant.
***
find the article cutoff location based on an elbow of the cumulative distribution exponent...

In [7]:
# scores in descending order, i.e. the curve whose elbow is looked up
article_scores = tagged_articles_df['score'].tolist()
article_scores.sort(reverse=True)
elbow_idx = find_curve_elbow_idx_based_on_max_dist(article_scores)
# keep the cutoff relative to the top score: score at the elbow divided by the maximum score
article_rel_cutoff = article_scores[elbow_idx] / max(article_scores)
del article_scores, elbow_idx

In [8]:
if tagged_articles_df.shape[0] <= config.MIN_N_ARTICLES_FOR_ELBOW_CUT:
    # too few articles for an elbow cut: keep all of them
    # this makes sense especially when the query is matched only on headlines or pseudoarticles
    summary_articles_df = tagged_articles_df.copy()
else:
    # the threshold is the lower of two scores, so that at least the
    # MIN_N_ARTICLES_FOR_ELBOW_CUT best articles always survive the cut:
    #   1) the score of the article ranked MIN_N_ARTICLES_FOR_ELBOW_CUT-th (descending order)
    #   2) the elbow score, i.e. the relative cutoff scaled back by the maximum score
    article_rel_cutoff_score = min(
        sorted(tagged_articles_df['score'].tolist(), reverse=True)[config.MIN_N_ARTICLES_FOR_ELBOW_CUT - 1],
        article_rel_cutoff * tagged_articles_df['score'].max()
    )
    is_relevant = tagged_articles_df['score'] >= article_rel_cutoff_score
    summary_articles_df = tagged_articles_df[is_relevant].copy()
    del is_relevant

In [9]:
# optional export of the relevant articles; skipped when no filename template is configured
export_filename_template = config.EXPORT_RELEVANT_ARTICLES_TO_CSV_FILENAME
if export_filename_template:
    # the template has a single %s placeholder that receives the story id
    summary_articles_df.to_csv(export_filename_template % str(story_id), header=True, index=False)
del export_filename_template

re-normalizing the article scores after the junk is cut off

In [10]:
# divide every score by the total so that the scores of the kept articles sum up to 1
summary_articles_df['score'] = summary_articles_df['score'] / summary_articles_df['score'].sum()

convert the fields to their correct types

In [11]:
# The profile columns come out of the CSV as strings holding dict literals; turn them into real dicts.
for profile_column in ('hashtag_profile', 'tag_profile'):
    # Only string cells are parsed: missing values stay untouched and profiles that were
    # already converted are left alone, so re-running this cell does not raise
    # (ast.literal_eval rejects anything that is not a string or an AST node).
    is_serialized = summary_articles_df[profile_column].map(
        lambda value: isinstance(value, str)
    ).astype(bool)
    summary_articles_df.loc[is_serialized, profile_column] = \
        summary_articles_df.loc[is_serialized, profile_column].map(ast.literal_eval)

In [12]:
# parse the timestamp strings first, then map every parsed value through to_pydatetime()
parsed_t = pd.to_datetime(summary_articles_df['t'])
summary_articles_df['t'] = parsed_t.map(lambda stamp: stamp.to_pydatetime())
del parsed_t
# the epoch column is read from the CSV as floats; cast it to integers
summary_articles_df['epoch'] = summary_articles_df['epoch'].astype(int)

create supporting fields

In [13]:
# the tags of an article are the keys of its tag profile
summary_articles_df['tags_list'] = summary_articles_df['tag_profile'].map(
    lambda profile: [*profile.keys()]
)

create the missing fields

In [14]:
# the keyword profile is the tag profile without the entries containing "#", passed through normalize_dict()
summary_articles_df['keyword_profile'] = summary_articles_df['tag_profile'].map(
    lambda profile: normalize_dict(
        {tag: weight for tag, weight in profile.items() if "#" not in tag}
    )
)

In [15]:
# the keywords of an article are the keys of its keyword profile
summary_articles_df['keywords_list'] = summary_articles_df['keyword_profile'].map(
    lambda profile: [*profile.keys()]
)

in case there is no <tt>'id'</tt> field in the article dataframe, create one from the dataframe index

In [16]:
# Derive the 'id' column from the dataframe index only when the articles do not carry one yet,
# so that ids which are already present in the data are never overwritten.
if 'id' not in summary_articles_df.columns:
    summary_articles_df['id'] = summary_articles_df.index

In [17]:
# sanity check: (rows, columns) of the relevant articles followed by a preview of the first rows
print(summary_articles_df.shape)
summary_articles_df.head()

(384, 10)


Unnamed: 0,url,t,epoch,hashtag_profile,tag_profile,score,tags_list,keyword_profile,keywords_list,id
0,http://www.irishtimes.com/news/world/uk/sturge...,2016-05-09 13:50:50.253590+00:00,1462801850,{'#scotland': 1.0},"{'#scotland': 0.75, 'holyrood': 0.05, 'nicola_...",0.005394,"[#scotland, holyrood, nicola_sturgeon, parliam...","{'holyrood': 0.2, 'nicola_sturgeon': 0.2, 'par...","[holyrood, nicola_sturgeon, parliament, scotla...",0
1,http://www.irishtimes.com/news/ireland/irish-n...,2016-08-14 16:31:23.749013+00:00,1471192283,{},"{'cork': 0.05, 'independence_museum': 0.05, 'k...",0.005103,"[cork, independence_museum, kilmurry, presiden...","{'cork': 0.2, 'independence_museum': 0.2, 'kil...","[cork, independence_museum, kilmurry, presiden...",1
2,http://www.irishtimes.com/news/world/europe/pr...,2015-09-27 21:55:00+00:00,1443390900,"{'#catalonia': 0.5196211096075778, '#spain': 0...","{'#catalonia': 0.38971583220568334, '#spain': ...",0.004993,"[#catalonia, #spain, catalonia, election, gove...","{'catalonia': 0.2, 'election': 0.2, 'governmen...","[catalonia, election, government, parties, spain]",2
3,http://www.irishtimes.com/news/world/uk/scotti...,2015-11-23 14:06:00+00:00,1448287560,{},"{'group': 0.05, 'independence': 0.05, 'scottis...",0.004831,"[group, independence, scottish, tens, women]","{'group': 0.2, 'independence': 0.2, 'scottish'...","[group, independence, scottish, tens, women]",3
4,http://www.irishtimes.com/news/world/uk/scotti...,2015-11-23 13:54:00+00:00,1448286840,{},"{'group': 0.05, 'independence': 0.05, 'scottis...",0.004722,"[group, independence, scottish, tens, women]","{'group': 0.2, 'independence': 0.2, 'scottish'...","[group, independence, scottish, tens, women]",4


at this stage we have the set of relevant articles and we'll proceed with summary extraction

# extract a SocialTree

In [18]:
# Extract and visualise the SocialTree when it is enabled in the config.
story_info = ""
# `timing_marks` is initialised before the flag check: the KeywordTree cell passes it to
# visualize_trees_and_graphs() too, so defining it only inside the branch below would leave it
# undefined (NameError) whenever CREATE_SOCIALTREE_FLAG is off and the KeywordTree is on.
timing_marks = [('start', time.time())]
if config.CREATE_SOCIALTREE_FLAG:
    socialtree_started_at = time.time()
    # the call returns the tag scores, the updated timing marks and story info,
    # followed by ten objects that are stored in the *_div variables
    tag_score_dict, timing_marks, story_info, tag_recall_div, tag_profile_div, \
    tag_support_div, g_div, t_plain_div, t_days_div, t_tag_profile_div, t_basic_div, t_simple_div, t_div = \
        visualize_trees_and_graphs(
            story_id=story_id,
            # a copy is handed over so that the call cannot modify the notebook's dataframe
            articles_df=summary_articles_df.copy(),
            timing_marks=timing_marks,
            story_info=story_info,
            tag_profile_field_name=config.SOCIALTREE_TAG_PROFILE_FIELD,
            tag_blacklist=config.HASHTAG_BLACKLIST + config.KEYWORD_BLACKLIST,
            redundancy_conf_thres=config.TAG_REDUNDANCY_CONF_THRES,
            lexical_redundancy_prefix=config.TAG_LEXICAL_REDUNDANCY_PREFIX,
            lexical_redundancy_suffix=config.TAG_LEXICAL_REDUNDANCY_SUFFIX,
            min_profile_size=config.MIN_TAG_PROFILE_SIZE,
            plot_stats_flag=config.PLOT_TAG_PROFILE_STATS_FLAG,
            tag_list_field_name=config.SOCIALTREE_TAG_LIST_FIELD,
            tag_field_blacklist=config.HASHTAG_BLACKLIST + config.KEYWORD_BLACKLIST,
            visualize_tree_flag=config.CREATE_SOCIALTREE_FLAG,
            importance_weighting_mode=config.TAG_IMPORTANCE_WEIGHTING_MODE,
            story_distance_resolution=config.TAG_SUBSTORY_DISTANCE_RESOLUTION,
            pattern_type=config.TAG_PATTERN_TYPE,
            min_pattern_support=config.MIN_TAG_PATTERN_SUPPORT,
            visualize_intermediate_trees_flag=config.VISUALIZE_INTERMEDIATE_TREES_FLAG
        )
    _logger.debug("SocialTree extraction took %.4f seconds" % (time.time() - socialtree_started_at))
    del socialtree_started_at
    # log the (tag, score) pairs with the highest scores first
    _logger.info(
        "--- the tag profile of the story is %s ---" % sorted(
            [(h, round(s, 4)) for h, s in tag_score_dict.items()],
            key=lambda x: x[1], reverse=True
        )
    )
    _logger.info("#" * 29 + " CREATED A SOCIALTREE " + "#" * 29 + "\n")
else:
    # SocialTree disabled: define the output names as empty strings so they still exist
    tag_recall_div, tag_profile_div = "", ""
    tag_support_div, g_div, t_plain_div, t_days_div, t_tag_profile_div, t_basic_div, t_simple_div, t_div = \
        "", "", "", "", "", "", "", ""

INFO:socialtree_functions:-------------------------- EXTRACTING THE TAG PROFILE --------------------------
INFO:profile_functions:started replacing the lexiacally redundant tags
INFO:profile_functions:--- started the story tag profile extraction ---
INFO:profile_functions:initially there were 1007 tags selected for story's profile tag_profile
INFO:common.pattern_functions:read the rules from 'patterns/story_patterns_independence_tag_profile_rules.out' and produced a df of shape (3685, 11)
INFO:common.pattern_functions:found 228 redundant sets of items with 100% conditional probability of co-occurrence on both directions,
as a result, 475 items will be removed from the item profile and later will be added to their co-occurring items
INFO:profile_functions:after the redundancy removal, 532 tags compose story's profile tag_profile
INFO:profile_functions:the cuttoff point should be ('#hong', 0.006813480696078762) based on max dist
INFO:profile_functions:--- finished the story tag_profile p

DEBUG:common.graph_functions:the nodes with the highest (not mutually exclusive) cumulative importance overlaps weighted by 'bet_cent' are [('#independence', -1.7434433461054457), ('#eu', -0.48006094790369547), ('#uk', -0.28716729012962955), ('#spain$', -0.229758581750066), ('#scotland$', -0.21733956256669212)]
INFO:common.graph_functions:--- extracted the time-aware_query-aware MST in 0.074 seconds ---


INFO:common.graph_functions:--- plotted the time-aware_query-aware MST in 2.467 seconds (5.3 seconds since start) ---
INFO:socialtree_functions:------------------------------ CREATED A TAG TREE ------------------------------
DEBUG:root:SocialTree extraction took 5.3483 seconds
INFO:root:--- the tag profile of the story is [('#spain$', 0.0917), ('#catalonia$', 0.0786), ('#brexit', 0.0733), ('#scotland$', 0.0519), ('#catalan$', 0.041), ('#indyref2', 0.0374), ('#hongkong', 0.0226), ('#independence', 0.0202), ('#china$', 0.0191), ('#barcelona', 0.012), ('#uk', 0.0118), ('#eu', 0.0105), ('#eritrea', 0.0105), ('#taiwan', 0.01), ('#snp$', 0.0097), ('#bangladesh', 0.0084), ('#ireland$', 0.0078), ('nicola_sturgeon', 0.0073), ('#sports', 0.007), ('#hong', 0.0068)] ---
INFO:root:############################# CREATED A SOCIALTREE #############################



# extract a KeywordTree

In [19]:
# Extract and visualise the KeywordTree when it is enabled in the config.
# `timing_marks` and `story_info` are read from the notebook state left by the SocialTree cell
# and are re-assigned from the values returned by the call.
if config.CREATE_KEYWORDTREE_FLAG:
    _logger.info("#" * 28 + " CREATING A KEYWORDTREE " + "#" * 28)
    keywordtree_started_at = time.time()
    keyword_score_dict, timing_marks, story_info, keyword_recall_div, keyword_profile_div, \
    k_support_div, g_div_k, t_plain_div_k, t_days_div_k, t_tag_profile_div_k, t_basic_div_k, t_simple_div_k, t_div_k = \
        visualize_trees_and_graphs(
            story_id=story_id,
            # a copy is handed over so that the call cannot modify the notebook's dataframe
            articles_df=summary_articles_df.copy(),
            timing_marks=timing_marks,
            story_info=story_info,
            tag_profile_field_name=config.KEYWORDTREE_TAG_PROFILE_FIELD,
            tag_blacklist=config.KEYWORD_BLACKLIST,
            redundancy_conf_thres=config.KEYWORD_REDUNDANCY_CONF_THRES,
            lexical_redundancy_prefix=config.KEYWORD_LEXICAL_REDUNDANCY_PREFIX,
            lexical_redundancy_suffix=config.KEYWORD_LEXICAL_REDUNDANCY_SUFFIX,
            min_profile_size=config.MIN_KEYWORD_PROFILE_SIZE,
            plot_stats_flag=config.PLOT_KEYWORD_PROFILE_STATS_FLAG,
            tag_list_field_name=config.KEYWORDTREE_TAG_LIST_FIELD,
            tag_field_blacklist=config.KEYWORD_BLACKLIST,
            visualize_tree_flag=config.CREATE_KEYWORDTREE_FLAG,
            importance_weighting_mode=config.KEYWORD_IMPORTANCE_WEIGHTING_MODE,
            story_distance_resolution=config.KEYWORD_SUBSTORY_DISTANCE_RESOLUTION,
            pattern_type=config.KEYWORD_PATTERN_TYPE,
            min_pattern_support=config.MIN_KEYWORD_PATTERN_SUPPORT,
            visualize_intermediate_trees_flag=config.VISUALIZE_INTERMEDIATE_TREES_FLAG
        )
    _logger.debug("KeywordTree extraction took %.4f seconds" % (time.time() - keywordtree_started_at))
    del keywordtree_started_at
    _logger.info("#" * 28 + " CREATED A KEYWORDTREE " + "#" * 29 + "\n")
else:
    # KeywordTree disabled: define the output names as empty strings so they still exist,
    # in the same way the SocialTree cell does for its own outputs
    keyword_recall_div, keyword_profile_div = "", ""
    k_support_div, g_div_k, t_plain_div_k, t_days_div_k, t_tag_profile_div_k, t_basic_div_k, t_simple_div_k, t_div_k = \
        "", "", "", "", "", "", "", ""

INFO:root:############################ CREATING A KEYWORDTREE ############################
INFO:socialtree_functions:-------------------------- EXTRACTING THE TAG PROFILE --------------------------
INFO:profile_functions:started replacing the lexiacally redundant tags
INFO:profile_functions:--- started the story tag profile extraction ---
INFO:profile_functions:initially there were 939 tags selected for story's profile keyword_profile
INFO:common.pattern_functions:read the rules from 'patterns/story_patterns_independence_keyword_profile_rules.out' and produced a df of shape (2996, 11)
INFO:common.pattern_functions:found 220 redundant sets of items with 100% conditional probability of co-occurrence on both directions,
as a result, 426 items will be removed from the item profile and later will be added to their co-occurring items
INFO:profile_functions:after the redundancy removal, 513 tags compose story's profile keyword_profile
INFO:profile_functions:the cuttoff point should be ('elect

DEBUG:common.graph_functions:the nodes with the highest (not mutually exclusive) cumulative importance overlaps weighted by 'bet_cent' are [('catalan$', -7.497104450452596), ('catalonia$', -7.4294857980441495), ('spain$', -3.646467915396455), ('brexit', -2.1682346573955815), ('independence', -1.4519101916245192)]
INFO:common.graph_functions:--- extracted the time-aware_query-aware MST in 0.092 seconds ---


INFO:common.graph_functions:--- plotted the time-aware_query-aware MST in 4.260 seconds (12.8 seconds since start) ---
INFO:socialtree_functions:------------------------------ CREATED A TAG TREE ------------------------------
DEBUG:root:KeywordTree extraction took 7.4778 seconds
INFO:root:############################ CREATED A KEYWORDTREE #############################



***
<br>
<br>
<center><font size="28pt" color="red">the rest of the code requires crawled articles</font></center>
<br>
<center><font size="28pt">The crawling script will be uploaded in later releases of the code</font></center>
<br>
<br>

# extract a TFIDFTree

# extract a WordCloud

# extract a KeyGraph

# extract a MetroMap