This notebook lets you explore the results of predicting the `click_rate` from `source_article` to `target_article` using different models (Doc2Vec, Wikipedia2Vec, Smash-RNN Paragraph Level, Smash-RNN Sentence Level and Smash-RNN Word Level).

The class `ResultsAnalyzer` encapsulates the logic to compute the results. Main features:
- `get_ndcg_for_all_models`: Calculates the Normalized Discounted Cumulative Gain for each model
- `get_map_for_all_models`: Calculates the Mean Average Precision for each model
- `get_top_5_predicted_by_article_and_model(source_article, model)`: Gets the top 5 predictions for the `source_article`. The column `is_in_top_5` shows if the `target_article` is in the **actual** top 5 click rate.
- `ResultsAnalyzer.results`: A pandas DataFrame containing the consolidated results
- `get_sample_source_articles`: Samples 10 random `source_articles`. Can be used to manually check the results

In [1]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns

from results_analyzer import ResultsAnalyzer

# Render every float shown by pandas with a thousands separator and four decimals.
pd.options.display.float_format = '{:,.4f}'.format
# Shared analyzer instance used by the cells below to compute and look up results.
results_analyzer = ResultsAnalyzer()

In /Users/dnascimentodepau/anaconda3/envs/thesis-davi/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/dnascimentodepau/anaconda3/envs/thesis-davi/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/dnascimentodepau/anaconda3/envs/thesis-davi/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /Users/dnascimentodepau/anaconda3/envs/thesis-davi/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotli

In [2]:
# --- Feature columns and the names of their derived bin columns --------------
WORD_COUNT_COLUMN = "word_count"
WORD_COUNT_BIN = "word_count_bin"
OUT_LINKS_COLUMN = "out_links_count"
OUT_LINKS_BIN = "out_links_bin"
IN_LINKS_COLUMN = "in_links_count"
IN_LINKS_BIN = "in_links_bin"
PARAGRAPH_COUNT_COLUMN = "paragraph_count"
PARAGRAPH_COUNT_BIN = "paragraph_count_bin"
SENTENCE_COUNT_COLUMN = "sentence_count"
SENTENCE_COUNT_BIN = "sentence_count_bin"
MISSING_WORDS_COLUMN = "missing_words_percentage"
MISSING_WORDS_BIN = "missing_words_percentage_bin"
MODEL_COLUMN = "model"

ALL_FEATURES = [WORD_COUNT_COLUMN, OUT_LINKS_COLUMN, IN_LINKS_COLUMN]

# --- Model identifiers --------------------------------------------------------
DOC2VEC_SIAMESE = "doc2vec_siamese"
DOC2VEC_COSINE = "doc2vec_cosine"
WIKIPEDIA2VEC_SIAMESE = "wikipedia2vec_siamese"
WIKIPEDIA2VEC_COSINE = "wikipedia2vec_cosine"
SMASH_WORD_LEVEL = "smash_word_level"
SMASH_SENTENCE_LEVEL = "smash_sentence_level"
SMASH_PARAGRAPH_LEVEL = "smash_paragraph_level"
SMASH_WORD_LEVEL_INTRODUCTION = "smash_word_level_introduction"
SMASH_SENTENCE_LEVEL_INTRODUCTION = "smash_sentence_level_introduction"
SMASH_PARAGRAPH_LEVEL_INTRODUCTION = "smash_paragraph_level_introduction"

# --- Model groups -------------------------------------------------------------
# The two SMASH families are the building blocks; the composite groups below are
# derived from them so that the ordering stays consistent everywhere.
SMASH_MODELS = [
    SMASH_WORD_LEVEL,
    SMASH_SENTENCE_LEVEL,
    SMASH_PARAGRAPH_LEVEL,
]

INTRODUCTION_MODELS = [
    SMASH_WORD_LEVEL_INTRODUCTION,
    SMASH_SENTENCE_LEVEL_INTRODUCTION,
    SMASH_PARAGRAPH_LEVEL_INTRODUCTION,
]

SMASH_AND_INTRODUCTION_MODELS = SMASH_MODELS + INTRODUCTION_MODELS

ALL_MODELS = [
    DOC2VEC_SIAMESE,
    DOC2VEC_COSINE,
    WIKIPEDIA2VEC_SIAMESE,
    WIKIPEDIA2VEC_COSINE,
] + SMASH_MODELS + INTRODUCTION_MODELS

COMPLETE_MODELS = [DOC2VEC_SIAMESE, WIKIPEDIA2VEC_SIAMESE] + SMASH_MODELS

# Siamese and cosine variants of the two embedding baselines, listed together.
COSINE_MODELS = [
    DOC2VEC_SIAMESE,
    WIKIPEDIA2VEC_SIAMESE,
    DOC2VEC_COSINE,
    WIKIPEDIA2VEC_COSINE,
]

BEST_MODELS = [
    DOC2VEC_SIAMESE,
    SMASH_WORD_LEVEL,
    WIKIPEDIA2VEC_SIAMESE,
]

# --- Figure configuration: (feature column, x-axis label template) -----------
# Each label contains a single "%s" placeholder for the number of buckets.
COMPLETE_MODELS_SAVE_CONFIG = [
    (PARAGRAPH_COUNT_COLUMN, "Source article length as paragraph count (%s equal-sized buckets)"),
    (SENTENCE_COUNT_COLUMN, "Source article length as sentence count (%s equal-sized buckets)"),
    (WORD_COUNT_COLUMN, "Source article length as word count (%s equal-sized buckets)"),
    (OUT_LINKS_COLUMN, "Number of links present in the source articles (%s equal-sized buckets)"),
    (IN_LINKS_COLUMN, "Number of articles with links pointing to the source articles (%s equal-sized buckets)"),
    (MISSING_WORDS_COLUMN, "Percentage of missing words in GloVe (%s equal-sized buckets)"),
]

# --- Human-readable model names used in tables and legends -------------------
CLEAN_MODEL_NAMES = {
    DOC2VEC_SIAMESE: "Doc2Vec",
    DOC2VEC_COSINE: "Doc2Vec Cosine",
    WIKIPEDIA2VEC_SIAMESE: "Wikipedia2Vec",
    WIKIPEDIA2VEC_COSINE: "Wikipedia2Vec Cosine",
    SMASH_WORD_LEVEL: "SMASH RNN (P + S + W)",
    SMASH_SENTENCE_LEVEL: "SMASH RNN (P + S)",
    SMASH_PARAGRAPH_LEVEL: "SMASH RNN (P)",
    SMASH_WORD_LEVEL_INTRODUCTION: "SMASH RNN Introduction (P + S + W)",
    SMASH_SENTENCE_LEVEL_INTRODUCTION: "SMASH RNN Introduction (P + S)",
    SMASH_PARAGRAPH_LEVEL_INTRODUCTION: "SMASH RNN Introduction (P)",
}


Getting NDCG for all models:

In [3]:
# Per-article statistics. Later cells read `source_article`, one score column per
# model (e.g. "smash_word_level") and feature columns such as
# "missing_words_percentage" from this frame.
_results = results_analyzer.calculate_statistics_per_article()

[2020-11-13 17:27:50,458] [INFO] Getting features from DB (calculate_statistics_per_article@results_analyzer.py:407)
[2020-11-13 17:28:02,456] [INFO] Getting predictions by model (calculate_statistics_per_article@results_analyzer.py:426)
[2020-11-13 17:28:02,466] [INFO] Aggregating predictions for each model (get_predictions_by_model@results_analyzer.py:246)
100%|██████████| 10/10 [00:07<00:00,  1.39it/s]
[2020-11-13 17:28:09,687] [INFO] Calculating results by model (calculate_statistics_per_article@results_analyzer.py:435)
100%|██████████| 474/474 [00:04<00:00, 115.30it/s]
100%|██████████| 474/474 [00:04<00:00, 115.33it/s]
100%|██████████| 474/474 [00:03<00:00, 119.98it/s]
100%|██████████| 474/474 [00:03<00:00, 121.29it/s]
100%|██████████| 474/474 [00:03<00:00, 119.25it/s]
100%|██████████| 474/474 [00:03<00:00, 119.17it/s]
100%|██████████| 474/474 [00:03<00:00, 121.22it/s]
100%|██████████| 474/474 [00:03<00:00, 122.80it/s]
100%|██████████| 474/474 [00:04<00:00, 118.15it/s]
100%|██████

In [None]:
# Per-model statistics for several cut-offs k; the next cell pivots this frame
# on its "model" and "k" columns.
results_ks = results_analyzer.calculate_statistics_per_model_different_k(ks=[1, 3, 5, 10])

In [None]:
# Table with one row per model in COMPLETE_MODELS and one column per k, showing "ndcg".
# The commented-out line below prints the same table for "map" as LaTeX.
# print(results_ks[results_ks["model"].isin(COMPLETE_MODELS)].pivot(index="model", columns="k", values="map").reset_index().to_latex(index=False))
results_ks[results_ks["model"].isin(COMPLETE_MODELS)].pivot(index="model", columns="k", values="ndcg")

In [3]:
# Aggregated statistics with one row per model; consumed by `get_clean_results` below.
results_per_model = results_analyzer.calculate_statistics_per_model()

[2020-11-09 18:51:33,497] [INFO] Aggregating predictions for each model (get_predictions_by_model@results_analyzer.py:246)
100%|██████████| 10/10 [00:07<00:00,  1.36it/s]
[2020-11-09 18:51:40,862] [INFO] Calculating results by model (calculate_statistics_per_model@results_analyzer.py:335)
100%|██████████| 474/474 [00:04<00:00, 115.57it/s]
100%|██████████| 474/474 [00:04<00:00, 98.44it/s] 
100%|██████████| 474/474 [00:04<00:00, 109.47it/s]
100%|██████████| 474/474 [00:04<00:00, 99.06it/s] 
100%|██████████| 474/474 [00:04<00:00, 109.19it/s]
100%|██████████| 474/474 [00:04<00:00, 110.26it/s]
100%|██████████| 474/474 [00:04<00:00, 109.58it/s]
100%|██████████| 474/474 [00:04<00:00, 106.17it/s]
100%|██████████| 474/474 [00:04<00:00, 115.83it/s]
100%|██████████| 474/474 [00:04<00:00, 109.66it/s]


In [15]:
def get_clean_results(model_results, selected_models = COMPLETE_MODELS):
    """Return the rows of ``model_results`` for ``selected_models`` with display headers.

    Args:
        model_results: DataFrame with a ``"model"`` column.
        selected_models: Model identifiers to keep; defaults to ``COMPLETE_MODELS``.

    Returns:
        A copy of the matching rows whose columns are relabelled, by position, to
        ``Model``, ``NDCG@5``, ``MAP@5`` and ``Precision@5``. The input frame is
        left untouched.

    Note:
        The relabelling is positional, so ``model_results`` must have exactly
        four columns in that order.
    """
    is_selected = model_results["model"].isin(selected_models)
    clean_results = model_results.loc[is_selected].copy()

    clean_results.columns = ["Model", "NDCG@5", "MAP@5", "Precision@5"]

    return clean_results

# Compare the full-text SMASH models against their introduction-only variants.
res = get_clean_results(results_per_model, SMASH_AND_INTRODUCTION_MODELS)

print(res)

                                Model  NDCG@5  MAP@5  Precision@5
2               smash_paragraph_level  0.4895 0.6305       0.4658
3  smash_paragraph_level_introduction  0.4724 0.6020       0.4570
4                smash_sentence_level  0.4769 0.6270       0.4492
5   smash_sentence_level_introduction  0.4728 0.6055       0.4570
6                    smash_word_level  0.4972 0.6416       0.4700
7       smash_word_level_introduction  0.4818 0.6217       0.4593


In [80]:
# Random sample of source articles whose "smash_word_level" value is at most 0.2,
# shown next to the "wikipedia2vec_siamese" value for the same article.
# NOTE(review): `sample` is unseeded, so the rows differ on every run.
_results[_results["smash_word_level"] <= 0.2][["source_article", "wikipedia2vec_siamese", "smash_word_level"]].sample(n=30)

# Princess Victoria Louise of Prussia

Unnamed: 0,source_article,wikipedia2vec_siamese,smash_word_level
177,History of Japan,0.2318,0.0734
152,Gary Oldman,0.2201,0.1799
271,List of years in home video,0.2394,0.0663
247,List of The Vampire Diaries characters,0.1389,0.1478
132,Eton College,0.1635,0.1389
333,Pol Pot,0.3786,0.137
461,Warship,0.4011,0.1546
464,West Bromwich Albion F.C.,0.3024,0.0694
206,Jeremy Clarkson,0.4441,0.11
292,Michael Biehn,0.4074,0.1682


In [69]:
# Raw (source, target) predictions of the smash_word_level model on the test set.
# NOTE(review): the name `a` is reused for an unrelated frame further down.
a = pd.read_csv("./results/test/results_smash_word_level.csv")
a.sample(n=10)

Unnamed: 0,model,source_article,target_article,actual_click_rate,predicted_click_rate
17310,smash_word_level,Ruby Ridge,Rainbow Farm,0.0096,0.0189
3112,smash_word_level,Ghost Adventures,Echo Bridge Home Entertainment,0.0,0.0288
2014,smash_word_level,2020 coronavirus pandemic in the United States,Ebola virus disease,0.0003,0.0085
2294,smash_word_level,First Mexican Empire,History of Belize (1506–1862),0.0183,0.0079
4115,smash_word_level,The Spinners (American R&B group),Working My Way Back to You,0.0228,0.0111
2680,smash_word_level,Penicillin,University of Oxford,0.0,-0.0006
14869,smash_word_level,Tony Blair,Prime Minister's Questions,0.0,0.005
10648,smash_word_level,List of cities in India by population,Pune,0.0146,0.0155
13891,smash_word_level,The Maze Runner (film),Chris Sheffield,0.0072,0.0207
5102,smash_word_level,Anne Rice,Anne Rice bibliography,0.0222,0.0187


In [78]:
# The ten target articles with the highest predicted click rate for the source article "WWE".
a[a["source_article"] == "WWE"].nlargest(10, 'predicted_click_rate')

Unnamed: 0,model,source_article,target_article,actual_click_rate,predicted_click_rate
17147,smash_word_level,WWE,Capitol Wrestling Corporation,0.0088,0.0278
2833,smash_word_level,WWE,Glossary of professional wrestling terms,0.0012,0.0261
12381,smash_word_level,WWE,All Elite Wrestling,0.0021,0.0246
15723,smash_word_level,WWE,World Championship Wrestling,0.0029,0.0199
3497,smash_word_level,WWE,Under Armour,0.0,0.019
3636,smash_word_level,WWE,WWE Studios,0.0075,0.0168
15014,smash_word_level,WWE,National Wrestling Alliance,0.0014,0.0167
4474,smash_word_level,WWE,Professional wrestling promotion,0.0026,0.0161
675,smash_word_level,WWE,Syfy,0.0,0.016
13021,smash_word_level,WWE,Tapout (clothing brand),0.0084,0.0156


In [73]:
# Article dataset; the cell below reads its "article" and "raw_text" columns.
t = pd.read_csv("./data/dataset/wiki_articles_english_complete_bkp_2.csv")


Unnamed: 0,article,text_ids,raw_text,text_ids_intro
0,Anarchism,"[[[209, 926, 683], [580, 92, 802]], [[69, 682,...","[[['political', 'movement', 'form'], ['call', ...","[[[209, 926, 683], [580, 92, 802]], [[69, 682,..."
1,Autism,"[[[528]], [[659, 62, 964, 964, 659]], [[488, 4...","[[['visit']], [['social', 'year', 'child', 'ch...","[[[528]], [[659, 62, 964, 964, 659]], [[488, 4..."
2,Alabama,"[[[92, 368, 104, 112], [718, 193, 322, 950, 85...","[[['state', 'region', 'united', 'states'], ['b...","[[[92, 368, 104, 112], [718, 193, 322, 950, 85..."
3,Achilles,"[[[136, 353, 323], [630, 691]], [[136, 587], [...","[[['war', 'great', 'central'], ['son', 'king']...","[[[136, 353, 323], [630, 691]], [[136, 587], [..."
4,Abraham Lincoln,"[[[617, 396, 140, 90, 104, 112], [410, 513, 35...","[[['february', 'april', 'american', 'president...","[[[617, 396, 140, 90, 104, 112], [410, 513, 35..."
5,Aristotle,"[[[566], [164], [580, 629, 556], [488, 403, 78...","[[['period'], ['school'], ['call', 'father', '...","[[[566], [164], [580, 629, 556], [488, 403, 78..."
6,Academy Award for Best Production Design,"[[[254, 618, 760, 319], [929, 254, 760, 511, 5...","[[['best', 'production', 'art', 'film'], ['ori...","[[[254, 618, 760, 319], [929, 254, 760, 511, 5..."
7,Academy Awards,"[[[346, 319, 459], [454, 146, 538], [580], [76...","[[['know', 'film', 'industry'], ['given', 'int...","[[[346, 319, 459], [454, 146, 538], [580], [76..."
8,International Atomic Time,"[[[146, 79, 348, 152, 79, 815, 79, 79], [79, 7...","[[['international', 'time', 'french', 'high', ...","[[[146, 79, 348, 152, 79, 815, 79, 79], [79, 7..."
9,Altruism,"[[[473, 134, 712, 214], [305]], [[348, 348, 89...","[[['human', 'being', 'result', 'life'], ['case...","[[[473, 134, 712, 214], [305]], [[348, 348, 89..."


In [77]:
# Inspect the stored "raw_text" value of a single article.
print(t[t["article"] == "List of European islands by area"]["raw_text"].tolist())

["[[['list', 'island', 'europe', 'order', 'area']], [['data', 'island', 'island', 'russia']]]"]


In [83]:
# LaTeX table with the "actual click rate" and "smash_word_level" columns of the
# results for one source article.
print(results_analyzer.get_article_results("24 Hours of Le Mans")[["actual click rate", "smash_word_level"]].to_latex())

\begin{tabular}{lll}
\toprule
{} &                    actual click rate &               smash\_word\_level \\
\midrule
0 &  List of 24 Hours of Le Mans winners &            24 Hours of Daytona \\
1 &       Tom Kristensen (racing driver) &       1955 24 Hours of Le Mans \\
2 &           Triple Crown of Motorsport &       1923 24 Hours of Le Mans \\
3 &                 Circuit de la Sarthe &                   Grand tourer \\
4 &             2019 24 Hours of Le Mans &             24 Hours of LeMons \\
5 &                     Indianapolis 500 &       2019 24 Hours of Le Mans \\
6 &                              Le Mans &       2010 24 Hours of Le Mans \\
7 &                            Ford GT40 &               Indianapolis 500 \\
8 &                         Joest Racing &           24 Hours Nürburgring \\
9 &                              Porsche &  Endurance racing (motorsport) \\
\bottomrule
\end{tabular}



In [None]:
def get_latex(models):
    """Print a LaTeX table with the column mean of ``_results`` for each model.

    Args:
        models: Model identifiers; each must be a column of the global ``_results``
            frame. Identifiers found in ``CLEAN_MODEL_NAMES`` are shown under their
            display name.
    """
    display_names = {
        model: name for model, name in CLEAN_MODEL_NAMES.items() if model in models
    }
    mean_by_model = _results[models].mean()
    mean_by_model = mean_by_model.rename(index=display_names)

    print(mean_by_model.to_latex())
    
# LaTeX table of the mean values for the three full-text SMASH models.
get_latex(SMASH_MODELS)

In [12]:
# Hatch patterns used to tell the bars of the different model families apart.
SMASH_HATCH = '//'
DOC2VEC_HATCH = 'X'
WIKIPEDIA2VEC_HATCH = '.'

# Bar styling per model identifier; keys must match the plotted column names.
system_styles = {
    'doc2vec_siamese': {'color': '#b2abd2', 'hatch': DOC2VEC_HATCH},
    'wikipedia2vec_siamese': {'color': '#e66101', 'hatch': WIKIPEDIA2VEC_HATCH},
    'smash_paragraph_level': {'color': '#abdda4'},
    'smash_sentence_level': {'color': '#fdae61'},
    'smash_word_level': {'color': '#2b83ba'},
}

SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 14

# Global matplotlib settings, applied in the listed order.
for rc_group, rc_settings in (
    ('font', {'size': SMALL_SIZE}),                # default text size
    ('axes', {'titlesize': SMALL_SIZE}),           # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),          # x and y labels
    ('xtick', {'labelsize': SMALL_SIZE + 1}),      # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE + 1}),      # y tick labels
    ('legend', {'fontsize': SMALL_SIZE + 1}),      # legend entries
    ('figure', {'titlesize': BIGGER_SIZE}),        # figure title
    ('pdf', {'fonttype': 42}),
    ('ps', {'fonttype': 42}),
    ('text', {'usetex': False}),
    ('font', {'family': 'serif'}),
):
    plt.rc(rc_group, **rc_settings)

def get_performance_figure(
    results,
    models,
    feature_column,
    x_label,
    y_label=None,
    figsize=(13, 6),
    legend_columns_count=3,
    buckets_count=5,
    save_file_name=None,
):
    """Print the mean of each model column per quantile bucket of ``feature_column``.

    Despite its name this function currently only prints a table; the plotting
    counterpart in this notebook is ``get_performance_figure_multiple_rows``.

    Args:
        results: DataFrame holding ``feature_column`` and one numeric column per
            entry of ``models``.
        models: Names of the columns to average.
        feature_column: Numeric column that is split into equal-sized buckets.
        buckets_count: Number of quantile buckets passed to ``pd.qcut``.
        x_label, y_label, figsize, legend_columns_count, save_file_name:
            Accepted only so existing calls keep working; they are not used.

    Returns:
        None. The bucket table is written to standard output.

    Side effects:
        Adds (or overwrites) the column ``f"{feature_column}_bin"`` on ``results``.
    """
    bin_column = f"{feature_column}_bin"
    bins = pd.qcut(results[feature_column], q=buckets_count)

    # Kept for backward compatibility: the bucket of every row is stored on the
    # caller's frame.
    results[bin_column] = bins

    # Select the model columns BEFORE aggregating. Averaging the whole frame
    # fails on recent pandas versions as soon as it contains text columns or
    # the categorical bin columns written by earlier calls.
    # `observed=False` keeps one row per bucket regardless of the pandas default.
    result_by_model = results.groupby(bin_column, observed=False)[models].mean()
    print(result_by_model)

In [21]:
# Bucket the per-article results of the SMASH models by one feature at a time.
# Only the sentence-count variant is active; the other calls are kept commented
# out as ready-made alternatives.
selected_models = SMASH_MODELS
n_buckets = 8

# get_performance_figure(_results, selected_models, WORD_COUNT_COLUMN, "Text length as word count (%s equal-sized buckets)", buckets_count=n_buckets, save_file_name="best_models_word_count")
get_performance_figure(_results, selected_models, SENTENCE_COUNT_COLUMN, "Text length as sentence count (%s equal-sized buckets)", buckets_count=n_buckets, save_file_name="smash_sentence_count")
# get_performance_figure(_results, selected_models, PARAGRAPH_COUNT_COLUMN, "Text length as paragraph count (%s equal-sized buckets)", buckets_count=n_buckets, save_file_name="smash_paragraph_count")
# get_performance_figure(_results, 
#                        selected_models, 
#                        IN_LINKS_COLUMN, 
#                        "Number of articles with links pointing to the source article (%s equal-sized buckets)",
#                        buckets_count=n_buckets, 
#                        save_file_name="smash_in_links")


# results

                    smash_word_level  smash_sentence_level  \
sentence_count_bin                                           
(0.999, 14.0]                 0.5179                0.5083   
(14.0, 32.25]                 0.5440                0.5302   
(32.25, 51.0]                 0.5491                0.5382   
(51.0, 71.0]                  0.5366                0.5042   
(71.0, 101.625]               0.5023                0.4727   
(101.625, 149.0]              0.5055                0.4585   
(149.0, 256.0]                0.4495                0.4195   
(256.0, 1189.0]               0.3709                0.3819   

                    smash_paragraph_level  
sentence_count_bin                         
(0.999, 14.0]                      0.5104  
(14.0, 32.25]                      0.5217  
(32.25, 51.0]                      0.5592  
(51.0, 71.0]                       0.5111  
(71.0, 101.625]                    0.5060  
(101.625, 149.0]                   0.4785  
(149.0, 256.0]             

In [16]:
# Relative increase of 0.4746 over 0.3516.
# NOTE(review): both numbers are typed in by hand; their origin is not visible in
# this notebook — confirm which models/buckets they refer to.
(.4746/.3516) - 1

0.34982935153583616

In [68]:
# Look up the "smash_word_level" value of a single source article.
_results[_results["source_article"] == "Ireland"][["source_article", "smash_word_level"]]

Unnamed: 0,source_article,smash_word_level
191,Ireland,0.247


In [9]:
from scipy import stats
import random

# Paired (related-samples) t-test between the per-article values of the
# word-level and paragraph-level SMASH models; both columns come from the same
# rows of `_results`.
ttest, pval = stats.ttest_rel(_results["smash_word_level"], _results["smash_paragraph_level"])

# True when the difference is significant at the 1% level.
print(pval <= 0.01)


False


In [None]:
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    Example:
        grouper('ABCDEFG', 3, 'x') --> [('A', 'B', 'C'), ('D', 'E', 'F'), ('G', 'x', 'x')]

    Args:
        iterable: Items to split into chunks.
        n: Length of every chunk.
        fillvalue: Value used to pad the last chunk when the number of items is
            not a multiple of ``n``.

    Returns:
        A list of ``n``-tuples.
    """
    # The SAME iterator repeated n times: zip_longest draws one item per position,
    # so every tuple holds n consecutive items.
    args = [iter(iterable)] * n
    # Forward `fillvalue`; it used to be accepted but silently ignored.
    return list(zip_longest(*args, fillvalue=fillvalue))


In [None]:
def get_performance_figure_multiple_rows(
    results,
    models,
    feature_column,
    x_label,
    y_label=None,
    figsize=(15, 13),
    legend_columns_count=3,
    buckets_count=9,
    charts_per_row=3,
    save_file_name=None,
):
    """Plot the mean of each model column per quantile bucket, stacked in several rows.

    ``feature_column`` is split into ``buckets_count`` equal-sized buckets with
    ``pd.qcut``. The buckets are distributed over vertically stacked bar charts
    holding ``charts_per_row`` buckets each; all charts share the same upper
    y-limit and a single figure-level legend.

    Args:
        results: DataFrame holding ``feature_column`` and one numeric column per
            entry of ``models``.
        models: Names of the columns to average and plot.
        feature_column: Numeric column that is bucketed.
        x_label: Label of the bottom chart; must contain one ``%s`` placeholder,
            which receives the number of buckets.
        y_label: Label of every y-axis. Defaults to ``"NDCG@k (k=5)"``.
        figsize: Figure size passed to matplotlib.
        legend_columns_count: Number of columns of the legend.
        buckets_count: Number of quantile buckets.
        charts_per_row: Number of buckets drawn in each stacked chart.
        save_file_name: When given, the figure is also written to
            ``./results/figures/<save_file_name>.png``.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Side effects:
        Adds (or overwrites) the column ``f"{feature_column}_bin"`` on ``results``.
    """
    bin_column = f"{feature_column}_bin"
    bins = pd.qcut(results[feature_column], q=buckets_count)

    # Kept for backward compatibility: the bucket of every row is stored on the
    # caller's frame.
    results[bin_column] = bins

    # Select the model columns BEFORE aggregating so that text columns and bin
    # columns left by earlier calls are never averaged. `observed=False` keeps
    # one row per bucket, in bucket order, so that row positions line up with
    # `bins.cat.categories` below.
    result_by_model = results.groupby(bin_column, observed=False)[models].mean()

    # Honour the caller's label; it used to be overwritten unconditionally.
    if y_label is None:
        y_label = "NDCG@k (k=5)"

    fig = plt.figure(figsize=figsize)

    bucket_intervals = bins.cat.categories
    # `grouper` pads the last chunk with None when the number of buckets is not
    # a multiple of `charts_per_row`; drop the padding before indexing.
    groups = [
        [position for position in group if position is not None]
        for group in grouper(range(len(bucket_intervals)), charts_per_row)
    ]

    n_rows = len(groups)
    for index, group in enumerate(groups):
        ax = fig.add_subplot(n_rows, 1, index + 1)
        result_by_model.take(group).plot(
            kind="bar", ax=ax, rot=0, width=0.7, alpha=0.9, edgecolor=["black"], xlabel=""
        )
        # The per-chart legend is replaced by one figure-level legend below.
        ax.get_legend().remove()

        # Formats the x label as "(lower, upper]"
        group_intervals = [bucket_intervals[position] for position in group]
        if feature_column == MISSING_WORDS_COLUMN:
            ax.set_xticklabels([f"({i.left:0.0%}, {i.right:0.0%}]" for i in group_intervals])
        else:
            ax.set_xticklabels([f"({int(i.left)}, {int(i.right)}]" for i in group_intervals])

        ax.set_ylabel(y_label)

        # Formats the bars
        for container in ax.containers:
            container_system = container.get_label()

            # Models without a dedicated style keep the default colours.
            style = system_styles.get(container_system, {})
            for patch in container.patches:
                if 'color' in style:
                    patch.set_color(style['color'])
                if 'hatch' in style:
                    patch.set_hatch(style['hatch'])
                if 'linewidth' in style:
                    patch.set_linewidth(style['linewidth'])
                # Must come after set_color, which also changes the edge colour.
                if 'edgecolor' in style:
                    patch.set_edgecolor(style['edgecolor'])
                else:
                    patch.set_edgecolor('black')

    # Only the bottom chart carries the x-axis label.
    ax.set_xlabel(x_label % len(result_by_model))

    # Give every chart the same upper y-limit so bar heights are comparable.
    max_top_limit = max(axis.get_ylim()[1] for axis in fig.get_axes())
    for axis in fig.get_axes():
        axis.set_ylim(top=max_top_limit)

    # Fall back to the raw identifier for models without a display name.
    model_names = [CLEAN_MODEL_NAMES.get(model, model) for model in models]

    fig.legend(
        model_names,
        ncol=legend_columns_count,
        loc="upper center",
        fancybox=True,
        shadow=False,
        bbox_to_anchor=(0.5, 0.93),
    )

    if save_file_name:
        save_file_path = f"./results/figures/{save_file_name}.png"
        pdf_dpi = 300

        plt.savefig(save_file_path, bbox_inches="tight", dpi=pdf_dpi)

    plt.show()

In [None]:

# Performance of the complete models by number of in-links of the source article.
# The axis label uses the in-links wording from COMPLETE_MODELS_SAVE_CONFIG; the
# previous text ("links present in the source articles") describes out-links.
get_performance_figure_multiple_rows(_results, 
                                     COMPLETE_MODELS, 
                                     IN_LINKS_COLUMN, 
                                     "Number of articles with links pointing to the source articles (%s equal-sized buckets)",
                                     buckets_count=20, 
                                     charts_per_row=4,
                                     save_file_name="in_links_count")

In [None]:
# Toy example: turn the groups of one column into side-by-side columns.
df = pd.DataFrame(
    [[1, 2], [4, 5], [7, 8], [30, 50]],
    index=['cobra', 'cobra', 'viper', 'viper'],
    columns=['max_speed', 'shield'],
)
# Move the labels into a regular column named "index".
df = df.reset_index()

# One column per label; each holds that label's max_speed values, renumbered from 0.
a = pd.concat(
    {
        label: speeds.reset_index(drop=True)
        for label, speeds in df.groupby("index")["max_speed"]
    },
    axis=1,
)

a



In [None]:
# Summary statistics of the numeric columns of the per-article results.
_results.describe()

In [None]:
# Model values for the articles whose "missing_words_percentage" is among the
# five largest values (ties can yield more than five rows).
_results[_results["missing_words_percentage"].isin(_results["missing_words_percentage"].nlargest(5))][["source_article", WIKIPEDIA2VEC_SIAMESE,
         DOC2VEC_SIAMESE,
        SMASH_WORD_LEVEL, 
        SMASH_SENTENCE_LEVEL,
        SMASH_PARAGRAPH_LEVEL,]]

In [None]:
def get_correlation():
    """Print the correlation between missing words and each model's values.

    For every complete model, the correlation coefficient between the
    ``missing_words_percentage`` column and the model's column of the global
    ``_results`` frame is computed with ``np.corrcoef``, rounded to four
    decimals and printed under the model's display name.
    """
    model_columns = [
        WIKIPEDIA2VEC_SIAMESE,
        DOC2VEC_SIAMESE,
        SMASH_WORD_LEVEL,
        SMASH_SENTENCE_LEVEL,
        SMASH_PARAGRAPH_LEVEL,
    ]

    missing_words = _results["missing_words_percentage"]

    for model in model_columns:
        # Off-diagonal entry of the 2x2 correlation matrix.
        coefficient = np.corrcoef(missing_words, _results[model])[0, 1]
        correlation = round(coefficient, 4)

        print(f"Correlation {CLEAN_MODEL_NAMES[model]}: {correlation}")
        
# Report how strongly the share of missing words relates to each model's values.
get_correlation()