In [35]:
import pandas as pd

In [None]:
# NOTE(review): this cell has no execution count, i.e. it was not run in the session that
# produced the outputs below. Presumably the script writes the
# web-track-*-precision-recall.jsonl files that the following cells read -- confirm.
!python3 util/precision-recall-in-top-1000.py

In [7]:
# The per-track result files differ only in the track number, so they are read in a
# single pass instead of six copy-pasted cells. Each file holds one JSON record per
# line (lines=True). The names df_18 ... df_23 are kept because later cells use them.
df_18, df_19, df_20, df_21, df_22, df_23 = [
    pd.read_json('../../../tmp/web-track-' + str(track) + '-precision-recall.jsonl', lines=True)
    for track in (18, 19, 20, 21, 22, 23)
]

In [28]:
def precision_score(df, approach):
    """Return the precision of one prediction column, formatted as text.

    Args:
        df: Frame holding the ground truth in its 'near-duplicate' column and
            the predictions in the column named by ``approach``.
        approach: Name of the prediction column to evaluate.

    Returns:
        The scikit-learn precision score rendered with three decimals (a str).
    """
    # Aliased on import because this wrapper carries the same name as the
    # scikit-learn function it delegates to.
    from sklearn.metrics import precision_score as sklearn_precision

    expected = df['near-duplicate']
    predicted = df[approach]
    score = sklearn_precision(y_true=expected, y_pred=predicted)
    return "{:0.3f}".format(score)

def recall_score(df, approach):
    """Return the recall of one prediction column, formatted as text.

    Args:
        df: Frame holding the ground truth in its 'near-duplicate' column and
            the predictions in the column named by ``approach``.
        approach: Name of the prediction column to evaluate.

    Returns:
        The scikit-learn recall score rendered with three decimals (a str).
    """
    # Aliased on import because this wrapper carries the same name as the
    # scikit-learn function it delegates to.
    from sklearn.metrics import recall_score as sklearn_recall

    expected = df['near-duplicate']
    predicted = df[approach]
    score = sklearn_recall(y_true=expected, y_pred=predicted)
    return "{:0.3f}".format(score)

def table_row(df, approach, approach_display_name):
    """Build one report row (a dict) with precision/recall for a single approach.

    For the 1000-document cutoff, the rows of ``df`` whose 'docs' value equals
    the cutoff are scored three times: all of them, only those that are judged
    and relevant, and only those that are judged and not relevant.

    Args:
        df: Frame with 'docs', 'judged' and 'relevant' columns plus whatever
            columns ``precision_score`` / ``recall_score`` read.
        approach: Name of the prediction column to evaluate.
        approach_display_name: Label stored under the 'Approach' key.

    Returns:
        Dict with 'Approach' followed by 'Precision (...)' / 'Recall (...)'
        entries for Top, Relevant@Top and Irrelevant@Top, in that order.
    """
    cells = {'Approach': approach_display_name}

    for top_k in (1000,):
        in_top_k = df[df['docs'] == top_k]
        judged = in_top_k['judged']
        relevant = in_top_k['relevant']

        # Insertion order matters: it fixes the column order of the report.
        subsets = (
            ('Top', in_top_k),
            ('Relevant@Top', in_top_k[judged & relevant]),
            ('Irrelevant@Top', in_top_k[judged & ~relevant]),
        )
        for prefix, subset in subsets:
            scope = prefix + str(top_k)
            cells['Precision (' + scope + ')'] = precision_score(subset, approach)
            cells['Recall (' + scope + ')'] = recall_score(subset, approach)

    return cells

def _split_metric_label(label):
    """Turn a flat label such as 'Precision (Top1000)' into ('Top1000', 'Precision').

    Labels that do not follow the 'Metric (Scope)' pattern are kept as the
    first level with an empty second level.
    """
    metric, separator, scope = label.partition(' (')
    if not separator or not scope.endswith(')'):
        return (label, '')
    return (scope[:-1], metric)


def report_table(df):
    """Build the precision/recall report with one row per approach.

    Each row comes from ``table_row``. Its flat 'Metric (Scope)' keys are turned
    into a two-level column header (scope, metric). The header is derived from
    the actual column labels rather than assigned positionally from a fixed
    list, so it cannot drift out of sync with what ``table_row`` produces.

    Args:
        df: Frame passed through unchanged to ``table_row``.

    Returns:
        DataFrame whose first column is ('Approach', '') followed by the
        (scope, metric) columns, with a default integer index.
    """
    approaches = [
        ('copy-cat-tp', 'CopyCat'),
        ('url-simhash', 'Url Classes'),
        ('simhash(1-grams)', 'SimHash(1-grams)'),
        ('simhash(3+5-grams)', 'SimHash(3+5-grams)'),
        ('text-profile', 'TextProfile'),
        ('md5', 'MD5'),
    ]
    rows = [table_row(df, approach, display_name) for approach, display_name in approaches]

    # 'Approach' is moved into the index while the header is rebuilt, so that only the
    # metric columns are relabelled; reset_index() then restores it as ('Approach', '').
    report = pd.DataFrame(rows).set_index('Approach')
    report.columns = pd.MultiIndex.from_tuples(
        [_split_metric_label(label) for label in report.columns]
    )

    return report.reset_index()

print('Precision/Recall with S3 score as ground-truth (small cw09 sample):')
# Pool the six tracks, add 'url-simhash' as the AND of the 'simhash(1-grams)' and 'url'
# columns, and set 'docs' to 1000 on every row (the value table_row filters on).
df = report_table(
    pd.concat([df_18, df_19, df_20, df_21, df_22, df_23]).assign(**{
        'url-simhash': lambda pooled: pooled['simhash(1-grams)'] & pooled['url'],
        'docs': 1000,
    })
)
df

Precision/Recall with S3 score as ground-truth (small cw09 sample):


Unnamed: 0_level_0,Approach,Top1000,Top1000,Relevant@Top1000,Relevant@Top1000,Irrelevant@Top1000,Irrelevant@Top1000
Unnamed: 0_level_1,Unnamed: 1_level_1,Precision,Recall,Precision,Recall,Precision,Recall
0,CopyCat,0.926,0.361,0.994,0.54,0.87,0.339
1,Url Classes,0.902,0.079,0.986,0.166,0.794,0.107
2,SimHash(1-grams),0.749,0.803,0.799,0.89,0.758,0.902
3,SimHash(3+5-grams),0.95,0.327,0.998,0.489,0.927,0.285
4,TextProfile,0.977,0.145,1.0,0.352,0.995,0.084
5,MD5,1.0,0.092,1.0,0.307,1.0,0.04


In [33]:
def f(v):
    """Return ``v`` unchanged.

    ``row`` passes every table cell through this function, so it is the single
    place to change how a cell value is rendered.
    """
    return v

def row(name):
    """Render the six precision/recall cells of one approach as a LaTeX row tail.

    Looks up the first row of the module-level ``df`` whose 'Approach' equals
    ``name`` and passes each cell through ``f``.

    Args:
        name: Display name of the approach, as stored in the 'Approach' column.

    Returns:
        A string of the form '& c1 & c2 & c3 & c4 & c5 & c6' followed by a
        space and a LaTeX line break (two backslashes).
    """
    record = df[df['Approach'] == name].iloc[0]
    cell_keys = [
        (scope, metric)
        for scope in ('Top1000', 'Relevant@Top1000', 'Irrelevant@Top1000')
        for metric in ('Precision', 'Recall')
    ]
    cells = [f(record[key]) for key in cell_keys]
    return '& ' + ' & '.join(cells) + ' \\\\'

def table():
    """Assemble the LaTeX source of the precision/recall table.

    The fixed LaTeX fragments are interleaved with three data rows obtained
    from ``row``: 'SimHash(3+5-grams)' (labelled Crawl), 'Url Classes'
    (labelled Classes) and 'CopyCat' (labelled with the resource macro).

    Returns:
        The complete table environment as a single string.
    """
    head = """
\\begin{table}
\\centering
\\small
\\setlength{\\tabcolsep}{3pt}%
\\caption{TBD. {\\color{red}Make table consume full width.}}
\\label{table-precision-recall-in-runs}
\\begin{tabular}{@{}lcccccc@{}}
\\toprule
{\\bfseries Method} & \\multicolumn{2}{c@{}}{\\bfseries Top~1000} & \\multicolumn{2}{c@{}}{\\bfseries Relevant@Top~1000} & \\multicolumn{2}{c@{}}{\\bfseries Irrelevant@Top~1000} \\\\

\\cmidrule(l){2-3}
\\cmidrule(l){4-5}
\\cmidrule(l){6-7}

& Prec. & Rec.  & Prec. & Rec. & Prec. & Rec. \\\\

\\midrule

Crawl """
    before_classes = """

Classes """
    before_resource = """
\\midrule

\\resource """
    tail = """
\\bottomrule
\\end{tabular}

\\end{table}
"""
    # Fragments and rows alternate in document order; row() is called in the
    # same order as the rows appear in the table.
    pieces = [
        head, row('SimHash(3+5-grams)'),
        before_classes, row('Url Classes'),
        before_resource, row('CopyCat'),
        tail,
    ]
    return ''.join(pieces)

# print() rather than a bare expression: the raw LaTeX text is wanted on stdout,
# not the quoted/escaped repr of the string.
print(table())


\begin{table}
\centering
\small
\setlength{\tabcolsep}{3pt}%
\caption{TBD. {\color{red}Make table consume full width.}}
\label{table-precision-recall-in-runs}
\begin{tabular}{@{}lcccccc@{}}
\toprule
{\bfseries Method} & \multicolumn{2}{c@{}}{\bfseries Top~1000} & \multicolumn{2}{c@{}}{\bfseries Relevant@Top~1000} & \multicolumn{2}{c@{}}{\bfseries Irrelevant@Top~1000} \\

\cmidrule(l){2-3}
\cmidrule(l){4-5}
\cmidrule(l){6-7}

& Prec. & Rec.  & Prec. & Rec. & Prec. & Rec. \\

\midrule

Crawl & 0.950 & 0.327 & 0.998 & 0.489 & 0.927 & 0.285 \\

Classes & 0.902 & 0.079 & 0.986 & 0.166 & 0.794 & 0.107 \\
\midrule

\resource & 0.926 & 0.361 & 0.994 & 0.540 & 0.870 & 0.339 \\
\bottomrule
\end{tabular}

\end{table}

