In [1]:
# Echo the `sc` object provided by the kernel (presumably the PySpark
# SparkContext -- it is later used via `sc.textFile`); fails fast if it is missing.
sc

In [2]:
def map_to_precision_recall_counter(src, threshold):
    """Turn one JSON line into a ``(feature name, counter)`` pair.

    Args:
        src: JSON document (string) with an ``s3Score`` entry and a nested
            ``candidate.featureName`` entry.
        threshold: Minimum score (inclusive) for the line to count as a
            true positive.

    Returns:
        A tuple of the feature name and a dict with the keys
        ``truePositiveCount`` and ``falsePositiveCount``; exactly one of the
        two counts is 1, the other is 0.
    """
    import json

    record = json.loads(src)
    reached_threshold = float(record['s3Score']) >= threshold

    if reached_threshold:
        counter = {'truePositiveCount': 1, 'falsePositiveCount': 0}
    else:
        counter = {'truePositiveCount': 0, 'falsePositiveCount': 1}

    return record['candidate']['featureName'], counter

def comb(i, j):
    """Add two counter dicts entry by entry.

    Both arguments must provide ``truePositiveCount`` and
    ``falsePositiveCount``; the result is a new dict holding the sums, the
    inputs are left untouched.
    """
    return {
        key: i[key] + j[key]
        for key in ('truePositiveCount', 'falsePositiveCount')
    }

def extract_raw_precision_experiment_results(sc, collection, threshold):
    """Count true/false positives per feature name for one collection.

    Reads every file matching ``<collection>-*-gramms-raw-data.jsonl`` in the
    experiment directory through ``sc.textFile``, converts each line with
    ``map_to_precision_recall_counter`` and sums the counters per key with
    ``comb``.

    Args:
        sc: Object offering ``textFile`` (presumably the SparkContext).
        collection: Collection prefix used in the input file names.
        threshold: Score threshold handed to ``map_to_precision_recall_counter``.

    Returns:
        The collected result of the per-key aggregation: pairs of feature
        name and summed counter dict.
    """
    empty_counter = {'truePositiveCount': 0, 'falsePositiveCount': 0}
    input_glob = (
        'cikm2020/canonical-link-graph/feature-set-precision-experiments/'
        + collection
        + '-*-gramms-raw-data.jsonl'
    )

    def to_counter(line):
        return map_to_precision_recall_counter(line, threshold)

    counters_per_line = sc.textFile(input_glob).map(to_counter)
    return counters_per_line.aggregateByKey(empty_counter, comb, comb).collect()


In [3]:
# Scores at or above this value are counted as true positives
# (see map_to_precision_recall_counter).
threshold = 0.8
# One extraction per collection; each call reads the files whose names start
# with the given collection prefix and returns per-feature counters.
raw_data_cw_09 = extract_raw_precision_experiment_results(sc, 'cw09', threshold)
raw_data_cw_12 = extract_raw_precision_experiment_results(sc, 'cw12', threshold)
raw_data_cc_15 = extract_raw_precision_experiment_results(sc, 'cc-2015-11', threshold)
raw_data_cc_17 = extract_raw_precision_experiment_results(sc, 'cc-2017-04', threshold)

In [4]:
def raw_precision_data_to_df(raw_data, label):
    """Build a precision table from ``(feature, counter)`` pairs.

    Args:
        raw_data: Iterable of pairs whose first element is the feature name
            and whose second element is a dict with ``truePositiveCount``
            and ``falsePositiveCount``.
        label: Value written into the ``label`` column of every row.

    Returns:
        A DataFrame with the columns ``feature``, ``truePositiveCount``,
        ``falsePositiveCount``, ``label``, ``examples`` (sum of both counts)
        and ``precision`` (true positives divided by examples).
    """
    import pandas as pd

    column_names = ['feature', 'truePositiveCount', 'falsePositiveCount', 'label']
    rows = []
    for entry in raw_data:
        feature_name, counter = entry[0], entry[1]
        rows.append([
            feature_name,
            counter['truePositiveCount'],
            counter['falsePositiveCount'],
            label,
        ])

    table = pd.DataFrame(rows, columns=column_names)
    table['examples'] = table['truePositiveCount'].add(table['falsePositiveCount'])
    table['precision'] = table['truePositiveCount'].div(table['examples'])

    return table

In [5]:
# Per-feature precision for the cw09 data, highest precision first.
raw_precision_data_to_df(raw_data_cw_09, 'cw09').sort_values('precision', ascending=False)

Unnamed: 0,feature,truePositiveCount,falsePositiveCount,label,examples,precision
1,5-8-gramms,49972,28,cw09,50000,0.99944
6,8-gramms,49944,56,cw09,50000,0.99888
7,5-gramms,49781,219,cw09,50000,0.99562
3,3-8-gramms,49599,401,cw09,50000,0.99198
2,3-5-gramms,49564,436,cw09,50000,0.99128
8,3-gramms,48766,1234,cw09,50000,0.97532
5,1-3-gramms,40816,9184,cw09,50000,0.81632
0,1-8-gramms,40566,9434,cw09,50000,0.81132
9,1-5-gramms,34428,15572,cw09,50000,0.68856
4,1-gramms,30162,19838,cw09,50000,0.60324


In [6]:
# Per-feature precision for the cw12 data, highest precision first.
raw_precision_data_to_df(raw_data_cw_12, 'cw12').sort_values('precision', ascending=False)

Unnamed: 0,feature,truePositiveCount,falsePositiveCount,label,examples,precision
2,3-5-gramms,48613,1387,cw12,50000,0.97226
1,5-8-gramms,48572,1428,cw12,50000,0.97144
3,3-8-gramms,48400,1600,cw12,50000,0.968
7,5-gramms,45541,4459,cw12,50000,0.91082
8,3-gramms,44748,5252,cw12,50000,0.89496
6,8-gramms,44424,5576,cw12,50000,0.88848
5,1-3-gramms,22479,27521,cw12,50000,0.44958
0,1-8-gramms,22022,27978,cw12,50000,0.44044
9,1-5-gramms,21800,28200,cw12,50000,0.436
4,1-gramms,17439,32561,cw12,50000,0.34878


In [11]:
# Per-feature precision for the cc-2015-11 data, highest precision first.
raw_precision_data_to_df(raw_data_cc_15, 'cc-2015-11').sort_values('precision', ascending=False)

Unnamed: 0,feature,truePositiveCount,falsePositiveCount,label,examples,precision
2,3-5-gramms,38570,11430,cc-2015-11,50000,0.7714
3,3-8-gramms,37336,12664,cc-2015-11,50000,0.74672
5,1-3-gramms,37321,12679,cc-2015-11,50000,0.74642
8,3-gramms,36683,13317,cc-2015-11,50000,0.73366
9,1-5-gramms,36595,13405,cc-2015-11,50000,0.7319
0,1-8-gramms,35960,14040,cc-2015-11,50000,0.7192
1,5-8-gramms,33075,16925,cc-2015-11,50000,0.6615
7,5-gramms,33061,16939,cc-2015-11,50000,0.66122
4,1-gramms,32559,17441,cc-2015-11,50000,0.65118
6,8-gramms,27484,22516,cc-2015-11,50000,0.54968


In [7]:
# Per-feature precision for the cc-2017-04 data, highest precision first.
raw_precision_data_to_df(raw_data_cc_17, 'cc-2017-04').sort_values('precision', ascending=False)

Unnamed: 0,feature,truePositiveCount,falsePositiveCount,label,examples,precision
2,3-5-gramms,47322,2678,cc-2017-04,50000,0.94644
3,3-8-gramms,46959,3041,cc-2017-04,50000,0.93918
8,3-gramms,42997,7003,cc-2017-04,50000,0.85994
7,5-gramms,41788,8212,cc-2017-04,50000,0.83576
1,5-8-gramms,41396,8604,cc-2017-04,50000,0.82792
6,8-gramms,33268,16732,cc-2017-04,50000,0.66536
9,1-5-gramms,33239,16761,cc-2017-04,50000,0.66478
5,1-3-gramms,32844,17156,cc-2017-04,50000,0.65688
0,1-8-gramms,31490,18510,cc-2017-04,50000,0.6298
4,1-gramms,29131,20869,cc-2017-04,50000,0.58262


In [9]:
def line_table_feature_sets_precision(feature, df, labels=('cw09', 'cw12', 'cc15', 'cc17')):
    """Render one LaTeX table row with the precision of ``feature`` per label.

    Args:
        feature: Feature name to look up in the ``feature`` column; also used
            as the first cell of the row.
        df: DataFrame with at least the columns ``feature``, ``label`` and
            ``precision``.
        labels: Values of the ``label`` column, in the order in which their
            precision cells appear in the row.

    Returns:
        The row as a string: the feature name followed by one precision per
        label (three decimals), separated by `` & `` and terminated by a
        LaTeX line break (two backslashes).

    Raises:
        IndexError: If ``df`` has no row for ``feature`` and one of the labels.
    """
    def c(label):
        # Select the precision by column name so the lookup does not depend
        # on the position of the column inside the frame.
        matches = df.loc[(df['feature'] == feature) & (df['label'] == label), 'precision']
        if matches.empty:
            raise IndexError(
                'no precision row for feature {!r} and label {!r}'.format(feature, label)
            )
        # As before, the first matching row wins when several rows match.
        return '{:.3f}'.format(float(matches.iloc[0]))

    return ' & '.join([feature] + [c(label) for label in labels]) + '\\\\'

def table_feature_sets_precision(df):
    """Render the complete LaTeX tabular with one precision row per feature.

    The rows come from ``line_table_feature_sets_precision``: first the four
    single n-gram features, then a midrule, then the six combined features.

    Args:
        df: DataFrame handed unchanged to ``line_table_feature_sets_precision``.

    Returns:
        The LaTeX source of the table as a string.
    """
    single_features = ['1-gramms', '3-gramms', '5-gramms', '8-gramms']
    mixed_features = ['1-3-gramms', '1-5-gramms', '1-8-gramms', '3-5-gramms', '3-8-gramms', '5-8-gramms']

    def render_rows(group):
        # Rows of one group are separated by an empty line.
        return '\n\n'.join([line_table_feature_sets_precision(name, df) for name in group])

    # The two groups are separated by a midrule surrounded by empty lines.
    body = '\n\n\\midrule\n\n'.join([render_rows(single_features), render_rows(mixed_features)])

    preamble = """
\\begin{tabular}{@{}l@{\\hspace{2\\tabcolsep}}c@{\\hspace{3\\tabcolsep}}c@{\\hspace{3\\tabcolsep}}c@{\\hspace{3\\tabcolsep}}c@{}}

\\multicolumn{4}{@{}l@{}}{(b)~\\emph{TBD}} \\\\
\\toprule
\\bfseries Feature & \\multicolumn{4}{c@{}}{\\bfseries Precision}\\\\

\\cmidrule(r){2-5}
& CW09 & CW12 & CC15 & CC 17\\\\
\\midrule

"""
    # Written with explicit escapes because the last line of the table text
    # consists of a single space, which is easy to lose in a literal block.
    closing = '\n\n\\bottomrule\n\n\\end{tabular}\n \n'

    return preamble + body + closing

import pandas as pd

# Stack the per-collection precision tables into one frame. The short labels
# used here ('cc15', 'cc17') are the ones line_table_feature_sets_precision
# looks up. pd.concat replaces the chained DataFrame.append calls, which were
# deprecated in pandas 1.4 and removed in pandas 2.0; like append, concat
# keeps each part's own row index.
df = pd.concat([
    raw_precision_data_to_df(raw_data_cw_09, 'cw09'),
    raw_precision_data_to_df(raw_data_cw_12, 'cw12'),
    raw_precision_data_to_df(raw_data_cc_15, 'cc15'),
    raw_precision_data_to_df(raw_data_cc_17, 'cc17'),
])

# print (rather than rich display) so the LaTeX source can be copied verbatim.
print(table_feature_sets_precision(df))


\begin{tabular}{@{}l@{\hspace{2\tabcolsep}}c@{\hspace{3\tabcolsep}}c@{\hspace{3\tabcolsep}}c@{\hspace{3\tabcolsep}}c@{}}

\multicolumn{4}{@{}l@{}}{(b)~\emph{TBD}} \\
\toprule
\bfseries Feature & \multicolumn{4}{c@{}}{\bfseries Precision}\\

\cmidrule(r){2-5}
& CW09 & CW12 & CC15 & CC 17\\
\midrule

1-gramms & 0.603 & 0.349 & 0.651 & 0.583\\

3-gramms & 0.975 & 0.895 & 0.734 & 0.860\\

5-gramms & 0.996 & 0.911 & 0.661 & 0.836\\

8-gramms & 0.999 & 0.888 & 0.550 & 0.665\\

\midrule

1-3-gramms & 0.816 & 0.450 & 0.746 & 0.657\\

1-5-gramms & 0.689 & 0.436 & 0.732 & 0.665\\

1-8-gramms & 0.811 & 0.440 & 0.719 & 0.630\\

3-5-gramms & 0.991 & 0.972 & 0.771 & 0.946\\

3-8-gramms & 0.992 & 0.968 & 0.747 & 0.939\\

5-8-gramms & 0.999 & 0.971 & 0.661 & 0.828\\

\bottomrule

\end{tabular}
 

