In [1]:
# Echo `sc` so the cell output shows it is defined before it is used below
# (presumably the SparkContext injected by the PySpark kernel — TODO confirm).
sc

In [1]:
# Neutral starting value for the per-feature aggregation: every confusion counter at zero.
ZERO_VALUE = dict.fromkeys(('falsePositive', 'falseNegative', 'truePositive', 'trueNegative'), 0)

def report_features(json_src):
    """Yield ``(feature, counts)`` pairs for one JSON-encoded record.

    ``json_src`` is a JSON document with a ``featureNames`` entry (a collection
    of names). When ``'S3'`` is one of those names, every known feature set is
    reported exactly once: as a true positive if it is listed, as a false
    negative if it is not. When ``'S3'`` is absent, each listed name is
    reported as a false positive.

    Every yielded ``counts`` is a freshly built dict with the four confusion
    counters, exactly one of which is 1.
    """
    import json

    def single_count(outcome):
        # Build a new dict on every call so no two yielded values share state.
        counts = {'falsePositive': 0, 'falseNegative': 0, 'truePositive': 0, 'trueNegative': 0}
        counts[outcome] = 1
        return counts

    known_features = [
        '1-gramms', '3-gramms', '5-gramms', '8-gramms',
        '1-3-gramms', '1-5-gramms', '1-8-gramms',
        '3-5-gramms', '3-8-gramms', '5-8-gramms',
    ]
    present = json.loads(json_src)['featureNames']

    if 'S3' not in present:
        for name in present:
            yield (name, single_count('falsePositive'))
        return

    for name in known_features:
        outcome = 'truePositive' if name in present else 'falseNegative'
        yield (name, single_count(outcome))

def comb(i, j):
    """Return a new confusion-count dict with the counter-wise sum of ``i`` and ``j``.

    Both arguments must provide all four counters; neither is modified.
    """
    counters = ('falsePositive', 'falseNegative', 'truePositive', 'trueNegative')
    return {counter: i[counter] + j[counter] for counter in counters}

def extract_raw_feature_selection_data(sc, collection):
    """Aggregate the confusion counts of every feature set for one collection.

    Reads the text file ``cikm2020/canonical-link-graph/<collection>-feature-set-evaluation``
    through ``sc.textFile``, expands each line with ``report_features``, sums
    the resulting counts per feature with ``comb`` (starting from
    ``ZERO_VALUE``) and returns the collected ``(feature, counts)`` pairs.
    """
    source_path = 'cikm2020/canonical-link-graph/' + collection + '-feature-set-evaluation'
    per_record_counts = sc.textFile(source_path).flatMap(lambda line: report_features(line))
    # comb serves as both the within-partition and the cross-partition combiner.
    summed_by_feature = per_record_counts.aggregateByKey(ZERO_VALUE, comb, comb)
    return summed_by_feature.collect()


# Collect the per-feature confusion counts for the 'cw09' and 'cw12' collections.
# Kept as two separate statements so the first result survives if the second job fails.
raw_data_09 = extract_raw_feature_selection_data(sc, 'cw09')
raw_data_12 = extract_raw_feature_selection_data(sc, 'cw12')

In [7]:
import pandas as pd

def raw_feature_data_to_df(raw_data):
    """Turn collected ``(feature, counts)`` pairs into a metrics DataFrame.

    The result has one row per feature, ordered by feature name, with the
    columns ``Features``, ``falsePositive``, ``falseNegative``,
    ``truePositive``, ``Precision``, ``Recall`` and ``F1``. If a feature occurs
    more than once in ``raw_data``, its last entry is used.
    """
    count_columns = ['falsePositive', 'falseNegative', 'truePositive']
    counts_by_feature = {entry[0]: entry[1] for entry in raw_data}

    rows = []
    for feature in sorted(counts_by_feature.keys()):
        counts = counts_by_feature[feature]
        rows.append([feature] + [counts[column] for column in count_columns])

    table = pd.DataFrame(rows, columns=['Features'] + count_columns)

    true_positive = table['truePositive']
    precision = true_positive / (true_positive + table['falsePositive'])
    recall = true_positive / (true_positive + table['falseNegative'])

    table['Precision'] = precision
    table['Recall'] = recall
    # F1 is the harmonic mean of precision and recall.
    table['F1'] = 2 / ((1 / precision) + (1 / recall))

    return table

# Build one metrics table per collection from the collected confusion counts.
df_09 = raw_feature_data_to_df(raw_data_09)
df_12 = raw_feature_data_to_df(raw_data_12)

In [11]:
# Rank the cw12 feature sets from highest to lowest F1.
# NOTE(review): the recorded output shows cw12 precision below 0.09 for every
# feature set, versus 0.43-0.99 on cw09 — confirm the cw12 input is as intended.
df_12.sort_values('F1', ascending=False)

Unnamed: 0,Features,falsePositive,falseNegative,truePositive,Precision,Recall,F1
4,3-5-gramms,389172849,16241578,34739011,0.081949,0.681416,0.146303
5,3-8-gramms,394612053,17823855,33156734,0.077511,0.65038,0.138514
7,5-8-gramms,388157910,19193310,31787279,0.075694,0.623517,0.134999
8,5-gramms,417759192,17867001,33113588,0.073443,0.649533,0.131965
6,3-gramms,474840928,14684198,36296391,0.071011,0.711965,0.129142
9,8-gramms,406962227,20454793,30525796,0.069775,0.598773,0.124986
2,1-8-gramms,1154633595,10858451,40122138,0.033582,0.787008,0.064415
1,1-5-gramms,1212608111,9517002,41463587,0.033063,0.813321,0.063543
0,1-3-gramms,1247801577,8436621,42543968,0.032971,0.834513,0.063436
3,1-gramms,1894678162,5570164,45410425,0.023406,0.89074,0.045614


In [12]:
# Rank the cw09 feature sets from highest to lowest F1.
df_09.sort_values('F1', ascending=False)

Unnamed: 0,Features,falsePositive,falseNegative,truePositive,Precision,Recall,F1
6,3-gramms,125051,810754,2828167,0.957656,0.777199,0.858042
4,3-5-gramms,85574,980871,2658050,0.96881,0.73045,0.832912
8,5-gramms,47979,1196279,2442642,0.980736,0.671254,0.797006
5,3-8-gramms,107094,1199123,2439798,0.957951,0.670473,0.788837
7,5-8-gramms,53725,1398257,2240664,0.976584,0.61575,0.755283
0,1-3-gramms,2081490,222409,3416512,0.62141,0.938881,0.747847
9,8-gramms,21867,1587390,2051531,0.989454,0.563775,0.718283
2,1-8-gramms,2547757,407818,3231103,0.559125,0.887929,0.686171
1,1-5-gramms,3176819,314033,3324888,0.511387,0.913702,0.655756
3,1-gramms,4682450,87235,3551686,0.431337,0.976027,0.598277


In [17]:
def f(df, row, col):
    """Format one cell of a metrics table with two decimal places.

    Selects the rows of ``df`` whose ``Features`` value equals ``row`` and
    returns the first value of column ``col`` as a string such as ``'0.96'``.
    Raises ``IndexError`` when no row matches.
    """
    matching = df.loc[df['Features'] == row, col]
    return format(matching.values[0], '.2f')

def feature_selection_latex_table_row(df_cw09, df_cw12, feature):
    """Render the LaTeX table row for one feature set.

    The row starts with a newline and the feature name, followed by Precision,
    Recall and F1 from ``df_cw09`` and then from ``df_cw12`` (each formatted by
    ``f``), six ``???`` placeholder cells, and the LaTeX row terminator.
    """
    cells = [feature]
    for frame in (df_cw09, df_cw12):
        for metric in ('Precision', 'Recall', 'F1'):
            cells.append(f(frame, feature, metric))

    return '\n' + ' & '.join(cells) + ' & ??? & ??? & ??? & ??? & ??? & ???\\\\\n'

def feature_selection_latex_table(df_09, df_12):
    """Render the complete LaTeX ``table*`` comparing all feature sets.

    The body has two groups of rows separated by a rule: the four single
    n-gram sizes first, then the six n-gram ranges. Each row is produced by
    ``feature_selection_latex_table_row`` from ``df_09`` and ``df_12``.
    """
    single_sizes = ['1-gramms', '3-gramms', '5-gramms', '8-gramms']
    size_ranges = ['1-3-gramms', '1-5-gramms', '1-8-gramms', '3-5-gramms', '3-8-gramms', '5-8-gramms']

    def render_rows(features):
        return ''.join(
            feature_selection_latex_table_row(df_09, df_12, name) for name in features
        )

    header = """\\begin{table*}
\\centering
\\small
\\setlength{\\tabcolsep}{3pt}%
\\caption{TBD}
\\label{table-feature-sets-precision-recall}
\\begin{tabular}{@{}l@{\\hspace{2\\tabcolsep}}ccc@{\\hspace{3\\tabcolsep}}ccc@{\\hspace{3\\tabcolsep}}ccc@{\\hspace{3\\tabcolsep}}ccc@{}}

\\toprule
\\bfseries Feature & \\multicolumn{3}{c@{}}{\\bfseries ClueWeb09} & \\multicolumn{3}{c@{}}{\\bfseries ClueWeb12} & \\multicolumn{3}{c@{}}{\\bfseries CC 2015} & \\multicolumn{3}{c@{}}{\\bfseries CC 2017}\\\\

\\cmidrule(r){2-4}
\\cmidrule(r){5-7}
\\cmidrule(r){8-10}
\\cmidrule(r){11-13}

& \\Precision & \\Recall & \\Fscore & \\Precision & \\Recall & \\Fscore& \\Precision & \\Recall & \\Fscore & \\Precision & \\Recall & \\Fscore\\\\
\\midrule
"""
    # Spelled out with explicit escapes so the single trailing space after
    # the closing table* line (present in the emitted text) stays visible.
    footer = (
        "\n\n\\bottomrule\n\n"
        "\\end{tabular}\n"
        "\\end{table*} \n"
    )

    first_group = render_rows(single_sizes)
    second_group = render_rows(size_ranges)
    return header + first_group + "\n\\midrule\n" + second_group + footer

# print (rather than a bare expression) so the newlines render and the LaTeX can be copied verbatim.
print(feature_selection_latex_table(df_09, df_12))

\begin{table*}
\centering
\small
\setlength{\tabcolsep}{3pt}%
\caption{TBD}
\label{table-feature-sets-precision-recall}
\begin{tabular}{@{}l@{\hspace{2\tabcolsep}}ccc@{\hspace{3\tabcolsep}}ccc@{\hspace{3\tabcolsep}}ccc@{\hspace{3\tabcolsep}}ccc@{}}

\toprule
\bfseries Feature & \multicolumn{3}{c@{}}{\bfseries ClueWeb09} & \multicolumn{3}{c@{}}{\bfseries ClueWeb12} & \multicolumn{3}{c@{}}{\bfseries CC 2015} & \multicolumn{3}{c@{}}{\bfseries CC 2017}\\

\cmidrule(r){2-4}
\cmidrule(r){5-7}
\cmidrule(r){8-10}
\cmidrule(r){11-13}

& \Precision & \Recall & \Fscore & \Precision & \Recall & \Fscore& \Precision & \Recall & \Fscore & \Precision & \Recall & \Fscore\\
\midrule

1-gramms & 0.43 & 0.98 & 0.60 & 0.02 & 0.89 & 0.05 & ??? & ??? & ??? & ??? & ??? & ???\\

3-gramms & 0.96 & 0.78 & 0.86 & 0.07 & 0.71 & 0.13 & ??? & ??? & ??? & ??? & ??? & ???\\

5-gramms & 0.98 & 0.67 & 0.80 & 0.07 & 0.65 & 0.13 & ??? & ??? & ??? & ??? & ??? & ???\\

8-gramms & 0.99 & 0.56 & 0.72 & 0.07 & 0.60 & 0.12 & ??

'0.96'