In [1]:
# Echo `sc` as the cell output. It is not created anywhere in the visible part of
# this notebook, so it is presumably the SparkContext injected by the PySpark
# kernel — TODO confirm.
sc

In [4]:
# Neutral element for the per-feature aggregation: a confusion-count dict with
# every counter at zero.
ZERO_VALUE = dict.fromkeys(
    ('falsePositive', 'falseNegative', 'truePositive', 'trueNegative'), 0
)

def report_features(json_src):
    """Turn one JSON evaluation record into per-feature confusion counts.

    The record must be a JSON object with a 'featureNames' entry.

    * If 'featureNames' contains 'S3', one pair is yielded for each of the ten
      known n-gram features: a true positive when the feature is listed, a
      false negative otherwise.
    * Else, if it contains 'S3-negative', a false positive is yielded for each
      listed name that is one of the known n-gram features. Other entries
      (the 'S3-negative' label itself, or any unrelated name) are skipped so
      they do not show up as bogus "features" in the aggregated result.
    * Otherwise nothing is yielded.

    Args:
        json_src: JSON text of a single record.

    Yields:
        (feature_name, counts) tuples, where counts is a fresh dict with the
        keys 'falsePositive', 'falseNegative', 'truePositive', 'trueNegative'
        and exactly one of them set to 1.

    Raises:
        ValueError: if json_src is not valid JSON (raised by json.loads).
        KeyError: if the record has no 'featureNames' entry.
    """
    # Kept as a function-local import, as in the original; presumably so the
    # function is self-contained when it is shipped to workers — TODO confirm.
    import json

    all_features = (
        '1-gramms', '3-gramms', '5-gramms', '8-gramms',
        '1-3-gramms', '1-5-gramms', '1-8-gramms',
        '3-5-gramms', '3-8-gramms', '5-8-gramms',
    )

    def single_count(outcome):
        # Build a new dict on every call so no two yielded values share state.
        counts = {'falsePositive': 0, 'falseNegative': 0, 'truePositive': 0, 'trueNegative': 0}
        counts[outcome] = 1
        return counts

    feature_names = json.loads(json_src)['featureNames']

    if 'S3' in feature_names:
        for feature in all_features:
            outcome = 'truePositive' if feature in feature_names else 'falseNegative'
            yield (feature, single_count(outcome))
    elif 'S3-negative' in feature_names:
        # Iterate the record's own list to keep its order, but only report
        # names that are real features, matching the positive branch.
        for feature in feature_names:
            if feature in all_features:
                yield (feature, single_count('falsePositive'))

def comb(i, j):
    """Add two confusion-count dicts counter by counter.

    Args:
        i: dict with the keys 'falsePositive', 'falseNegative',
            'truePositive' and 'trueNegative'.
        j: dict with the same four keys.

    Returns:
        A new dict holding the four per-key sums; neither input is modified.
    """
    counters = ('falsePositive', 'falseNegative', 'truePositive', 'trueNegative')
    return {counter: i[counter] + j[counter] for counter in counters}

def extract_raw_feature_selection_data(sc, collection):
    """Aggregate per-feature confusion counts for one collection.

    Reads the text input under
    'cikm2020/canonical-link-graph/<collection>-feature-set-evaluation'
    (with '-canonical-link-graph-edges' appended when the collection name
    contains 'cw' or '2015'), expands every line with report_features and
    sums the resulting count dicts per feature with comb.

    Args:
        sc: object providing textFile(path); presumably the SparkContext —
            TODO confirm.
        collection: collection identifier such as 'cw09' or 'cc-2017-04'.

    Returns:
        Whatever collect() returns for the aggregated (feature, counts) pairs.
    """
    input_path = 'cikm2020/canonical-link-graph/' + collection + '-feature-set-evaluation'
    uses_edges_layout = 'cw' in collection or '2015' in collection
    if uses_edges_layout:
        input_path = input_path + '-canonical-link-graph-edges'

    records = sc.textFile(input_path)
    per_record_counts = records.flatMap(report_features)
    # comb serves as both the within-partition and the cross-partition merge.
    totals_by_feature = per_record_counts.aggregateByKey(ZERO_VALUE, comb, comb)
    return totals_by_feature.collect()


In [None]:
# One aggregation per corpus. Every call ends in collect() (see
# extract_raw_feature_selection_data), so each line runs a full job and brings
# the per-feature counts back into the notebook process.
# The statements are kept separate on purpose: if a later call fails, the
# results already assigned by the earlier lines remain available in the kernel.
raw_data_09 = extract_raw_feature_selection_data(sc, 'cw09')
raw_data_12 = extract_raw_feature_selection_data(sc, 'cw12')
raw_data_15 = extract_raw_feature_selection_data(sc, 'cc-2015-11')
raw_data_17 = extract_raw_feature_selection_data(sc, 'cc-2017-04')

In [6]:
import pandas as pd

def raw_feature_data_to_df(raw_data):
    """Build a per-feature metrics table from aggregated confusion counts.

    Args:
        raw_data: iterable of (feature_name, counts) entries, where counts is
            a dict with at least the keys 'falsePositive', 'falseNegative'
            and 'truePositive'. If a feature name occurs more than once, the
            last entry wins.

    Returns:
        A DataFrame with one row per feature, sorted by feature name, and the
        columns 'Features', 'falsePositive', 'falseNegative', 'truePositive',
        'Precision', 'Recall' and 'F1'. The 'trueNegative' counter is not
        carried over.
    """
    counts_by_feature = {entry[0]: entry[1] for entry in raw_data}
    count_columns = ['falsePositive', 'falseNegative', 'truePositive']

    rows = []
    for feature in sorted(counts_by_feature.keys()):
        counts = counts_by_feature[feature]
        rows.append([feature] + [counts[column] for column in count_columns])

    table = pd.DataFrame(rows, columns=['Features'] + count_columns)

    true_pos = table['truePositive']
    false_pos = table['falsePositive']
    false_neg = table['falseNegative']

    table['Precision'] = true_pos / (true_pos + false_pos)
    table['Recall'] = true_pos / (true_pos + false_neg)
    # Harmonic mean of precision and recall.
    table['F1'] = 2 / ((1 / table['Precision']) + (1 / table['Recall']))

    return table

# One metrics table per corpus, converted in the order 09, 12, 15, 17.
df_09, df_12, df_15, df_17 = [
    raw_feature_data_to_df(raw_counts)
    for raw_counts in (raw_data_09, raw_data_12, raw_data_15, raw_data_17)
]

In [16]:
def f(df, row, col):
    """Format one table cell with two decimal places.

    Args:
        df: DataFrame with a 'Features' column.
        row: value to match against the 'Features' column.
        col: name of the column to read.

    Returns:
        The value of `col` in the first row whose 'Features' equals `row`,
        rendered with the '.2f' format.

    Raises:
        IndexError: if no row matches.
    """
    selected = df.loc[df.Features == row, col]
    return format(selected.values[0], '.2f')

def feature_selection_latex_table_row(df_cw09, df_cw12, df_cc15, df_cc17, feature):
    """Render one LaTeX table row for a single feature.

    The row starts with a newline, followed by the feature name and then, for
    each of the four tables in argument order, the 'Precision', 'Recall' and
    'F1' cells as formatted by f. Cells are separated by ' & ' and the row is
    closed with a LaTeX line break (two backslashes) and a newline.

    Args:
        df_cw09, df_cw12, df_cc15, df_cc17: metric tables, one per corpus.
        feature: feature name to look up in each table.

    Returns:
        The row as a string.
    """
    cells = []
    for corpus_table in (df_cw09, df_cw12, df_cc15, df_cc17):
        for metric in ('Precision', 'Recall', 'F1'):
            cells.append(f(corpus_table, feature, metric))

    return '\n' + feature + ' & ' + ' & '.join(cells) + '\\\\\n'

def feature_selection_latex_table(df_09, df_12, df_15, df_17):
    """Assemble the complete LaTeX tabular comparing features across corpora.

    The table body has two groups separated by a midrule: first the four
    single-size n-gram features, then the six n-gram range features. Each row
    is produced by feature_selection_latex_table_row.

    Args:
        df_09, df_12, df_15, df_17: metric tables, one per corpus, in the
            column order of the table header.

    Returns:
        The LaTeX source of the tabular environment as a string.
    """
    single_size_features = ['1-gramms', '3-gramms', '5-gramms', '8-gramms']
    range_features = ['1-3-gramms', '1-5-gramms', '1-8-gramms', '3-5-gramms', '3-8-gramms', '5-8-gramms']

    def render_rows(features):
        # Concatenate the rows of one group in the given order.
        return "".join(
            feature_selection_latex_table_row(df_09, df_12, df_15, df_17, name)
            for name in features
        )

    upper_group = render_rows(single_size_features)
    lower_group = render_rows(range_features)

    header = """
\\begin{tabular}{@{}l@{\\hspace{2\\tabcolsep}}ccc@{\\hspace{3\\tabcolsep}}ccc@{\\hspace{3\\tabcolsep}}ccc@{\\hspace{3\\tabcolsep}}ccc@{}}

\\multicolumn{4}{@{}l@{}}{(a)~\\emph{TBD}} \\\\
\\toprule
\\bfseries Feature & \\multicolumn{3}{c@{}}{\\bfseries ClueWeb09} & \\multicolumn{3}{c@{}}{\\bfseries ClueWeb12} & \\multicolumn{3}{c@{}}{\\bfseries CC 2015} & \\multicolumn{3}{c@{}}{\\bfseries CC 2017}\\\\

\\cmidrule(r){2-4}
\\cmidrule(r){5-7}
\\cmidrule(r){8-10}
\\cmidrule(r){11-13}

& \\Precision & \\Recall & \\Fscore & \\Precision & \\Recall & \\Fscore& \\Precision & \\Recall & \\Fscore & \\Precision & \\Recall & \\Fscore\\\\
\\midrule
"""
    group_separator = "\n\\midrule\n"
    footer = """

\\bottomrule

\\end{tabular}
"""

    return header + upper_group + group_separator + lower_group + footer

# print() writes the generated LaTeX with real line breaks (a bare expression
# would show the string's repr with escaped backslashes and "\n" sequences).
print(feature_selection_latex_table(df_09, df_12, df_15, df_17))


\begin{tabular}{@{}l@{\hspace{2\tabcolsep}}ccc@{\hspace{3\tabcolsep}}ccc@{\hspace{3\tabcolsep}}ccc@{\hspace{3\tabcolsep}}ccc@{}}

\multicolumn{4}{@{}l@{}}{(a)~\emph{TBD}} \\
\toprule
\bfseries Feature & \multicolumn{3}{c@{}}{\bfseries ClueWeb09} & \multicolumn{3}{c@{}}{\bfseries ClueWeb12} & \multicolumn{3}{c@{}}{\bfseries CC 2015} & \multicolumn{3}{c@{}}{\bfseries CC 2017}\\

\cmidrule(r){2-4}
\cmidrule(r){5-7}
\cmidrule(r){8-10}
\cmidrule(r){11-13}

& \Precision & \Recall & \Fscore & \Precision & \Recall & \Fscore& \Precision & \Recall & \Fscore & \Precision & \Recall & \Fscore\\
\midrule

1-gramms & 1.00 & 0.98 & 0.99 & 0.95 & 0.89 & 0.92 & 0.94 & 0.82 & 0.88 & 0.96 & 0.88 & 0.92\\

3-gramms & 1.00 & 0.78 & 0.87 & 0.99 & 0.71 & 0.83 & 0.99 & 0.60 & 0.75 & 0.99 & 0.69 & 0.82\\

5-gramms & 1.00 & 0.67 & 0.80 & 1.00 & 0.65 & 0.79 & 1.00 & 0.53 & 0.69 & 1.00 & 0.63 & 0.77\\

8-gramms & 1.00 & 0.56 & 0.72 & 1.00 & 0.60 & 0.75 & 1.00 & 0.47 & 0.64 & 1.00 & 0.57 & 0.73\\

\midrule

1-3-gr