In [1]:
# Display the SparkContext; `sc` is not created in this notebook, so it is
# presumably injected by the PySpark kernel -- TODO confirm.
sc

In [16]:
# Empty confusion-count record: the starting accumulator for the per-feature aggregation.
ZERO_VALUE = dict.fromkeys(('falsePositive', 'falseNegative', 'truePositive', 'trueNegative'), 0)

def report_features(json_src):
    """Turn one JSON evaluation record into per-feature confusion counts.

    The record's ``featureNames`` entry is inspected:

    * If it contains ``'S3'``, every one of the ten known n-gram feature sets
      is reported: as one true positive when the set is also listed in
      ``featureNames``, otherwise as one false negative.
    * If it does not contain ``'S3'``, every listed name is reported as one
      false positive (including names outside the ten known sets).

    ``'S3'`` presumably marks the reference signal the feature sets are
    judged against -- TODO confirm against the job that wrote the records.

    Args:
        json_src: A JSON document (string) with a ``featureNames`` entry.

    Yields:
        ``(feature_name, counts)`` tuples, where ``counts`` is a fresh dict
        with the keys ``falsePositive``, ``falseNegative``, ``truePositive``
        and ``trueNegative``; exactly one of them is 1.
    """
    # Imported inside the function, as in the original cell.
    import json

    def one_hot(outcome):
        # Build a new dict per yield so records never share state.
        counts = {'falsePositive': 0, 'falseNegative': 0, 'truePositive': 0, 'trueNegative': 0}
        counts[outcome] = 1
        return counts

    record = json.loads(json_src)

    if 'S3' not in record['featureNames']:
        for listed in record['featureNames']:
            yield (listed, one_hot('falsePositive'))
        return

    known_sets = (
        '1-gramms', '3-gramms', '5-gramms', '8-gramms',
        '1-3-gramms', '1-5-gramms', '1-8-gramms',
        '3-5-gramms', '3-8-gramms', '5-8-gramms',
    )
    for candidate in known_sets:
        hit = candidate in record['featureNames']
        yield (candidate, one_hot('truePositive' if hit else 'falseNegative'))

def comb(i, j):
    """Add two confusion-count dicts field by field.

    Args:
        i: Dict with ``falsePositive``, ``falseNegative``, ``truePositive``
            and ``trueNegative`` entries.
        j: Dict with the same four entries.

    Returns:
        A new dict holding the per-field sums; neither input is modified.
    """
    fields = ('falsePositive', 'falseNegative', 'truePositive', 'trueNegative')
    return {field: i[field] + j[field] for field in fields}

# Read the evaluation records line by line, expand each one into
# (feature, counts) pairs, sum the counts per feature, and bring the
# resulting pairs back to the driver.
raw_data = (
    sc.textFile('cikm2020/canonical-link-graph/cw09-feature-set-evaluation')
    .flatMap(report_features)
    # `comb` serves both as the within-partition and the cross-partition merge.
    .aggregateByKey(ZERO_VALUE, comb, comb)
    .collect()
)

In [19]:
import pandas as pd

# `raw_data` is the list of (feature, counts) pairs collected by the Spark job.
# Index it by feature name here so this cell runs on a fresh kernel; it
# previously relied on a `data` variable that no cell in the notebook defines.
data = dict(raw_data)
keys = sorted(data.keys())

# One row per feature set, in sorted name order. trueNegative is left out
# because none of the metrics below use it.
df = pd.DataFrame([
    [key, data[key]['falsePositive'], data[key]['falseNegative'], data[key]['truePositive']] for key in keys
], columns=['Features', 'falsePositive', 'falseNegative', 'truePositive'])

df['Precision'] = df['truePositive']/(df['truePositive']+ df['falsePositive'])
df['Recall'] = df['truePositive']/(df['truePositive']+ df['falseNegative'])
# F1 is the harmonic mean of precision and recall.
df['F1'] = 2/((1/df['Precision']) + (1/df['Recall']))

0.5113869326932142
0.9137016164956591


In [45]:
# Rank the feature sets from best to worst F1 (returns a sorted copy for display).
df.sort_values(by='F1', ascending=False)

Unnamed: 0,Features,falsePositive,falseNegative,truePositive,Precision,Recall,F1
6,3-gramms,125051,810754,2828167,0.957656,0.777199,0.858042
4,3-5-gramms,85574,980871,2658050,0.96881,0.73045,0.832912
8,5-gramms,47979,1196279,2442642,0.980736,0.671254,0.797006
5,3-8-gramms,107094,1199123,2439798,0.957951,0.670473,0.788837
7,5-8-gramms,53725,1398257,2240664,0.976584,0.61575,0.755283
0,1-3-gramms,2081490,222409,3416512,0.62141,0.938881,0.747847
9,8-gramms,21867,1587390,2051531,0.989454,0.563775,0.718283
2,1-8-gramms,2547757,407818,3231103,0.559125,0.887929,0.686171
1,1-5-gramms,3176819,314033,3324888,0.511387,0.913702,0.655756
3,1-gramms,4682450,87235,3551686,0.431337,0.976027,0.598277


In [66]:
def f(df, row, col):
    """Format one cell of the results frame with two decimal places.

    Args:
        df: DataFrame with a ``Features`` column.
        row: Value to look up in the ``Features`` column.
        col: Name of the column whose value is formatted.

    Returns:
        The value of ``col`` in the first row whose ``Features`` equals
        ``row``, rendered with the ``.2f`` format.

    Raises:
        IndexError: If no row has ``Features == row``.
    """
    matching_values = df.loc[df['Features'] == row, col].values
    return format(matching_values[0], '.2f')

def feature_selection_latex_table_row(df, feature):
    """Build the LaTeX table row for one feature set.

    The row holds the feature name followed by its Precision, Recall and F1
    values (each formatted by ``f``), then three hard-coded ``0`` cells, and
    ends with the LaTeX row terminator. The string starts and ends with a
    newline.

    Args:
        df: Results frame passed through to ``f``.
        feature: Feature-set name; used both as the row label and as the
            lookup key.

    Returns:
        The row as a single string.
    """
    metric_cells = [f(df, feature, metric) for metric in ('Precision', 'Recall', 'F1')]
    measured = ' & '.join([feature] + metric_cells)
    # The trailing zeros are placeholders for the remaining three columns.
    return '\n' + measured + ' & 0 & 0 & 0 \\\\\n'

def feature_selection_latex_table(df):
    """Render the precision/recall/F1 results as a LaTeX ``table*``.

    The body has two groups of rows separated by a mid-rule: first the four
    single n-gram feature sets, then the six combined feature sets. Each row
    comes from ``feature_selection_latex_table_row``. The header declares a
    ClueWeb09 and a ClueWeb12 column group.

    Args:
        df: Results frame handed on to the row builder.

    Returns:
        The complete LaTeX source of the table as one string.
    """
    single_sets = ('1-gramms', '3-gramms', '5-gramms', '8-gramms')
    combined_sets = ('1-3-gramms', '1-5-gramms', '1-8-gramms', '3-5-gramms', '3-8-gramms', '5-8-gramms')

    upper_rows = ''.join(feature_selection_latex_table_row(df, name) for name in single_sets)
    lower_rows = ''.join(feature_selection_latex_table_row(df, name) for name in combined_sets)

    preamble = """\\begin{table*}
\\centering
\\small
\\setlength{\\tabcolsep}{3pt}%
\\caption{TBD}
\\label{table-feature-sets-precision-recall}
\\begin{tabular}{@{}l@{\\hspace{2\\tabcolsep}}ccc@{\\hspace{3\\tabcolsep}}ccc@{}}

\\toprule
\\bfseries Feature & \\multicolumn{3}{c@{}}{\\bfseries ClueWeb09} & \\multicolumn{3}{c@{}}{\\bfseries ClueWeb12}\\\\

\\cmidrule(r){2-4}
\\cmidrule(r){5-7}

& \\Precision & \\Recall & \\Fscore & \\Precision & \\Recall & \\Fscore\\\\
\\midrule
"""
    group_separator = "\n\\midrule\n"
    # Kept on one line so the single space that follows the closing
    # table* environment in the emitted text stays explicit.
    closing = "\n\n\\bottomrule\n\n\\end{tabular}\n\\end{table*} \n"

    return preamble + upper_rows + group_separator + lower_rows + closing

# Print (rather than display) the string so newlines render and the LaTeX can be copied verbatim.
print(feature_selection_latex_table(df))

\begin{table*}
\centering
\small
\setlength{\tabcolsep}{3pt}%
\caption{TBD}
\label{table-feature-sets-precision-recall}
\begin{tabular}{@{}l@{\hspace{2\tabcolsep}}ccc@{\hspace{3\tabcolsep}}ccc@{}}

\toprule
\bfseries Feature & \multicolumn{3}{c@{}}{\bfseries ClueWeb09} & \multicolumn{3}{c@{}}{\bfseries ClueWeb12}\\

\cmidrule(r){2-4}
\cmidrule(r){5-7}

& \Precision & \Recall & \Fscore & \Precision & \Recall & \Fscore\\
\midrule

1-gramms & 0.43 & 0.98 & 0.60 & 0 & 0 & 0 \\

3-gramms & 0.96 & 0.78 & 0.86 & 0 & 0 & 0 \\

5-gramms & 0.98 & 0.67 & 0.80 & 0 & 0 & 0 \\

8-gramms & 0.99 & 0.56 & 0.72 & 0 & 0 & 0 \\

\midrule

1-3-gramms & 0.62 & 0.94 & 0.75 & 0 & 0 & 0 \\

1-5-gramms & 0.51 & 0.91 & 0.66 & 0 & 0 & 0 \\

1-8-gramms & 0.56 & 0.89 & 0.69 & 0 & 0 & 0 \\

3-5-gramms & 0.97 & 0.73 & 0.83 & 0 & 0 & 0 \\

3-8-gramms & 0.96 & 0.67 & 0.79 & 0 & 0 & 0 \\

5-8-gramms & 0.98 & 0.62 & 0.76 & 0 & 0 & 0 \\


\bottomrule

\end{tabular}
\end{table*} 



'0.96'