## Measure the model performance

In [152]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score

In [153]:
# Base names (without the .csv extension) of the answer and prediction files.
ansPath = 'dev_label'
predictPath = 'self_out'

# Load both label tables as UTF-8 CSVs.
ans_df = pd.read_csv(f'{ansPath}.csv', encoding='utf-8')
predict_df = pd.read_csv(f'{predictPath}.csv', encoding='utf-8')

In [154]:
# Sort the answer rows by image ID in place, then preview the first rows.
# macro_f1 never looks at IMG_ID, so the answer and prediction tables must
# already be in the same row order before they are scored.
ans_df.sort_values(by=['IMG_ID'], inplace = True)
ans_df.head()

Unnamed: 0,IMG_ID,D1,D2,D3,D4,D5
1791,00002.jpg,0,0,0,1,0
1843,00003.jpg,0,0,1,0,0
3151,00004.jpg,0,0,1,0,0
89,00022.jpg,0,0,1,1,0
3631,00030.jpg,0,0,0,1,0


In [155]:
# Sort the prediction rows by image ID in place, then preview the first rows.
# NOTE(review): this lines the rows up with ans_df only if both files contain
# exactly the same set of IMG_IDs; nothing here checks that.
predict_df.sort_values(by=['IMG_ID'], inplace = True)
predict_df.head()

Unnamed: 0,IMG_ID,D1,D2,D3,D4,D5
1025,00002.jpg,0,0,0,1,0
1279,00003.jpg,0,0,1,1,0
395,00004.jpg,0,0,0,0,0
3013,00022.jpg,0,0,0,1,0
870,00030.jpg,0,0,0,1,0


In [156]:
# Pass two DataFrames (predictions first, then answers) whose rows are in the same order.
def macro_f1(predict, answer, label_positions=range(1, 6)):
    """Return the F1 score built from macro-averaged precision and recall.

    Precision and recall are computed separately for every label column,
    each is averaged over the label columns, and the harmonic mean of the
    two averages is returned.

    The label columns are selected by position, and nothing in this function
    matches rows by image ID, so both frames must list the same samples in
    the same row order.

    Args:
        predict: DataFrame of predicted 0/1 labels.
        answer: DataFrame of true 0/1 labels, laid out like ``predict``.
        label_positions: Positional indices of the label columns to score.
            Defaults to columns 1 through 5.

    Returns:
        The macro F1 as a float; 0.0 when there are no label columns or when
        macro precision and macro recall are both zero.
    """
    precisions = []
    recalls = []
    for pos in label_positions:
        y_pred = predict.iloc[:, pos]
        y_true = answer.iloc[:, pos]
        # scikit-learn expects (y_true, y_pred); passing them the other way
        # round silently swaps precision and recall.
        precisions.append(precision_score(y_true, y_pred))
        recalls.append(recall_score(y_true, y_pred))

    if not precisions:
        return 0.0

    prec_ma = sum(precisions) / len(precisions)
    rec_ma = sum(recalls) / len(recalls)

    denominator = prec_ma + rec_ma
    # Avoid ZeroDivisionError when no label has a single true positive.
    if denominator == 0:
        return 0.0
    return (2 * prec_ma * rec_ma) / denominator

In [159]:
# Score the model predictions against the answer labels.
macro_f1(predict_df, ans_df)

0.3124291528283567

In [160]:
# Constant baseline: every row gets D3 = 1 and D4 = 1, all other labels 0.
# The IMG_ID slot is a dummy 0; macro_f1 reads the label columns by position.
ll = [0, 0, 0, 1, 1, 0]
# Build the frame in one call; growing it with .loc row by row re-allocates
# the whole table on every insertion.
tmpdf = pd.DataFrame(
    [ll] * ans_df.shape[0],
    columns=['IMG_ID', 'D1', 'D2', 'D3', 'D4', 'D5'],
)

In [161]:
# Cast every column of the baseline frame to int32, then score it against
# the answer labels.
tmpdf = tmpdf.astype('int32')
macro_f1(tmpdf, ans_df)

0.26772706732941715

## Transform the original file to the labeled file

In [64]:
import pandas as pd

# Base name (without the .csv extension) of the raw annotation file.
inputPath = 'dev'

# header=None: the first line is data, so columns get integer labels.
input_df = pd.read_csv(f'{inputPath}.csv', encoding='utf-8', header=None)

In [65]:
# Preview the first rows of the raw annotation table.
input_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,126,127,128,129,130,131,132,133,134,135
0,26519.jpg,559,772,233,413,不良-機械傷害,848.0,539.0,102.0,231.0,...,,,,,,,,,,
1,39995.jpg,376,481,103,88,不良-機械傷害,615.0,612.0,46.0,32.0,...,,,,,,,,,,
2,40837.jpg,854,248,132,238,不良-機械傷害,743.0,587.0,32.0,26.0,...,,,,,,,,,,
3,09242.jpg,504,472,60,55,不良-炭疽病,646.0,382.0,318.0,100.0,...,,,,,,,,,,
4,22304.jpg,723,693,399,108,不良-機械傷害,779.0,598.0,27.0,26.0,...,,,,,,,,,,


In [66]:
def transform(indf):
    """Convert raw per-image annotations into a table of 0/1 defect flags.

    Each row of ``indf`` is read left to right. The first cell is used as the
    image ID. Every cell up to (but not including) the first missing value is
    compared with the five known defect names, and the matching ``D1``..``D5``
    flag is set to 1. Cells after the first missing value are ignored.

    Args:
        indf: DataFrame with one row per image; the first column holds the
            image ID and defect names appear as string cells further right.

    Returns:
        DataFrame with columns ``IMG_ID``, ``D1``..``D5`` and one row per
        input row, in input order.
    """
    columns = ['IMG_ID', 'D1', 'D2', 'D3', 'D4', 'D5']
    # Defect name -> position of its flag inside an output row.
    flag_position = {
        '不良-乳汁吸附': 1,  # latex adhesion
        '不良-機械傷害': 2,  # mechanical damage
        '不良-炭疽病': 3,  # anthracnose
        '不良-著色不佳': 4,  # poor coloring
        '不良-黑斑病': 5,  # black spot disease
    }

    rows = []
    # itertuples yields plain value tuples, so no Series is built per cell.
    for cells in indf.itertuples(index=False, name=None):
        row = [cells[0], 0, 0, 0, 0, 0]
        for cell in cells:
            # A missing value marks the end of this row's annotations.
            if pd.isna(cell):
                break
            if isinstance(cell, str) and cell in flag_position:
                row[flag_position[cell]] = 1
        rows.append(row)

    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame(rows, columns=columns)

In [67]:
# Convert the raw annotations into per-image 0/1 defect flags and preview
# the first rows of the result.
trans_df = transform(input_df)
trans_df.head()

Unnamed: 0,IMG_ID,D1,D2,D3,D4,D5
0,26519.jpg,0,1,1,0,0
1,39995.jpg,0,1,1,0,0
2,40837.jpg,0,1,1,0,0
3,09242.jpg,1,1,1,0,0
4,22304.jpg,0,1,1,0,0


In [68]:
# Write the label table without the DataFrame index, so the CSV columns are
# exactly IMG_ID, D1..D5.
trans_df.to_csv('dev_label.csv', index=False)