## Set up

In [1]:
import pandas as pd
import ast
import syllogism as sy

In [2]:
# Import des bases
df_prediction = pd.read_csv("../data/old/df2016_trained_simple.csv")
df_union = pd.read_csv("../data/old/df2016_trained_union.csv")
df_few_shot = pd.read_csv("../data/old/df2016_MNLI.csv")

## Creation du dataframe chelma

In [27]:
# On renome certaines colonnes
df_prediction = df_prediction.rename(columns={"choice_pred": "choice_pred_init"})
df_few_shot = df_few_shot.rename(columns={"choice_union_pred": "choice_pred"})

df_prediction.columns=Index(['id_seq', 'sentenced', 'choice_str', 'choice_pred_init'], dtype='object')
df_union.columns=Index(['id_seq', 'sentenced', 'choice_str', 'choice_union_pred'], dtype='object')
df_few_shot.columns=Index(['id_seq', 'sentenced', 'choice_str', 'choice_pred'], dtype='object')


In [28]:
def calcul_chelma(choice_pred_init,choice_union_pred):
    """ calcul the ratio between the simple prediction 
    (conclusion is the next sentence of the conclusion) and the union """
    
    choice_pred_init = ast.literal_eval(choice_pred_init)
    choice_union_pred = ast.literal_eval(choice_union_pred)
    choice_pred = []
    for i in range(len(choice_pred_init)):
        choice_pred.append(choice_pred_init[i]/choice_union_pred[i])
    return choice_pred

In [29]:
df_chelma = pd.merge(df_prediction, df_union, on=["id_seq", "sentenced", "choice_str"])
df_chelma['choice_pred'] = df_chelma[["choice_pred_init","choice_union_pred"]].apply(lambda x: calcul_chelma(x[0],x[1]), axis=1)
df_chelma = df_chelma.drop(["choice_pred_init","choice_union_pred"], axis=1)

Unnamed: 0,id_seq,sentenced,choice_str,choice_pred
0,1_0,Some models are managers and All models are cl...,"['All managers are clerks', 'All clerks are ma...","[1.000000119209659, 1.0, 1.0000001192096448, 1..."
1,1_1,No divers are carpenters and All linguists are...,"['All divers are linguists', 'All linguists ar...","[1.0000001192096306, 1.0, 0.9999997615807956, ..."
2,1_2,All therapists are climbers and Some skaters a...,"['All climbers are skaters', 'All skaters are ...","[1.0, 1.0000002384193465, 0.9999998807903978, ..."
3,1_3,All bankers are golfers and All golfers are te...,"['All bankers are teachers', 'All teachers are...","[1.0, 0.9999998807903978, 0.9999997615807956, ..."
4,1_4,Some boxers are not opticians and All boxers a...,"['All opticians are actuaries', 'All actuaries...","[0.999999880790412, 1.0, 0.999999880790412, 1...."


## fusion des dataframes

In [30]:
# We rename some column and drop other one
df_prediction = df_prediction.rename(columns={"choice_pred_init": "choice_pred"})
df_prediction = df_prediction.drop(["choice_str","sentenced"], axis=1)
df_chelma = df_chelma.drop(["choice_str","sentenced"], axis=1)
df_few_shot = df_few_shot.drop(["choice_str","sentenced"], axis=1)

In [31]:
# We import the dataframe with the orginal choices and syllogism form.
df_choice_forme = pd.read_csv("../data/intermediate/df_choice_forme.csv")

In [33]:
# merging of the dataframe
df_final = pd.merge(df_prediction, df_chelma, on=["id_seq"], suffixes=('_prediction', '_chelma'))
df_final = pd.merge(df_final, df_few_shot, on=["id_seq"])
df_final = df_final.rename(columns={"best": "best_few_shot"})
df_final = pd.merge(df_final, df_choice_forme, on=["id_seq"])
df_final = df_final.rename(columns={"choice_pred": "choice_pred_few_shot"})

## Calcul des taux de bonnes réponses de chaque modèle

In [35]:
import ast

def select_best(pred_list,choices):
    """ determine the answer of the model as the conclusion with the 
    highest probability
    """
    if isinstance(pred_list,str):
        pred_list = ast.literal_eval(pred_list)
    max_value = max(pred_list)
    max_index = pred_list.index(max_value)
    max_str = choices[max_index]
    if max_value <= 1:
        max_value *= 100
    return (max_index,max_value,max_str)

In [36]:
# We use the class Syllogisme to get the list of choice at the right format
df_final['choice_list'] = df_final[["task","choices"]].apply(lambda x: sy.Syllogism(x[0]).choice_to_choice_list(x[1]), axis=1)

In [38]:
# We get the answer of each mode

df_final['best_prediction'] = df_final[["choice_pred_prediction","choice_list"]].apply(lambda x: select_best(x[0],x[1]), axis=1)
df_final['best_chelma'] = df_final[["choice_pred_chelma","choice_list"]].apply(lambda x: select_best(x[0],x[1]), axis=1)
df_final['best_few_shot'] = df_final[["choice_pred_few_shot","choice_list"]].apply(lambda x: select_best(x[0],x[1]), axis=1)


In [40]:
# On détermine si les réponses sont valides ou non
df_final['prediction_result'] = df_final[["task","best_prediction"]].apply(lambda x: sy.Syllogism(x[0]).evaluate_conclusion(x[1][2]), axis=1)
df_final['chelma_result'] = df_final[["task","best_chelma"]].apply(lambda x: sy.Syllogism(x[0]).evaluate_conclusion(x[1][2]), axis=1)
df_final['few_shot_result'] = df_final[["task","best_few_shot"]].apply(lambda x: sy.Syllogism(x[0]).evaluate_conclusion(x[1][2]), axis=1)



In [41]:
# On distingue les syllogismes ayant des conclusions valides des autres
df_final['has_conclusion'] = df_final.task.apply(lambda x : sy.Syllogism(x).has_conclusion)

In [43]:
# On crée une indicatrice de succès pour chaque modèle
df_to_analyse = df_final[["id_seq","task_form","human_response", "prediction_result","chelma_result", "few_shot_result","has_conclusion"]]

df_to_analyse["succes_human"]= df_to_analyse.human_response.apply(lambda x : 1 if ast.literal_eval(x)[1]==True else 0 )
df_to_analyse["succes_prediction"]= df_to_analyse.prediction_result.apply(lambda x : 1 if x[1]==True else 0 )
df_to_analyse["succes_chelma"]= df_to_analyse.chelma_result.apply(lambda x : 1 if x[1]==True else 0 )
df_to_analyse["succes_few_shot"]= df_to_analyse.few_shot_result.apply(lambda x : 1 if x[1]==True else 0 )


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_analyse["succes_human"]= df_to_analyse.human_response.apply(lambda x : 1 if ast.literal_eval(x)[1]==True else 0 )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_analyse["succes_prediction"]= df_to_analyse.prediction_result.apply(lambda x : 1 if x[1]==True else 0 )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

In [44]:
# On sépare les syllogisme qui ont une conclusion valide des autres
df_valid= df_to_analyse[df_to_analyse['has_conclusion'] == True]
df_unvalid = df_to_analyse[df_to_analyse['has_conclusion'] == False]

In [52]:
# On calcul le taux de succès de chaque modèle pour chaque
df_result_human = df_valid[['task_form', 'succes_human']].groupby(['task_form']).mean().merge(df_valid[['task_form', 'succes_human']].groupby(['task_form']).count(), on="task_form")
df_result_human = df_result_human.drop(["succes_human_y"], axis=1)

df_result_prediction = df_valid[['task_form', 'succes_prediction']].groupby(['task_form']).mean().merge(df_valid[['task_form', 'succes_prediction']].groupby(['task_form']).count(), on="task_form")
df_result_prediction = df_result_prediction.drop(["succes_prediction_y"], axis=1)

df_result_chelma = df_valid[['task_form', 'succes_chelma']].groupby(['task_form']).mean().merge(df_valid[['task_form', 'succes_chelma']].groupby(['task_form']).count(), on="task_form")
df_result_chelma = df_result_chelma.drop(["succes_chelma_y"], axis=1)

df_result_few_shot = df_valid[['task_form', 'succes_few_shot']].groupby(['task_form']).mean().merge(df_valid[['task_form', 'succes_few_shot']].groupby(['task_form']).count(), on="task_form")
df_result_few_shot = df_result_few_shot.drop(["succes_few_shot_y"], axis=1)

# Enfin on fusionne les résultats des différents modèles dans une meme table
df_result = pd.merge(df_result_human, df_result_prediction, on=["task_form"])
df_result = pd.merge(df_result, df_result_chelma, on=["task_form"])
df_result = pd.merge(df_result, df_result_few_shot, on=["task_form"])


In [57]:
df_result['Syllogism'] = df_result.index
df_result = df_result.rename(columns={"succes_human_x": "Human", "succes_prediction_x":"Bert_simple", "succes_chelma_x":"Bert_Chelma", "succes_few_shot_x":"Bart_MNLI" })
df_result = df_result[['Syllogism','Human','Bert_simple','Bert_Chelma','Bart_MNLI']]

df_result

Unnamed: 0_level_0,Syllogism,Human,Bert_simple,Bert_Chelma,Bart_MNLI
task_form,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA1,AA1,0.81,0.81,0.52,0.86
AA2,AA2,0.68,0.44,0.19,0.64
AA4,AA4,0.28,0.17,0.0,0.01
AE1,AE1,0.88,0.06,0.07,1.0
AE2,AE2,0.09,0.01,0.0,0.0
AE3,AE3,0.83,0.05,0.09,1.0
AE4,AE4,0.18,0.01,0.0,0.0
AI2,AI2,0.81,0.74,0.15,0.24
AI4,AI4,0.81,0.72,0.14,0.68
AO3,AO3,0.35,0.13,0.08,0.91


In [None]:
df_result.to_csv("../data/results/df_result_16_12_2021.csv" ,index=False)