# Secondo progetto di Social Computing

In [70]:
import pandas as pd

# importaimo il file .csv usando la libreria Pandas
df = pd.read_csv("group_5-Palma-Sacchet-Sagliocca.csv")

In [71]:
# %load utils.py
import json
import os


def serealize_json(folder, filename, data):
    if not os.path.exists(folder):
        os.makedirs(folder,exist_ok=True)
    with open(f"{folder}/{filename}.json", "w", encoding="utf-8") as f:
        json.dump(data,f,ensure_ascii=False,indent=4)
        f.close()
    print(f"Data serialized to path: {folder}/{filename}.json")

def read_json(path):
    try:
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as file:
                data = json.load(file)
            return data
    except ValueError:
        print("Path not found, check the correctness of the path")

In [72]:
# Funziona che permette di crere un elemento di un HIT (document)
def document_factory(row,human:bool):
    document = {
        'id': f'{row['id']}',
        'statement': row['statement'],
        'explanation': row['explanation_human'] if human else row['explanation_model'],
        'label': row['label']
    }
    return document

In [73]:
import random
from nanoid import generate
# Alfabeto utilizzato da nanodi per generare gli ID dei HIT
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Funzione per generare un HIT
def hit_factory(index:int):

    documents = []

    # L'obiettivo è selezionare d'apprima una coppia di righe tra le tre, queste avranno le spiegazioni generate
    model_df = df.sample(2)
    # E successivamente selezionare un altra riga sempre tra le tre, ques'ultima avrà la spiegazione umana
    human_df = df.sample()

    # Aggiungo i due elementi con explanation_model all'HIT
    for _, row in model_df.iterrows():
        document = document_factory(row,False)
        documents.append(document)
    # Aggiungo l'elemento con explanation_human all'HIT
    for _, row in human_df.iterrows():
        document = document_factory(row,True)
        documents.append(document)

    # Mescolo casualmente gli elementi dell'HIT
    random.shuffle(documents)

    # Creo l'HIT
    hit = {
        'unit_id': f'unit_{index}',
        'token_input': generate(alphabet,size=11), # Genera ID di 11 caratteri
        'token_output': generate(alphabet,size=11), # Genera ID di 11 caratteri
        'documents_number': len(documents),
        'documents': documents
    }
    
    return hit

In [74]:
# Inizializzo l'array dove mettero gli HITs
hits = []

# Genero i 12 HITs
for i in range(12):
    hit = hit_factory(i)
    hits.append(hit)

serealize_json(".","hits",hits)

Data serialized to path: ./hits.json


# Analisi dei dati

In [94]:
import pandas as pd
import re
from difflib import SequenceMatcher
from itertools import combinations

## Punto 9

In [95]:
def percent_agreement(v1,v2):
    if v1 != v2:
        return (100 - (abs(v1 - v2))/(v1 + v2)*100)
    else:
        return 100

In [96]:
# L'obiettivo è quello di raggruppare il dataset per explanation, successivamente per ogni explanation selezionare i worker che l'hanno fatto e tra essi creare le coppie
# Successivamente tra queste coppie calcolare il Percent Agreement per truthfulness-1 e truthfulness-2
# Calcolare la media e andare avanti
df = pd.read_csv("csv/workers_answers.csv")
df = df[['doc_explanation', 'worker_id', 'doc_truthfulness-1_value', 'doc_truthfulness-2_value']]

groups = df.groupby(by=['doc_explanation'])
result = pd.DataFrame()
for name, group in groups:
    workers = list(group['worker_id'])
    group = group.drop_duplicates()
    group = group.dropna()
    group = group.set_index('worker_id')
    # Creo le combinazioni tra worker
    worker_couples = list(combinations(workers,2))
    # Imposto i due percent agreement a 0 in modo da fare la media alla fine
    total_percent_agreement1 = 0
    total_percent_agreement2 = 0
    for couple in worker_couples:
        # Estraggo i valori
        worker1_truthfulness1 = float(group['doc_truthfulness-1_value'].loc[couple[0]])
        worker2_truthfulness1 = float(group['doc_truthfulness-1_value'].loc[couple[1]])
        # Estraggo i valori
        worker1_truthfulness2 = float(group['doc_truthfulness-2_value'].loc[couple[0]])
        worker2_truthfulness2 = float(group['doc_truthfulness-2_value'].loc[couple[1]])
        # Trovo i due percent_agreement
        percent_agreement1 = percent_agreement(worker1_truthfulness1,worker2_truthfulness1)
        percent_agreement2 = percent_agreement(worker1_truthfulness2,worker2_truthfulness2)
        # Sommo i percent agreement calcolati al totale
        total_percent_agreement1 += percent_agreement1
        total_percent_agreement2 += percent_agreement2
    n = len(worker_couples)
    mean_percent_agreement1 = total_percent_agreement1/n
    mean_percent_agreement2 = total_percent_agreement2/n
    row = {
        'explanation': name,
        'percent_agreement_truthfulness1': mean_percent_agreement1,
        'percent_agreement_truthfulness2': mean_percent_agreement2
    }
    result = result.append(row, ignore_index=True)

  result = result.append(row, ignore_index=True)
  result = result.append(row, ignore_index=True)
  result = result.append(row, ignore_index=True)
  result = result.append(row, ignore_index=True)
  result = result.append(row, ignore_index=True)
  result = result.append(row, ignore_index=True)


## Punto 10

In [44]:
def explanation_mapping(text, explanation_series):
    for i, explanation in explanation_series.iteritems():
        ratio = SequenceMatcher(None, str(text), str(explanation)).ratio()
        if (ratio > 0.9):
            return i

In [45]:
explanation_df = pd.read_csv("group_5-Palma-Sacchet-Sagliocca.csv")
explanation_df = explanation_df[['explanation_human', 'explanation_model']]
explanation_df = explanation_df['explanation_human'].append(explanation_df['explanation_model']).reset_index(drop=True)
explanation_df[0]

  explanation_df = explanation_df['explanation_human'].append(explanation_df['explanation_model']).reset_index(drop=True)


'Saw II sold over 3 million units, however it does not say whether it sold them on DVD.'

In [46]:
# Carico il file csv da dove ricavo i dati
note_df = pd.read_csv("csv/workers_notes.csv")
note_df['note_text_left'] = note_df['note_text_left'].fillna("")
note_df['note_text_right'] = note_df['note_text_right'].fillna("")
note_df['note_text'] = note_df['note_text_left'].astype(str) + note_df['note_text_raw'].astype(str) + note_df['note_text_right'].astype(str)
# Converto le frasi in numeri (contando il numero di parole). Utilizzo regex per non considerare la punteggiatura
note_df['note_text_raw_count'] = note_df['note_text_raw'].apply(lambda x: len(re.findall(r'\w+', str(x))))
note_df['note_text_count'] = note_df['note_text'].apply(lambda x: len(re.findall(r'\w+', str(x))))
note_df['percentual'] = (note_df['note_text_raw_count'] / note_df['note_text_count']) * 100
note_df['explanation_id'] = note_df['note_text'].apply(lambda text: explanation_mapping(text, explanation_df))
note_df = note_df.groupby(by='explanation_id').mean().reset_index()
note_df = pd.DataFrame({
    'explanation': explanation_df,
    'percentual_mean': note_df['percentual']
})

KeyError: '[nan] not in index'

In [11]:
# Filtro solo le colonne che mi interessano
note_df = dataframe[['note_text_raw', 'note_text_left', 'note_text_right']]
# Converto le frasi in numeri (contando il numero di parole). Utilizzo regex per non considerare la punteggiatura
note_df_num = note_df.applymap(lambda x: len(re.findall(r'\w+', str(x))))
# Aggiungo una nuova colonna con il totale delle parole
note_df_num['note_text_total'] = str(note_df_num['note_text_left']) + str(note_df_num['note_text_raw']) + str(note_df_num['note_text_right'])
# Aggiungo una nuova colonna con il valore percentuale
note_df_num['percentual'] = (note_df_num['note_text_raw'] / note_df_num['note_text_total']) * 100
note_df_num['worker_id'] = dataframe['worker_id']
note_df_num

NameError: name 'dataframe' is not defined

# Punto 12

In [94]:
dataframe = pd.read_csv("csv/workers_dimensions_selection.csv")

# Utilizzo la funzione groupby pre raggruppare per dimension, dopo di chè 
dataframe.groupby(by=["worker_id", "document_id"]).sum().groupby(by="document_id").mean()

Unnamed: 0_level_0,paid,try_last,try_current,dimension_index,timestamp_start,selection_index,selection_value,selection_timestamp,selection_time_elapsed,timestamp_end,document_index
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1520,6.2,6.2,6.2,5.6,10374300000.0,9.8,125.8,10374300000.0,443.3572,10374300000.0,8.0
25842,5.333333,5.333333,5.333333,5.111111,8924056000.0,12.333333,224.777778,8924057000.0,301.741667,8924057000.0,6.0
41947,4.5,4.5,4.5,4.333333,7529652000.0,5.583333,134.0,7529653000.0,304.114833,7529653000.0,3.833333
