In [None]:
import os
import pandas as pd 
import numpy as np 

# Airquality - CO

In [None]:
# Locate the raw Air Quality CSV relative to the current working directory.
data_path = os.path.join(os.getcwd(), "data_preparation/raw_dataset/", "AirQualityUCI/AirQualityUCI.csv")
# The file is ';'-separated. The UCI distribution writes decimals with a comma
# (e.g. "2,6"); without decimal="," such columns load as strings, so they can
# neither be compared with the -200 missing-value marker nor be standardized.
# NOTE(review): confirm against the local copy of the file.
data = pd.read_csv(data_path, delimiter=";", decimal=",")
data.head()

In [None]:
# Discard rows, then columns, that hold no values at all.
data = (
    data
    .dropna(axis=0, how="all")
    .dropna(axis=1, how="all")
)

In [None]:
# Boundary times used by getTimeSlots; 'night' doubles as the start of 'morning'.
timeslots = dict(
    morning="12:00:00",
    afternoon="17:00:00",
    evening="21:00:00",
    night="04:00:00",
)

def getTimeSlots(value, timeslots):
    """Map a time-of-day string to 'morning', 'afternoon', 'evening' or 'night'.

    Parameters
    ----------
    value : str
        Zero-padded 24-hour time, written either as "HH:MM:SS" or "HH.MM.SS".
    timeslots : dict
        Boundary times under the keys 'night', 'morning', 'afternoon' and
        'evening', in the same zero-padded format.

    Returns
    -------
    str
        'morning' for (night, morning], 'afternoon' for (morning, afternoon],
        'evening' for (afternoon, evening], otherwise 'night'.
    """
    def _canonical(stamp):
        # Use one separator everywhere: '.' sorts before ':' so mixing them
        # would make e.g. "12.30.00" compare as earlier than "12:00:00".
        return stamp.replace(".", ":")

    moment = _canonical(value)
    night_end = _canonical(timeslots['night'])
    morning_end = _canonical(timeslots['morning'])
    afternoon_end = _canonical(timeslots['afternoon'])
    evening_end = _canonical(timeslots['evening'])

    # Upper bounds are inclusive so a time that sits exactly on a boundary
    # belongs to the slot it closes instead of falling through to 'night'.
    if night_end < moment <= morning_end:
        return 'morning'
    if morning_end < moment <= afternoon_end:
        return 'afternoon'
    if afternoon_end < moment <= evening_end:
        return 'evening'
    return 'night'

import datetime
def getDayFromDate(value):
    """Return the lowercase weekday name for a 'day/month/year' date string."""
    pieces = [int(piece) for piece in value.split('/')]
    day, month, year = pieces
    return datetime.date(year, month, day).strftime("%A").lower()

# Derive the two categorical features from the raw 'Time' and 'Date' columns.
data['time_slot'] = data['Time'].apply(getTimeSlots, args=(timeslots,))
data['day_of_week'] = data['Date'].apply(getDayFromDate)

In [None]:
# 'Time' and 'Date' are now represented by 'time_slot' and 'day_of_week'.
data = data.drop(columns=['Time', 'Date'])
# Snapshot taken before the CO-specific column selection.
data_copy = data.copy()

# Features first, then the reference CO reading and the CO sensor reading.
co_columns = [
    'time_slot', 'day_of_week', 'T', 'RH', 'AH',
    'PT08.S2(NMHC)', 'PT08.S3(NOx)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
    'CO(GT)', 'PT08.S1(CO)',
]
rename_column = {'CO(GT)': "true_label", 'PT08.S1(CO)':'given_label'}
data = data.reindex(columns=co_columns).rename(columns=rename_column)
data.head(2)


### Drop rows with -200
Missing values are marked with -200 in the original dataset.

In [None]:
# Keep only rows where both labels are present (-200 marks a missing value).
co_has_labels = (data["given_label"] != -200) & (data["true_label"] != -200)
# .copy() turns the filtered slice into an independent frame, so assigning to
# its columns afterwards does not trigger SettingWithCopyWarning.
data = data[co_has_labels].copy()
data["given_label"].isin([-200]).sum(), data.shape

### Standardize

In [None]:
# z-score each label column with its own mean and standard deviation.
given_values = data["given_label"]
mean_given_label, std_given_label = given_values.mean(), given_values.std()
data['given_label'] = (given_values - mean_given_label) / std_given_label

true_values = data["true_label"]
mean_true_label, std_true_label = true_values.mean(), true_values.std()
data['true_label'] = (true_values - mean_true_label) / std_true_label

data.head()

# Airquality - NO2

In [None]:
# Start the NO2 variant from the snapshot taken before the CO column selection.
data_no2 = data_copy.copy()

In [None]:
# Features first, then the reference NO2 reading and the NO2 sensor reading.
no2_columns = [
    'time_slot', 'day_of_week', 'T', 'RH', 'AH',
    'PT08.S2(NMHC)', 'PT08.S3(NOx)', 'PT08.S5(O3)', 'PT08.S1(CO)',
    'NO2(GT)', 'PT08.S4(NO2)',
]
rename_column = { 'NO2(GT)':"true_label", 'PT08.S4(NO2)': 'given_label'}
data_no2 = data_no2.reindex(columns=no2_columns).rename(columns=rename_column)
data_no2.head(2)

In [None]:
# Report how many given labels carry the -200 missing-value marker before filtering.
print(data_no2["given_label"].isin([-200]).sum(), data_no2.shape)
# Keep only rows where both labels are present.
no2_has_labels = (data_no2["given_label"] != -200) & (data_no2["true_label"] != -200)
# .copy() turns the filtered slice into an independent frame, so assigning to
# its columns afterwards does not trigger SettingWithCopyWarning.
data_no2 = data_no2[no2_has_labels].copy()
data_no2["given_label"].isin([-200]).sum(), data_no2.shape

### Standardize

In [None]:
# z-score each label column with its own mean and standard deviation.
given_values = data_no2["given_label"]
mean_given_label, std_given_label = given_values.mean(), given_values.std()
data_no2['given_label'] = (given_values - mean_given_label) / std_given_label

true_values = data_no2["true_label"]
mean_true_label, std_true_label = true_values.mean(), true_values.std()
data_no2['true_label'] = (true_values - mean_true_label) / std_true_label

data_no2.head()

# Stanford Politeness (WIKI)

In [None]:
import pandas as pd 
from convokit import Corpus, download
# Fetch the Wikipedia politeness corpus through convokit.
wiki_corpus = Corpus(filename=download("wikipedia-politeness-corpus"))

In [None]:
# Flatten the corpus utterances into a DataFrame.
wiki_data = wiki_corpus.get_utterances_dataframe()

def getLabelsList(annot_dict, num_annotators = 5):
    """Turn per-utterance annotation dicts into rows of annotator labels.

    Parameters
    ----------
    annot_dict : iterable of dict
        One mapping per utterance; only its values (the labels) are used.
    num_annotators : int, default 5
        Minimum row width. Rows with fewer labels are right-padded with None.

    Returns
    -------
    list of list
        Exactly one row per input mapping, in input order.
    """
    alist = []
    for item in annot_dict:
        labels = list(item.values())
        # Pad the row itself. Appending the None fillers to the outer list
        # would insert extra rows and shift every following utterance.
        labels.extend([None] * (num_annotators - len(labels)))
        alist.append(labels)
    return alist

# One row of annotator scores per utterance, columns anot_1 .. anot_5.
list_of_labels = getLabelsList(wiki_data["meta.Annotations"])
annotator_columns = ["anot_{}".format(k) for k in range(1, 6)]
df_of_labels = pd.DataFrame(list_of_labels, columns=annotator_columns)
df_of_labels.head()

In [None]:
def getAgreement(list_of_labels: list, num_annotator = 5):
    """Return the label given by a strict majority of annotators, else None.

    A label qualifies once it occurs at least ``num_annotator // 2 + 1`` times;
    when several qualify, the one seen first in ``list_of_labels`` is returned.
    """
    tally = {}
    for vote in list_of_labels:
        tally[vote] = tally.get(vote, 0) + 1

    majority = (num_annotator // 2) + 1
    return next((vote for vote, count in tally.items() if count >= majority), None)
    
# Majority label per utterance (None when no strict majority exists).
df_of_labels['agreement'] = df_of_labels.apply(getAgreement, axis=1, raw=True)


In [None]:
# The true label is the median of the five annotator scores.
annotator_columns = ['anot_1', 'anot_2', 'anot_3', 'anot_4', 'anot_5']
annotator_scores = df_of_labels[annotator_columns]
df_of_labels["true_label"] = annotator_scores.median(axis=1, skipna=True)
# Snapshot taken before residual and given-label columns are added.
label_dataframe = df_of_labels.copy()
df_of_labels.head()

In [None]:
annot_cols = ["anot_" + str(i + 1) for i in range(5)]
residual_cols = ["residual_" + str(i + 1) for i in range(5)]

# Signed distance of each annotator's score from the per-utterance median.
for annot_col, residual_col in zip(annot_cols, residual_cols):
    df_of_labels[residual_col] = df_of_labels[annot_col] - df_of_labels['true_label']

df_of_residual = df_of_labels[residual_cols]
# Position (0-4) of the annotator whose score lies furthest from the median;
# ties resolve to the first annotator, as with a per-row np.argmax.
df_residual_index = pd.Series(
    np.abs(df_of_residual.to_numpy(dtype=float)).argmax(axis=1),
    index=df_of_labels.index,
)

# Select that annotator's score by explicit position. This replaces a per-row
# `iloc[i][int]` lookup, which relied on the deprecated positional fallback of
# Series.__getitem__ on a string-labelled row.
annot_values = df_of_labels[annot_cols].to_numpy(dtype=float)
df_of_labels['given_label'] = annot_values[np.arange(len(annot_values)), df_residual_index.to_numpy()]

new_column = ['anot_1', 'anot_2', 'anot_3', 'anot_4', 'anot_5', 
                'residual_1', 'residual_2', 'residual_3', 'residual_4','residual_5', 
                'agreement','true_label', 'given_label']
df_of_labels = df_of_labels.reindex(columns=new_column)
df_of_labels.head()

In [None]:
# Move the existing index into a column so both frames share a 0..n-1 index,
# then attach the label columns side by side.
wiki_data = pd.concat([wiki_data.reset_index(), df_of_labels], axis=1)
wiki_data.head(2)

In [None]:
# Remove corpus metadata, then keep only utterances with a majority label.
unused_meta = ['meta.Normalized Score', 'meta.Annotations','meta.parsed', 'vectors']
wiki_data = (
    wiki_data
    .drop(columns=unused_meta)
    .dropna(subset=['agreement'], axis=0)
)
wiki_data.head(2)

In [None]:
toRemove = ['id','anot_1', 'anot_2', 'anot_3', 'anot_4', 'anot_5',
            'residual_1', 'residual_2', 'residual_3', 'residual_4', 'residual_5',
            'speaker', 'reply_to', 'timestamp', 'conversation_id']
# NOTE(review): this cell used to call `wiki_data.dropna(axis=1)` and discard
# the result, so no column was ever removed for containing NaN. The no-op is
# removed here, which leaves the output unchanged; assign the result of
# `dropna(axis=1)` instead if that column pruning is actually wanted.
wiki_data = (
    wiki_data
    .drop(columns=toRemove)
    .drop(columns=['agreement'])
)
wiki_data.head()

In [None]:
# Rows were filtered out above; renumber the index from 0 without keeping the old one.
wiki_data = wiki_data.reset_index(drop=True)

## If the given label is randomly selected

In [None]:
# Alternative labelling: the given label is one of the five annotator scores
# picked uniformly at random. A seeded generator keeps the sampled labels
# identical across kernel restarts.
wiki_label_rng = np.random.default_rng(0)
modified_label_dataframe = label_dataframe.copy()
modified_label_dataframe['given_label'] = label_dataframe[['anot_1', 'anot_2', 'anot_3', 'anot_4', 'anot_5']].apply(lambda x: wiki_label_rng.choice(x), raw=True, axis=1)
modified_label_dataframe.head()

In [None]:
# Fresh utterance table with a plain 0..n-1 index (the original index is discarded).
wiki_data_ref = wiki_corpus.get_utterances_dataframe().reset_index(drop=True)
wiki_data_ref.head(2)

In [None]:
# Attach the randomly labelled annotation frame column-wise; rows are paired
# through the index of the two frames.
random_wiki = pd.concat([wiki_data_ref, modified_label_dataframe], axis=1)
random_wiki.head(2)

In [None]:
# Drop corpus metadata and the raw annotator scores.
wiki_columns_to_drop = [
    'timestamp', 'speaker', 'reply_to', 'conversation_id',
    'meta.Normalized Score', 'meta.Annotations', 'meta.parsed', 'vectors',
    'anot_1', 'anot_2', 'anot_3', 'anot_4', 'anot_5',
]
random_wiki = random_wiki.drop(columns=wiki_columns_to_drop)
random_wiki.head()

In [None]:
# Remove incomplete rows, renumber, then discard the helper 'agreement' column.
random_wiki = (
    random_wiki
    .dropna(axis=0)
    .reset_index(drop=True)
    .drop(columns=['agreement'])
)
random_wiki.head()

# Stanford Politeness (Stack)

In [None]:
from convokit import Corpus, download
# Fetch the Stack Exchange politeness corpus through convokit and flatten its
# utterances into a DataFrame.
stack_corpus = Corpus(filename=download("stack-exchange-politeness-corpus"))
stack_data = stack_corpus.get_utterances_dataframe()

In [None]:
# One row of annotator scores per utterance, columns anot_1 .. anot_5.
list_of_labels = getLabelsList(stack_data["meta.Annotations"])
stack_annotator_columns = ["anot_{}".format(k) for k in range(1, 6)]
df_annotator = pd.DataFrame(list_of_labels, columns=stack_annotator_columns)
df_annotator.head()

In [None]:
# Label frame: 'given_label' stays empty in this cell.
label_columns = ['agreement', 'true_label', 'given_label']
df_labels = pd.DataFrame(columns=label_columns)
# True label is the median annotator score; agreement is the majority label.
df_labels['true_label'] = df_annotator.median(axis=1, skipna=True)
df_labels['agreement'] = df_annotator.apply(getAgreement, axis=1, raw=True)
df_labels.head()

In [None]:
# Combine the annotator scores with the label columns into one frame.
final_annotation = pd.concat([df_annotator, df_labels], axis=1)
final_annotation.head()

### When given_label is the annotation furthest from the median

In [None]:
annot_cols = ["anot_" + str(i + 1) for i in range(5)]
residual_cols = ["residual_" + str(i + 1) for i in range(5)]

# Signed distance of each annotator's score from the per-utterance median.
for annot_col, residual_col in zip(annot_cols, residual_cols):
    df_annotator[residual_col] = df_annotator[annot_col] - df_labels['true_label']

df_residual = df_annotator[residual_cols]
# Position (0-4) of the annotator whose score lies furthest from the median;
# ties resolve to the first annotator, as with a per-row np.argmax.
df_residual_index = pd.Series(
    np.abs(df_residual.to_numpy(dtype=float)).argmax(axis=1),
    index=df_annotator.index,
)

# Select that annotator's score by explicit position. This replaces a per-row
# `iloc[i][int]` lookup, which relied on the deprecated positional fallback of
# Series.__getitem__ on a string-labelled row.
annot_values = df_annotator[annot_cols].to_numpy(dtype=float)
df_labels['given_label'] = annot_values[np.arange(len(annot_values)), df_residual_index.to_numpy()]


In [None]:
# Move the existing index into a column so all three frames share a 0..n-1
# index, then join them side by side.
stack_data = pd.concat([stack_data.reset_index(), df_annotator, df_labels], axis=1)
stack_data.head(2)

In [None]:
# Remove corpus metadata and the utterance id column.
stack_meta_columns = ['meta.Normalized Score', 'meta.Annotations','meta.parsed', 'vectors', 'id']
stack_data = stack_data.drop(columns=stack_meta_columns)
stack_data.head(2)

In [None]:
# Conversation bookkeeping, raw annotator scores and residuals are no longer needed.
toRemove = (
    ['timestamp', 'speaker', 'reply_to', 'conversation_id']
    + ['anot_1', 'anot_2', 'anot_3', 'anot_4', 'anot_5']
    + ['residual_1', 'residual_2', 'residual_3', 'residual_4', 'residual_5']
)
stack_data = stack_data.drop(columns=toRemove)
stack_data.head()

In [None]:
# Remove incomplete rows, discard the helper 'agreement' column, renumber.
stack_data = (
    stack_data
    .dropna(axis=0)
    .drop(columns=['agreement'])
    .reset_index(drop=True)
)
stack_data.shape

## Preprocessing using Hugging Face Sentence Encoder
Our final dataset with this encoding uses a randomly selected annotator label as the given label.
Let's first prepare the dataset with a randomly selected given label.

In [None]:
# The given label is one annotator score picked uniformly at random per
# utterance. A seeded generator keeps the sampled labels identical across
# kernel restarts.
stack_label_rng = np.random.default_rng(0)
random_annotation = final_annotation.copy()
random_annotation['given_label'] = random_annotation[[col for col in random_annotation.columns if col.startswith('anot')]].apply(lambda x: stack_label_rng.choice(x), raw=True, axis=1)
random_annotation.head()

In [None]:
# Fresh utterance table with a plain 0..n-1 index (the original index is discarded).
random_stack = stack_corpus.get_utterances_dataframe().reset_index(drop=True)
random_stack.head(2)

In [None]:
# Attach the randomly labelled annotations, then drop corpus metadata and the
# raw annotator scores.
stack_columns_to_drop = [
    'timestamp', 'speaker', 'reply_to', 'conversation_id',
    'meta.Normalized Score', 'meta.Annotations', 'meta.parsed', 'vectors',
    'anot_1', 'anot_2', 'anot_3', 'anot_4', 'anot_5',
]
random_stack = pd.concat([random_stack, random_annotation], axis=1)
random_stack = random_stack.drop(columns=stack_columns_to_drop)
random_stack.head(2)

In [None]:
# Remove incomplete rows, discard the helper 'agreement' column, renumber.
random_stack = (
    random_stack
    .dropna(axis=0)
    .drop(columns=['agreement'])
    .reset_index(drop=True)
)
random_stack.head()

### Let's use the Hugging Face sentence encoder

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used below to encode the utterance text.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# Cap the encoder's input length at 500 (presumably tokens, with longer inputs
# truncated; confirm against the sentence-transformers documentation).
model.max_seq_length = 500

In [None]:
# Encode every utterance, passing the texts as a plain list of strings.
stack_texts = random_stack["text"].tolist()
embedding = model.encode(stack_texts)
embedding.shape

In [None]:
# Label columns that accompany the embedding features.
label_fields = ["meta.Binary", "true_label", "given_label"]
to_keep = random_stack[label_fields]

# Embedding DataFrame: one column per embedding dimension, named col_1 .. col_d.
col = ["col_{}".format(k) for k in range(1, embedding.shape[1] + 1)]
embedding_df = pd.DataFrame(embedding, columns=col)

In [None]:
# Final Stack Exchange dataset: label columns followed by the embedding features.
final_stack_random = pd.concat([to_keep, embedding_df], axis=1)
final_stack_random.head()

# Metaphor Novelty

In [None]:
# Load the trained fastText model.
# NOTE(review): this rebinds `model`, which earlier in the notebook holds the
# SentenceTransformer encoder; cells run after this point see the fastText model.
import fasttext
model = fasttext.load_model("./support_files/trained_fasttext.bin")

In [None]:
# Read the metaphor-novelty test split from the raw dataset folder.
loadpath = "./data_preparation/raw_dataset/Metaphor_Novelty/metaphor_novelty_test.csv"
metaphor_data = pd.read_csv(loadpath)
metaphor_data.head()

In [None]:
def splitStrings(x: str, combined = True):
    """Extract the lowercase word parts of an underscore-separated identifier.

    Pieces that are empty or consist only of digits are skipped. Returns the
    words joined by single spaces when ``combined`` is true, otherwise the
    list of words.
    """
    words = [
        piece.lower()
        for piece in x.split("_")
        if piece != "" and not piece.isdigit()
    ]
    if combined:
        return " ".join(words)
    return words


In [None]:
# Recover the word pair from the ID, then split it on whitespace into two columns.
metaphor_data['text'] = metaphor_data['ID'].apply(splitStrings)
word_pair = metaphor_data['text'].str.split(expand=True)
metaphor_data[['word1', 'word2']] = word_pair
metaphor_data.head()

In [None]:
## Dataset - the given label is the average of the five annotator scores
annotator_scores = ["A1", "A2", "A3", "A4", "A5"]
metaphor_data = metaphor_data.drop(columns=["ID", "text"])
metaphor_data['given_label'] = metaphor_data[annotator_scores].apply(lambda row: np.mean(row), axis=1, raw=True)
metaphor_data = metaphor_data.drop(columns=annotator_scores)

# Look up one embedding vector per word, then their difference and its magnitude.
for word_col, vector_col in (("word1", "vector1"), ("word2", "vector2")):
    metaphor_data[vector_col] = metaphor_data[word_col].apply(lambda word: model[word])
metaphor_data['diff'] = metaphor_data['vector1'] - metaphor_data['vector2']
metaphor_data['abs_diff'] = metaphor_data['diff'].abs()
metaphor_data.head()

In [None]:
# Expand each vector column into one numeric column per dimension.
vector1_df = metaphor_data["vector1"].apply(pd.Series)
vector1_df.columns = ["vector1_{}".format(k) for k in range(1, vector1_df.shape[1] + 1)]
vector2_df = metaphor_data["vector2"].apply(pd.Series)
vector2_df.columns = ["vector2_{}".format(k) for k in range(1, vector2_df.shape[1] + 1)]

# Final layout: labels first, then both words' vector components.
vector = pd.concat([vector1_df, vector2_df], axis=1)
metaphor_labels = metaphor_data[["given_label", "Score"]]
final_metaphor = pd.concat([metaphor_labels, vector], axis=1)
final_metaphor.head()

# Telomere

In [None]:
# Read the qPCR telomere table, using the first CSV column as the index.
loadpath = "./data_preparation/raw_dataset/qPCR telomere/qPCR_telomere.csv"
telomere_data = pd.read_csv(loadpath, index_col=0)
telomere_data.head()

In [None]:
# Keep two covariates plus the true and measured telomere Cq, renamed to the
# true_label / given_label convention used by the other datasets.
telomere_fields = ["true.dna.scg", "true.telo.var", "true.telo.cq", "measured.cq.telo"]
telomere_label_names = {"true.telo.cq": "true_label", "measured.cq.telo":"given_label"}
telomere_cq_telo = telomere_data[telomere_fields].rename(columns=telomere_label_names)
print(telomere_cq_telo.shape)
telomere_cq_telo.head()