# Merging Annotated Reviews
The first section merges newly annotated data into the `master_df_labeled.xlsx` file, which is the final step before the dataset is put into dataloaders. It can be rerun any time new batches have been annotated. The other two sections remain as documentation of how the batches for annotation were created.

In [1]:
import pandas as pd
import pickle
import os
import nltk
import re
#nltk.download('wordnet')
import random
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from tqdm import tqdm_notebook as tqdm

## Workflow for Merging in New Annotations
1. Print current true and false positive counts
2. After labeling, put the labeled files in the `batch_labeled` folder, then run `batch_merge_into_master`
3. Check for any annotation errors, correct them
4. Save the dataframe back into `master_df_labeled.xlsx` (the `master_file` path)

In [3]:
#All excel file paths here, adjust path and directory names as necessary
# Running master workbook of labeled examples; read and rewritten by the cells below.
master_file = "../data/master_df_labeled.xlsx"
#Annotated data should be saved into the batch_labeled folder
batch_labeled = "../data/FairFrame Annotations/labeled/"
# Output folders passed to batch_please_label_greats in the resampling section.
greats_folder = "../data/FairFrame Annotations/greats_batches/"
non_greats_folder = "../data/FairFrame Annotations/non_greats_batches/"

In [2]:
#Update our running master_df_labeled with multiple new files
def batch_merge_into_master(master_file, batch_folder):
  """Merge the `true_pos` labels of every annotated batch file into the master sheet.

  Args:
    master_file: Path of the master Excel workbook; read with its first column as index.
    batch_folder: Folder holding annotated workbooks whose names start with
      `batch_<number>`.

  Returns:
    The master DataFrame with `true_pos` overwritten wherever a batch supplies a
    non-missing value for the same row index. Nothing is written to disk here;
    the caller is expected to save the result.
  """
  # Sort so that overlapping batches are always applied in the same (lexicographic)
  # order; os.listdir returns entries in arbitrary order.
  batch_files = sorted(os.listdir(batch_folder))
  print(batch_files)
  master_df_labeled = pd.read_excel(master_file, index_col=0)
  for new_file in batch_files:
    print(new_file)
    # Require at least one digit after "batch_"; the previous `[0-9]*` also
    # accepted names with no batch number at all.
    if not re.match(r'batch_[0-9]+', new_file):
      continue
    new_df = pd.read_excel(os.path.join(batch_folder, new_file), index_col=0)
    # DataFrame.update aligns on the index and skips missing values in new_df.
    master_df_labeled.update(new_df.true_pos)
    print_tf_pos(master_df_labeled)
  # Saving is left to the caller so the merge can be inspected first.
  print("Please save master_df_labeled!")
  return master_df_labeled

#Print out how many examples are labeled as True or False Positives
def print_tf_pos(master_df_labeled):
  """Report the label counts and check that no row has `problematic == 0`.

  Prints how many rows have `true_pos == 1` and how many have `true_pos == 0`.

  Raises:
    AssertionError: if any row has `problematic == 0`.
  """
  labels = master_df_labeled.true_pos
  true_count = len(master_df_labeled[labels == 1])
  false_count = len(master_df_labeled[labels == 0])
  print(f"Count of true positives: {true_count}")
  print(f"Count of false positives: {false_count}")
  # Every row in the labeled master sheet is expected to be a flagged (problematic) one.
  not_problematic = master_df_labeled[master_df_labeled.problematic == 0]
  assert len(not_problematic) == 0

In [18]:
#Load and print current true and false positive counts
# The first column of the workbook is used as the row index.
master_df_labeled = pd.read_excel(master_file,index_col=0)

print_tf_pos(master_df_labeled)
# (rows, columns) of the whole master sheet
print(master_df_labeled.shape)

Count of true positives: 4647
Count of false positives: 715
(111029, 6)


In [None]:
#Merge in all annotated files
# The merged frame only lives in memory after this cell; batch_merge_into_master does not save it.
master_df_labeled = batch_merge_into_master(master_file, batch_labeled)
print_tf_pos(master_df_labeled)
print(master_df_labeled.shape)

In [19]:
#Check for mistakes in annotation: show every row whose label is not 1, 0 or -1
master_df_labeled[~master_df_labeled.true_pos.isin([1, 0, -1])]

Unnamed: 0,review,flagged_word,flagged_index,problematic,lemmatized,true_pos


In [None]:
#Some labels were mistyped as 11 and should be 1; correct them here, then rerun the previous cell to check
# A single .loc assignment writes into master_df_labeled itself. The earlier chained
# form (`df.true_pos[mask] = ...`) assigns through an intermediate Series, which
# pandas warns about and which does not update the frame under copy-on-write.
master_df_labeled.loc[master_df_labeled.true_pos == 11, "true_pos"] = 1

In [17]:
#Save the merged dataset
# Overwrites the workbook at master_file with the in-memory master_df_labeled.
master_df_labeled.to_excel(master_file)

## Resampling for less common examples
This code should not be run again unless there is a need to resample.

In [17]:
# Lemmas used below to split the data into "greats" and "non-greats"
filter_words = ["great", "outstanding", "excellent", "professional", "creative", "experience"]
# Batch numbers 200 through 1109 (the end of range is exclusive)
batch_nums = range(200,1110)

In [18]:
# Load every batch workbook listed in batch_nums (first column as index) into a list.
# NOTE(review): `batch_folder` is not assigned in any visible cell above; confirm it is
# set in the notebook session before running this.
df_list = [
  pd.read_excel(os.path.join(batch_folder, "batch_" + str(b) + ".xlsx"), index_col=0)
  for b in batch_nums
]


In [None]:
# Stack all loaded batches into one DataFrame
concat_df = pd.concat(df_list)

In [21]:
# Add a column holding the WordNet lemma of each flagged word
concat_df["lemmatized"] = [wordnet_lemmatizer.lemmatize(w) for w in concat_df.flagged_word]

In [None]:
# Split the rows by whether the lemmatized flagged word is one of filter_words.
in_filter_words = concat_df.lemmatized.isin(filter_words)
greats_df = concat_df[in_filter_words]
nongreats_df = concat_df[~in_filter_words]

In [33]:
#Create fixed-size chunks of data for people to label
def batch_please_label_greats(df, batch_folder, cat, sample_num=100):
  """Write `df` to Excel files of at most `sample_num` rows each.

  The `lemmatized` column is dropped, then consecutive row slices are written to
  `batch_<i>_<cat>.xlsx` inside `batch_folder`, with `i` counting up from 0.
  Rows are taken in their existing order; no shuffling happens here.
  A trailing partial chunk, if any, gets the next index after the full ones.

  Args:
    df: DataFrame to split; must contain a `lemmatized` column.
    batch_folder: Folder the batch files are written into.
    cat: Category tag used in each file name.
    sample_num: Maximum number of rows per file; must be at least 1.

  Raises:
    ValueError: if `sample_num` is smaller than 1.
  """
  if sample_num < 1:
    raise ValueError(f"sample_num must be at least 1, got {sample_num}")
  df = df.drop(labels=["lemmatized"], axis=1)
  row_num = df.shape[0]
  batches, leftover = divmod(row_num, sample_num)
  print(batches)

  # Walk the chunk starts directly so a trailing partial chunk is written even when
  # there is no full batch (previously, fewer than sample_num rows wrote nothing).
  for batch_idx, start in enumerate(range(0, row_num, sample_num)):
    f_name = "batch_" + str(batch_idx) + "_" + cat + ".xlsx"
    df.iloc[start:start + sample_num, :].to_excel(os.path.join(batch_folder, f_name))
  print(f"Full Batches: {batches}")
  print(f"Partial Batch?: {leftover > 0}")
  print(f"Go check {batch_folder}")

In [None]:
# Write both splits out as labeling batches (default of 100 rows per file) into their folders
batch_please_label_greats(greats_df,greats_folder, "greats")
batch_please_label_greats(nongreats_df,non_greats_folder, "non_greats")

## Initial Labeling Process
This code is here in case there is ever a need to revert back to the first set of annotated data, or for debugging purposes.

In [33]:
#batch_folder = "../data/please_label_batch/100/"
# NOTE(review): this cell unpickles `master_file`, while the paths cell at the top of the
# notebook points `master_file` at an .xlsx workbook; confirm the path before rerunning.
# pickle can execute arbitrary code on load, so only open files you created yourself.
with open(master_file, "rb") as master_pickle:
  master_df = pickle.load(master_pickle)
master_df["lemmatized"] = [wordnet_lemmatizer.lemmatize(w) for w in master_df.flagged_word]
# Compare the number of distinct flagged words before and after lemmatization
print(len(set(master_df.flagged_word)))
print(len(set(master_df.lemmatized)))

524
448


In [8]:
# #creating random sample from master_df to label
# sampled = master_df[master_df.problematic==1].sample(500)
# sampled.to_excel("../data/fptp.xlsx")

In [34]:
#merge by row index
labeled = pd.read_excel("../data/fptp_labeled.xlsx",index_col=0)
# Join the annotated true_pos column onto master_df by index.
# NOTE(review): fillna(-1) is applied to the whole joined frame, so missing values in
# any column (not only true_pos) become -1; confirm that is intended.
master_df_labeled = master_df.join(labeled.true_pos).fillna(-1)
# Keep only the rows flagged as problematic
master_df_labeled = master_df_labeled[master_df_labeled.problematic==1]
#master_df_labeled
print_tf_pos(master_df_labeled)

Count of true positives: 84
Count of false positives: 31


In [35]:
#master file to continuously update
# to_excel needs a destination path; the original call passed the DataFrame itself.
# NOTE(review): both targets are spelled out so the pickle can never land on the .xlsx
# path that `master_file` holds in the paths cell; confirm these locations.
master_df_labeled.to_excel("../data/master_df_labeled.xlsx")
# The context manager closes the handle once the dump is finished.
with open("../data/master_df_labeled.p", "wb") as pickle_out:
  pickle.dump(master_df_labeled, pickle_out)

In [None]:
#Generates new sample of data to be labeled
def please_label(master_file, new_file, sample_num=100):
  """Draw a random sample of unlabeled rows and write it to `new_file`.

  Args:
    master_file: Path of the master Excel workbook; read with its first column as index.
    new_file: Path of the Excel file the sample is written to.
    sample_num: Number of rows to draw. When fewer unlabeled rows remain,
      all of them are returned instead of raising.

  Returns:
    The sampled DataFrame, made up of rows whose `true_pos` is -1.
  """
  master_df = pd.read_excel(master_file, index_col=0)
  to_label_df = master_df[master_df.true_pos == -1]
  # DataFrame.sample raises ValueError when asked for more rows than exist
  # (sampling without replacement), so cap the request at what is available.
  output_df = to_label_df.sample(min(sample_num, len(to_label_df)))
  output_df.to_excel(new_file)
  return output_df

#Create fixed chunks of shuffled data for people to label
def batch_please_label(master_file, batch_folder, sample_num=100):
  """Shuffle the unlabeled rows of the master sheet and write them out in chunks.

  Reads `master_file` (first column as index), drops the `lemmatized` and
  `problematic` columns, keeps the rows whose `true_pos` is -1, shuffles them with
  a fixed seed, and writes consecutive slices of at most `sample_num` rows to
  `batch_<i>.xlsx` inside `batch_folder`, with `i` counting up from 0. A trailing
  partial chunk, if any, gets the next index after the full ones.

  Args:
    master_file: Path of the master Excel workbook.
    batch_folder: Folder the batch files are written into.
    sample_num: Maximum number of rows per file; must be at least 1.

  Raises:
    ValueError: if `sample_num` is smaller than 1.
  """
  if sample_num < 1:
    raise ValueError(f"sample_num must be at least 1, got {sample_num}")
  master_df = pd.read_excel(master_file, index_col=0).drop(labels=["lemmatized", "problematic"], axis=1)
  # frac=1 with a fixed random_state gives a reproducible full shuffle.
  to_label_df = master_df[master_df.true_pos == -1].sample(frac=1, random_state=0)
  row_num = to_label_df.shape[0]
  batches, leftover = divmod(row_num, sample_num)
  print(batches)

  # Walk the chunk starts directly so a trailing partial chunk is written even when
  # there is no full batch (previously, fewer than sample_num rows wrote nothing).
  for batch_idx, start in enumerate(range(0, row_num, sample_num)):
    f_name = "batch_" + str(batch_idx) + ".xlsx"
    # os.path.join works whether or not batch_folder ends with a separator.
    to_label_df.iloc[start:start + sample_num, :].to_excel(os.path.join(batch_folder, f_name))
  print(f"Full Batches: {batches}")
  print(f"Partial Batch?: {leftover > 0}")
  print(f"Go check {batch_folder}")

In [None]:
#Update our running master_df_labeled, main function to produce files for dataloaders
def merge_into_master(master_file, new_file, grounds_file, batch=False):
  """Fold one annotated workbook's `true_pos` labels into the master workbook.

  Both workbooks are read with their first column as the index. Labels from
  `new_file` overwrite the master's `true_pos` for matching row indices, and the
  result is written back to `master_file`.

  Args:
    master_file: Path of the master Excel workbook; read and then overwritten.
    new_file: Path of the annotated Excel workbook.
    grounds_file: Accepted but not used by this function.
    batch: Accepted but not used by this function.

  Returns:
    The updated master DataFrame.
  """
  merged = pd.read_excel(master_file, index_col=0)
  new_labels = pd.read_excel(new_file, index_col=0).true_pos
  merged.update(new_labels)
  merged.to_excel(master_file)

  #pickle.dump(master_df_labeled,open("../data/master_df_labeled.p","wb"))
  return merged
  

#Produces ground_truths_df
def produce_ground_truths(master_df_labeled, grounds_file):
  """Extract the labeled rows and save them as both Excel and pickle.

  Args:
    master_df_labeled: DataFrame with a `true_pos` column, where -1 marks unlabeled rows.
    grounds_file: Path of the Excel file the labeled rows are written to.

  Returns:
    The DataFrame of rows whose `true_pos` is not -1. The same frame is also
    pickled to the fixed path "../data/ground_truths_df.p".
  """
  ground_truths_df = master_df_labeled[master_df_labeled.true_pos != -1]
  ground_truths_df.to_excel(grounds_file)
  # The context manager closes the handle; the bare open() call left it open.
  with open("../data/ground_truths_df.p", "wb") as pickle_out:
    pickle.dump(ground_truths_df, pickle_out)
  return ground_truths_df