In [75]:
import pandas as pd
import pickle
import os
import nltk
import re
#nltk.download('wordnet')
import random
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from tqdm import tqdm_notebook as tqdm

In [118]:
#Generates new sample of data to be labeled
def please_label(master_file, new_file, sample_num=100):
  master_df = pd.read_excel(master_file,index_col=0)
  to_label_df = master_df[master_df.true_pos==-1]
  output_df = to_label_df.sample(sample_num)
  output_df.to_excel(new_file)
  return output_df

#Create fixed chunks of shuffled data for people to label
def batch_please_label(master_file, batch_folder, sample_num=100):
  master_df = pd.read_excel(master_file,index_col=0).drop(labels=["lemmatized","problematic"],axis=1)
  to_label_df = master_df[master_df.true_pos==-1].sample(frac=1,random_state=0)
  row_num = to_label_df.shape[0]
  batches = row_num//sample_num
  print(batches)
  #assert 1==0
  
  for start_bin in range(batches):
    end_bin = start_bin+1
    temp = to_label_df.iloc[start_bin*sample_num:end_bin*sample_num,:]
    if end_bin == batches and row_num%sample_num>0:#we have leftover rows to deal with
      extra = to_label_df.iloc[end_bin*sample_num:,:]
      extra.to_excel(batch_folder+"batch_"+str(end_bin)+".xlsx")
    temp.to_excel(batch_folder+"batch_"+str(start_bin)+".xlsx")
  print(f"Full Batches: {batches}")
  print(f"Partial Batch?: {row_num%sample_num>0}")
  print(f"Go check {batch_folder}")
    
  
#Update our running master_df_labeled with multiple new files
def batch_merge_into_master(master_file, batch_folder):
  batch_files = os.listdir(batch_folder)
  print(batch_files)
  master_df_labeled = pd.read_excel(master_file,index_col=0)
  #master_df_labeled = pickle.load(open(master_file, "rb"))
  for new_file in batch_files:
    print(new_file)
    if re.match('batch_[0-9]*\.xlsx',new_file):
      new_df = pd.read_excel(batch_folder+new_file,index_col=0)
      master_df_labeled.update(new_df.true_pos)
      #master_df_labeled = merge_into_master(master_file, batch_folder+new_file, batch=True)
      print_tf_pos(master_df_labeled)
  master_df_labeled.to_excel(master_file)
  #pickle.dump(master_df_labeled,open(master_file,"wb"))
  return master_df_labeled

#Update our running master_df_labeled, main function to produce files for dataloaders
def merge_into_master(master_file, new_file, grounds_file, batch=False):
  master_df_labeled = pd.read_excel(master_file,index_col=0)
  new_df = pd.read_excel(new_file,index_col=0)

  master_df_labeled.update(new_df.true_pos)
  master_df_labeled.to_excel(master_file)
#   if not batch:
#     ground_truths_df = produce_ground_truths(master_df_labeled, grounds_file)
#     master_df_labeled.to_excel(master_file)
#     pickle.dump(master_df_labeled,open("./data/master_df_labeled.p","wb"))
#     return master_df_labeled, ground_truths_df
  
  pickle.dump(master_df_labeled,open("./data/master_df_labeled.p","wb"))
  return master_df_labeled
  

#Produces ground_truths_df
def produce_ground_truths(master_df_labeled, grounds_file):
  ground_truths_df = master_df_labeled[master_df_labeled.true_pos!=-1]
  ground_truths_df.to_excel(grounds_file)
  pickle.dump(ground_truths_df,open("./data/ground_truths_df.p","wb"))
  return ground_truths_df

#Print out how many examples are labeled as True or False Positives
def print_tf_pos(master_df_labeled):
  print(f"Count of true positives: {master_df_labeled[master_df_labeled.true_pos==1].shape[0]}")
  print(f"Count of false positives: {master_df_labeled[master_df_labeled.true_pos==0].shape[0]}")
  assert master_df_labeled[master_df_labeled.problematic==0].shape[0]==0

In [71]:
#All excel file paths here, pickle paths are used by other scripts so hardcoded
master_file = "./data/master_df_labeled.xlsx"
#new sampled data to be labeled
new_file = "./data/please_label.xlsx"
grounds_file = "./data/ground_truths.xlsx"
batch_folder = "./data/please_label_batch/100/"
#batch_labeled = "./data/please_label_batch/labeled/"
batch_labeled = "/Users/echo/Google Drive File Stream/My Drive/2019 Fall/Capstone group/FairFrame Annotations/labeled/"

# Example Workflow
1. Print current true and false positive counts
2. After labeling, update our running master file (merge_into_master)
3. Print new true and false positive counts to check, also select specific example to check

In [125]:
#Print current true and false positive counts
#master_df_labeled.to_excel("./data/master_df_labeled.xlsx")
master_df_labeled = pd.read_excel(master_file,index_col=0)

print_tf_pos(master_df_labeled)
print(master_df_labeled.shape)

Count of true positives: 471
Count of false positives: 631
(111029, 6)


In [124]:
master_df_labeled = batch_merge_into_master("./data/master_df_labeled.xlsx", batch_labeled)
print_tf_pos(master_df_labeled)
print(master_df_labeled.shape)

['batch_2.xlsx', 'batch_3.xlsx', 'batch_7.xlsx', 'batch_1.xlsx', 'batch_0.xlsx', 'batch_9.xlsx', 'batch_10.xlsx', 'batch_11.xlsx', 'batch_5.xlsx', 'batch_6.xlsx', 'batch_12.xlsx', 'Icon\r']
batch_2.xlsx
Count of true positives: 85
Count of false positives: 99
batch_3.xlsx
Count of true positives: 124
Count of false positives: 147
batch_7.xlsx
Count of true positives: 156
Count of false positives: 210
batch_1.xlsx
Count of true positives: 196
Count of false positives: 261
batch_0.xlsx
Count of true positives: 237
Count of false positives: 308
batch_9.xlsx
Count of true positives: 284
Count of false positives: 358
batch_10.xlsx
Count of true positives: 321
Count of false positives: 411
batch_11.xlsx
Count of true positives: 349
Count of false positives: 472
batch_5.xlsx
Count of true positives: 384
Count of false positives: 531
batch_6.xlsx
Count of true positives: 434
Count of false positives: 577
batch_12.xlsx
Count of true positives: 471
Count of false positives: 631
Icon
Count of tru

In [116]:
master_df_labeled.to_excel("./data/master_df_labeled.xlsx")
pickle.dump(master_df_labeled,open(master_file,"wb"))

In [139]:
#Generate fixed batch to label
#batch_please_label(master_file, batch_folder, sample_num=100)

1109
Full Batches: 1109
Partial Batch?: True
Go check ./data/please_label_batch/100/


# Initial Labeling Process

In [120]:
master_df = pickle.load(open("./data/master_df.p", "rb"))
master_df["lemmatized"] = [wordnet_lemmatizer.lemmatize(w) for w in master_df.flagged_word]
#
print(len(set(master_df.flagged_word)))
print(len(set(master_df.lemmatized)))
#print(set(master_df.flagged_word)-set(master_df.lemmatized))
# print(sorted(list(set(master_df.lemmatized)),key=str.lower))
# master_df.shape

524
448


In [8]:
# #creating random sample from master_df to label
# sampled = master_df[master_df.problematic==1].sample(500)
# sampled.to_excel("./data/fptp.xlsx")

In [121]:
#merge by row index
labeled = pd.read_excel("./data/fptp_labeled.xlsx",index_col=0)
master_df_labeled = master_df.join(labeled.true_pos).fillna(-1)
master_df_labeled = master_df_labeled[master_df_labeled.problematic==1]
#master_df_labeled
print_tf_pos(master_df_labeled)



Count of true positives: 52
Count of false positives: 44


In [122]:
#master file to continuously update
master_df_labeled.to_excel("./data/master_df_labeled.xlsx")
pickle.dump(master_df_labeled,open("./data/master_df_labeled.p","wb"))