In [78]:
import pandas as pd
import pickle
import os
import nltk
#nltk.download('wordnet')
import random
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; used further down to lemmatize each flagged word.
wordnet_lemmatizer = WordNetLemmatizer()

In [90]:
# Excel file paths used by the labeling workflow are collected here. The pickle
# paths stay hardcoded inside the functions below because other scripts read them.
# Running master file of labeled and not-yet-labeled rows.
master_file = "./data/master_df_labeled.xlsx"
# Newly sampled rows that still have to be labeled.
new_file = "./data/please_label.xlsx"
# Rows that already carry a label (true_pos != -1).
grounds_file = "./data/ground_truths.xlsx"

In [108]:
#Generates new sample of data to be labeled
def please_label(master_file, new_file, sample_num=100, random_state=None):
  """Sample unlabeled rows from the master file and write them out for labeling.

  Rows whose ``true_pos`` equals -1 are treated as unlabeled.

  Args:
    master_file: Path of the Excel master file; its first column becomes the index.
    new_file: Path of the Excel file the sample is written to.
    sample_num: Maximum number of rows to draw. When fewer unlabeled rows
      exist, all of them are returned instead of raising ``ValueError``.
    random_state: Optional seed forwarded to ``DataFrame.sample`` so that a
      draw can be reproduced; ``None`` keeps the draw random.

  Returns:
    The sampled DataFrame, i.e. the same rows that were written to ``new_file``.
  """
  master_df = pd.read_excel(master_file, index_col=0)
  to_label_df = master_df[master_df.true_pos == -1]
  # sample() without replacement raises when asked for more rows than exist,
  # so cap the request at the number of rows that are still unlabeled.
  n_rows = min(sample_num, len(to_label_df))
  output_df = to_label_df.sample(n_rows, random_state=random_state)
  output_df.to_excel(new_file)
  return output_df

#Update our running master_df_labeled, main function to produce files for dataloaders
def merge_into_master(master_file,new_file, grounds_file):
  """Merge freshly labeled rows into the master file and refresh derived files.

  Args:
    master_file: Path of the Excel master file; read, updated and rewritten.
    new_file: Path of the Excel file holding the newly labeled rows.
    grounds_file: Path of the Excel file the labeled subset is written to.

  Returns:
    A tuple ``(master_df_labeled, ground_truths_df)``.

  Side effects:
    Rewrites ``master_file``, writes ``./data/master_df_labeled.p`` and, via
    ``produce_ground_truths``, the ground-truth Excel and pickle files.
  """
  master_df_labeled = pd.read_excel(master_file,index_col=0)
  new_df = pd.read_excel(new_file,index_col=0)
  # update() aligns on the index and overwrites true_pos in place with the
  # non-NA values from the newly labeled rows.
  master_df_labeled.update(new_df.true_pos)
  ground_truths_df = produce_ground_truths(master_df_labeled, grounds_file)
  master_df_labeled.to_excel(master_file)
  # Context manager guarantees the pickle file is flushed and closed.
  with open("./data/master_df_labeled.p","wb") as pickle_file:
    pickle.dump(master_df_labeled, pickle_file)
  return master_df_labeled, ground_truths_df

#Produces ground_truths_df
def produce_ground_truths(master_df_labeled, grounds_file):
  """Extract the labeled rows and persist them as Excel and pickle.

  Args:
    master_df_labeled: DataFrame with a ``true_pos`` column in which -1 marks
      a row that has not been labeled.
    grounds_file: Path of the Excel file the labeled rows are written to.

  Returns:
    DataFrame containing only the rows whose ``true_pos`` is not -1.

  Side effects:
    Writes ``grounds_file`` and ``./data/ground_truths_df.p``.
  """
  ground_truths_df = master_df_labeled[master_df_labeled.true_pos!=-1]
  ground_truths_df.to_excel(grounds_file)
  # Context manager guarantees the pickle file is flushed and closed.
  with open("./data/ground_truths_df.p","wb") as pickle_file:
    pickle.dump(ground_truths_df, pickle_file)
  return ground_truths_df

#Print out how many examples are labeled as True or False Positives
def print_tf_pos(master_df_labeled):
  """Print the number of rows labeled 1 (true positive) and 0 (false positive).

  Afterwards asserts that the frame contains no row with ``problematic == 0``.
  """
  labels = master_df_labeled.true_pos
  true_pos_count = int((labels == 1).sum())
  false_pos_count = int((labels == 0).sum())
  print(f"Count of true positives: {true_pos_count}")
  print(f"Count of false positives: {false_pos_count}")
  assert not (master_df_labeled.problematic == 0).any()

# Example Workflow
1. Print the current true and false positive counts (`print_tf_pos`)
2. Create a new file of reviews to be labeled (`please_label`)
3. After labeling, merge the new labels into the running master file (`merge_into_master`)
4. Print the new true and false positive counts, and inspect a specific example as a sanity check

In [95]:
#Print current true and false positive counts
# Load the master file here so the cell also works on a fresh kernel instead of
# relying on a master_df_labeled variable left over from another cell.
master_df_labeled = pd.read_excel(master_file, index_col=0)
print_tf_pos(master_df_labeled)

Count of true positives: 52
Count of false positives: 44


In [94]:
# Draw a small batch of unlabeled rows, write it to new_file and display it
out = please_label(master_file=master_file, new_file=new_file, sample_num=5)
out

Unnamed: 0,review,flagged_word,flagged_index,problematic,lemmatized,true_pos
108730,"['she', 'nailed', 'our', 'design', 'from', 'th...",communicating,13,1,communicating,-1
119724,"['from', 'quick', 'response', 'times', ',', 't...",excellent,24,1,excellent,-1
98209,"['did', 'a', 'great', 'job', 'on', 'our', 'tea...",great,2,1,great,-1
83321,"['extremely', 'professional', ',', 'flexible',...",technical,19,1,technical,-1
100821,"['excellent', 'work', ',', 'quick', 'responses...",excellent,0,1,excellent,-1


In [109]:
# Fold the newly labeled rows back into the master file and regenerate the
# ground-truth subset, then report the refreshed counts.
master_df_labeled, ground_truths_df = merge_into_master(
    master_file=master_file,
    new_file=new_file,
    grounds_file=grounds_file,
)
print_tf_pos(master_df_labeled)
# Spot-check a single row by its index label.
master_df_labeled.loc[98209, :]

Count of true positives: 52
Count of false positives: 47


review           ['did', 'a', 'great', 'job', 'on', 'our', 'tea...
flagged_word                                                 great
flagged_index                                                    2
problematic                                                      1
lemmatized                                                   great
true_pos                                                         0
Name: 98209, dtype: object

# Initial Labeling Process (one-time setup that created the master file used by the workflow above)

In [81]:
# NOTE: unpickling can execute arbitrary code, so only load a master_df.p that
# comes from a trusted source.
with open("./data/master_df.p", "rb") as pickle_file:
  master_df = pickle.load(pickle_file)
# Lemmatize every flagged word and store the result in a new column.
master_df["lemmatized"] = [wordnet_lemmatizer.lemmatize(w) for w in master_df.flagged_word]
# Number of distinct flagged words before and after lemmatization.
print(len(set(master_df.flagged_word)))
print(len(set(master_df.lemmatized)))

524
448


In [82]:
# Number of reviews per lemmatized flagged word, as a one-column DataFrame
grouped = master_df.groupby("lemmatized")["review"]
counts = grouped.count().to_frame()


In [75]:
# Write the per-word review counts to Excel, largest count first
counts.sort_values(by="review", ascending=False).to_excel("./data/counts.xlsx")

In [8]:
# #creating random sample from master_df to label
# sampled = master_df[master_df.problematic==1].sample(500)
# sampled.to_excel("./data/fptp.xlsx")

In [76]:
#merge by row index
labeled = pd.read_excel("./data/fptp_labeled.xlsx",index_col=0)
# Rows without a manual label get the sentinel -1. Only true_pos is filled, so
# a missing value in any other column is not silently turned into -1.
master_df_labeled = master_df.join(labeled.true_pos).fillna({"true_pos": -1})
# Keep only the rows flagged as problematic.
master_df_labeled = master_df_labeled[master_df_labeled.problematic==1]


In [11]:
# Labeled rows only (-1 marks a row that has not been labeled yet).
ground_truths_df = master_df_labeled[master_df_labeled.true_pos!=-1]
# Context manager guarantees the pickle file is flushed and closed.
with open("./data/ground_truths_df.p","wb") as pickle_file:
  pickle.dump(ground_truths_df, pickle_file)

In [15]:
# Display the labeled subset for a visual check.
ground_truths_df

Unnamed: 0,review,flagged_word,flagged_index,problematic,true_pos
9133,"[great, job, as, usual-above, and, beyond, cal...",great,0,1,0.0
14637,"[he, is, intelligent, ,, dedicated, ,, passion...",intelligent,2,1,1.0
15243,"[great, work, hire, him, again]",great,0,1,0.0
15541,"[strong, ability, to, execute, feedback, with,...",great,6,1,0.0
16159,"[awesome, +, dependable, designer, !]",dependable,2,1,1.0
...,...,...,...,...,...
135248,"[very, understanding, and, flexible, with, som...",understanding,1,1,1.0
136587,"[the, designer, is, very, intuitive, .]",intuitive,4,1,1.0
137606,"[great, designs, .]",great,0,1,0.0
138055,"[incredibly, detailed, oriented, ,, efficient,...",efficient,4,1,1.0


In [84]:
#master file to continuously update
master_df_labeled.to_excel("./data/master_df_labeled.xlsx")
# Context manager guarantees the pickle file is flushed and closed.
with open("./data/master_df_labeled.p","wb") as pickle_file:
  pickle.dump(master_df_labeled, pickle_file)