In [1]:
#imports for the tool
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from IPython.display import clear_output
import time

In [2]:
import pandas as pd
#load the unlabeled posts that the labeling tool will walk through
test_df = pd.read_csv(filepath_or_buffer='doordash_unlabeled.csv')

In [3]:
def cluster(test_df, encoded, cat = 'others', n_clusters = 2):
    '''
    Split one category into sub-categories with agglomerative clustering.

    Adapted from the MyVoice Data Challenge 2021.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame with a "category" column. Its index labels are used directly as
        row positions into ``encoded``, so the frame needs an index whose
        labels are valid row numbers of ``encoded`` (e.g. after
        ``reset_index(drop=True)``).
    encoded : 2-D array
        Embeddings, indexed as ``encoded[rows, :]``.
    cat : str
        Category value whose rows are split.
    n_clusters : int
        Number of sub-categories to create.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test_df`` in which every row of ``cat`` is renamed to
        "<cat>.<cluster label>". When no row belongs to ``cat`` the copy is
        returned unchanged.
    '''
    df = test_df.copy()

    #get index of rows with category
    indices = df[df["category"] == cat].index.tolist()
    if not indices:
        #nothing to split; avoid handing an empty matrix to scikit-learn
        return df

    # use agglomerative clustering
    # newer scikit-learn releases call the distance argument "metric" and no
    # longer accept "affinity"; older releases only know "affinity"
    try:
        model = AgglomerativeClustering(distance_threshold = None, n_clusters = n_clusters,
                                        linkage = "complete", metric = 'cosine')
    except TypeError:
        model = AgglomerativeClustering(distance_threshold = None, n_clusters = n_clusters,
                                        linkage = "complete", affinity = 'cosine')
    clustering = model.fit(encoded[indices,:])

    # create sub-categories after the split
    df.loc[indices,"category"] = df.loc[indices,"category"] + "." + (clustering.labels_).astype(str)

    return df

In [4]:
def examine_topics(test_df, branch):
    '''
    Return the "Text" column of the rows whose category equals ``branch``.

    The input frame is not modified; the result keeps the original index labels.
    '''
    in_branch = test_df['category'] == branch
    #boolean filtering already yields a new object, so no explicit copy is needed
    return test_df[in_branch].Text

In [5]:
def get_input(i):
    '''
    Ask a yes/no question and keep asking until the answer is valid.

    The reply is stripped of surrounding whitespace and lower-cased before it
    is checked, so "Y", " y " and "N" are accepted as well.

    Parameters
    ----------
    i : str
        Prompt shown to the user.

    Returns
    -------
    str
        Either 'y' or 'n'.
    '''
    while True:
        #short pause before every prompt (kept from the original tool)
        time.sleep(0.1)
        answer = input(i).strip().lower()
        if answer in ('y', 'n'):
            return answer
        #invalid reply: report it and ask again; real I/O errors raised by
        #input() are no longer swallowed, so they cannot cause an endless loop
        print("\nSorry, I didn't understand that.")

In [6]:
def recursive_split(test_df, encoded, cat='others'):
    '''
    Interactively split and label one category, recursing into every split.

    The rows of ``cat`` are printed, then the user either splits the category
    into sub-clusters (each of which is handled by a recursive call) or labels
    it by answering seven yes/no emotion questions.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame with "category" and "Text" columns.
    encoded : 2-D array
        Embeddings passed through to ``cluster``.
    cat : str
        Category to examine.

    Returns
    -------
    (pandas.DataFrame, dict)
        A copy of the frame with updated "category" values and a "labels"
        column, and a dictionary mapping each labeled category name
        (e.g. 'others.0.1') to a tuple of 'y'/'n' answers in the order
        (disgust, happy, anger, surprised, sad, fear, neutral).
    '''

    d = dict() #dictionary for storing category and labels (i.e. '0.0.1':'negative')
    df = test_df.copy()

    print("Looking at cluster: " + cat)
    print(examine_topics(df, cat)) #examine the branch

    #number of rows in this branch; clustering needs at least two of them
    n_rows = int((df['category'] == cat).sum())
    if n_rows < 2:
        print("\nThis cluster is too small to split.")
        split = 'n'
    else:
        split = get_input("Would you like to split?")
    print(" ")
    if split == 'y':

        #get number of clusters, re-asking until it is an integer that the
        #clustering step can actually honour
        while True:
            time.sleep(0.1)
            try:
                n_clusters = int(input('How many cluster do you want?'))
            except ValueError:
                print("\nSorry, I didn't understand that.")
                continue
            if 1 <= n_clusters <= n_rows:
                break
            print("\nPlease enter a number between 1 and " + str(n_rows) + ".")
        clear_output()

        #call the clustering function
        df = cluster(test_df, encoded, cat = cat, n_clusters=n_clusters)

        for i in range(n_clusters):
            branch = cat + '.' + str(i) #new branch name
            df, di = recursive_split(df, encoded, cat=branch) #recursively call this function
            d.update(di) #merge the labels collected in the sub-branch
    else:
        #one yes/no question per emotion, asked in a fixed order
        prompts = (
            'Is this cluster disgust?',
            'Is this cluster happy?',
            'Is this cluster angry?',
            'Is this cluster surprised?',
            'Is this cluster sad?',
            'Is this cluster fear?',
            'Is this cluster neutral?',
        )
        #a leaf is always stored under its own category name
        d[cat] = tuple(get_input(prompt) for prompt in prompts)

        time.sleep(0.1)
        clear_output()

    #rows whose category has no entry in d get NaN here
    df['labels'] = df['category'].map(d)
    return df, d

In [7]:
def labeling(test_df, model_name='paraphrase-distilroberta-base-v1'):
    '''
    Run the whole interactive labeling session.

    The texts are embedded with a sentence-transformers model, then every
    distinct value of the "emotions" column is used as a root category and
    handed to ``recursive_split``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame with "Text" and "emotions" columns. It is not modified.
    model_name : str
        Name passed to ``SentenceTransformer``.

    Returns
    -------
    pandas.DataFrame
        The labeled rows of every emotion group concatenated in the order the
        groups were processed, with "category" and "labels" columns added.
    '''
    model = SentenceTransformer(model_name) #create bert model

    #reset_index returns a new frame, so adding the root category below does
    #not leak a "category" column into the caller's DataFrame
    frame = test_df.reset_index(drop=True)
    frame["category"] = frame["emotions"] #root category

    texts = frame['Text'].astype(str).to_list()
    encoded = model.encode(texts) #encode the text using the bert model

    labeled_parts = []
    clear_output()
    for emotion in frame['emotions'].unique():
        labeled, _ = recursive_split(frame, encoded, cat=emotion)
        #keep only the rows of the group that was just labeled
        labeled_parts.append(labeled[labeled['emotions'] == emotion])

    return pd.concat(labeled_parts)

In [8]:
#start the interactive session; this cell blocks on input() prompts until
#every emotion group has been split or labeled
df = labeling(test_df)
df #display the labeled frame

Unnamed: 0.1,Unnamed: 0,Text,emotions,category,labels
0,1,$6 for 15 miles I’m out here trying to make it...,others,others.0,"(n, n, y, n, n, n, n)"
1,2,TURANOS PIZZA KITCHEN 🍕 #DOORDASH ORDERS 🎒 #GO...,others,others.0,"(n, n, y, n, n, n, n)"
2,3,Let us know if you need a car to rent by the h...,others,others.0,"(n, n, y, n, n, n, n)"
4,5,"If you’re ordering #doordash, tip as you would...",others,others.1.0,"(n, n, n, n, n, n, y)"
6,7,took today off don't want to drive the vehicle...,others,others.0,"(n, n, y, n, n, n, n)"
...,...,...,...,...,...
2202,2438,#DoorDash #Driver Shot to Death While #Deliver...,sadness,sadness,"(n, n, n, n, y, n, n)"
2910,3216,I got carsick on the way to work and it wasn’t...,sadness,sadness,"(n, n, n, n, y, n, n)"
2959,3275,"Welp, we did it 😔 #uber #doordash https://t.co...",sadness,sadness,"(n, n, n, n, y, n, n)"
3116,3452,I thought I was going to make a few extra doll...,sadness,sadness,"(n, n, n, n, y, n, n)"


In [9]:
#convert to csv
#index=False: the index is not written, so reading the file back does not
#add yet another "Unnamed: 0" column like the ones in the table above
df.to_csv('doordash_labeled.csv', index=False)