# Setup: scrape the tweets and build the test DataFrame

In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd

# Search string: the hashtag plus a lower bound on the tweet date
query='#doordash'
start_date="2022-06-01"
search_term=query+" since:"+start_date

# Using TwitterSearchScraper to scrape data; only the text and the language code of each tweet are kept
# NOTE(review): nothing limits how many tweets are fetched - confirm that an unbounded scrape is intended
tweets_list = [
    [tweet.content, tweet.lang]
    for tweet in sntwitter.TwitterSearchScraper(search_term).get_items()
]

# Creating a dataframe from the tweets list above
tweets_df = pd.DataFrame(tweets_list, columns=['Text','Language'])

In [2]:
SAMPLE_SIZE = 1000 # only use 1000 tweets to save space
SAMPLE_SEED = 0 # fixed seed so that re-running the notebook draws the same tweets

tweets_df = tweets_df[tweets_df['Language']=='en'] #filter to only tweets with english as the language

# DataFrame.sample raises a ValueError when asked for more rows than exist,
# so never request more than the number of english tweets that were scraped
test_df = tweets_df.sample(min(SAMPLE_SIZE, len(tweets_df)), random_state=SAMPLE_SEED)
test_df = test_df.loc[:,['Text']]#select the text column
test_df

Unnamed: 0,Text
2243,"I browse #Doordash, #Eats, #Postmates #grubhub..."
1830,"DASH Stock Idea\n\n""I would consider entering ..."
268,Today is the BIG day! 🎉OurDoorDash storefront ...
1854,@DoorDash your policy of double billing custom...
3,"🤠🐴If You Are Tired And Hungry, Count On Rancho..."
...,...
1990,Happy National CHEESE DAY!\n\nDid you know we ...
1308,"Ummm #excuse us, we are #now on #doordash #ord..."
1494,My gas prices have gone up $1.60 per gallon in...
1817,Today was actually a really good day! I went d...


In [3]:
# TODO: preprocess the text here (lemmatization, stop-word removal, etc.) before running the tool

# Begin Tool: interactive clustering and labeling

In [4]:
#imports for the tool
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from IPython.display import clear_output
import time

In [5]:
def cluster(test_df, encoded, cat = '0', n_clusters = 2):
    '''
    Copied from MyVoice Data Challenge 2021
    Used to generate clusters (splitting)

    Splits the rows whose "category" equals cat into n_clusters groups using
    complete-linkage agglomerative clustering on cosine distance, and appends
    ".<cluster label>" to the category of each of those rows.

    test_df: dataframe with a "category" column holding strings
    encoded: 2-D array of embeddings, indexed with the index labels of test_df
    cat: name of the category to split
    n_clusters: number of sub-categories to create

    returns a copy of test_df with the updated categories (test_df itself is not changed)
    raises ValueError if cat has fewer than two rows or fewer rows than n_clusters
    '''
    df = test_df.copy()

    #get index of rows with category
    # NOTE(review): the labels are used as row positions in encoded, which assumes
    # the index is 0..n-1 (e.g. after reset_index) - confirm for other callers
    indices = df[df["category"] == cat].index.tolist()

    # fail early with a readable message instead of an error from inside scikit-learn
    if len(indices) < 2 or len(indices) < n_clusters:
        raise ValueError(
            "Cannot split category %r into %d clusters: it has %d row(s)."
            % (cat, n_clusters, len(indices))
        )

    # use agglomerative clustering
    options = dict(distance_threshold = None, n_clusters = n_clusters, linkage = "complete")
    try:
        # newer scikit-learn releases name the distance argument "metric"
        model = AgglomerativeClustering(metric = 'cosine', **options)
    except TypeError:
        # older releases only know the previous name "affinity"
        model = AgglomerativeClustering(affinity = 'cosine', **options)
    clustering = model.fit(encoded[indices,:])

    # create sub-categories after the split
    df.loc[indices,"category"] = df.loc[indices,"category"] + "." + (clustering.labels_).astype(str)

    return df

In [6]:
def examine_topics(test_df, branch):
    '''
    returns the Text column of the rows that belong to a certain branch (split)

    test_df: dataframe with "Text" and "category" columns
    branch: category name to look up, e.g. '0.1'
    '''
    snapshot = test_df.copy() #work on a copy of the input
    in_branch = snapshot['category'] == branch #rows that are part of the branch
    return snapshot[in_branch].Text #show the text

In [7]:
def get_input(i):
    '''
    asks the question i until the answer is y or n, then returns that answer

    capitalisation and surrounding whitespace are ignored, so "Y" or " n " are accepted
    the returned value is always the lowercase letter 'y' or 'n'
    '''
    while True:
        # NOTE(review): the short pause presumably lets the notebook show pending output before the prompt - confirm
        time.sleep(0.1)
        answer = input(i).strip().lower()
        if answer in ('y', 'n'):
            return answer
        print("\nSorry, I didn't understand that.")

In [8]:
def _ask_cluster_count():
    '''
    asks for the number of clusters until a whole number of at least 1 is entered
    '''
    while True:
        time.sleep(0.1)
        try:
            n_clusters=int(input('How many cluster do you want?'))
        except ValueError:
            print("\nSorry, I didn't understand that.")
            continue
        if n_clusters < 1:
            # zero or a negative count cannot be clustered, so ask again
            print("\nSorry, I didn't understand that.")
            continue
        return n_clusters


def _ask_emotions():
    '''
    asks the six emotion questions and returns the y/n answers as a tuple in the order
    (sad, happy, anger, surprised, disappointment, neutral)
    '''
    questions = (
        'Is this cluster sad?',
        'Is this cluster happy?',
        'Is this cluster angry?',
        'Is this cluster surprised?',
        'Is this cluster disappointed?',
        'Is this cluster neutral?',
    )
    return tuple(get_input(question) for question in questions)


def recursive_split(test_df, encoded, cat='0'):
    '''
    recursive function for input, splitting, and labeling

    Splits the category cat into as many clusters as the user asks for, shows each
    new cluster, and either splits it further (recursively) or asks the emotion
    questions for it.

    test_df: dataframe with "Text" and "category" columns
    encoded: embeddings passed on unchanged to cluster
    cat: name of the category to split

    returns (df, d): df is the dataframe with the final "category" values and a
    "labels" column; d maps every labeled branch name to its tuple of y/n answers
    (i.e. '0.0.1': ('n','y','n','n','n','n'))
    '''
    #get number of clusters
    n_clusters = _ask_cluster_count()
    clear_output()

    # call the clustering function
    df = cluster(test_df, encoded, cat = cat, n_clusters=n_clusters)

    d = dict() #dictionary for storing category and labels
    for i in range(n_clusters):
        branch= cat+'.'+str(i) #new branch name
        print("Looking at cluster: " + branch)
        print(examine_topics(df, branch)) #examine the branch

        split = get_input("Would you like to split?")
        print(" ")
        if split =='y':
            df, di = recursive_split(df, encoded, cat=branch) #recursively call this function
            d.update(di) #merge the recursive dictionary into the current dictionary
        else:
            d[branch] = _ask_emotions() #update dictionary labels

        time.sleep(0.1)
        clear_output()

    # rows whose category is not a key of d get NaN as their label
    df['labels'] = df['category'].map(d)
    return df, d

In [9]:
def labeling(test_df):
    '''
    puts everything together

    Encodes the "Text" column with a sentence-transformers model and starts the
    interactive splitting and labeling at the root category '0'.

    test_df: dataframe with a "Text" column; it is left unchanged
    returns the (dataframe, dictionary) pair produced by recursive_split
    '''
    model = SentenceTransformer('paraphrase-distilroberta-base-v1') #create the sentence embedding model

    # reset_index gives a new dataframe, so the category column is added to that
    # copy and not to the caller's dataframe; the fresh 0..n-1 index is what
    # cluster uses to pick rows out of the encoded array
    df = test_df.reset_index(drop=True)
    df["category"] = '0' #root category

    li = df['Text'].astype(str).to_list()
    encoded = model.encode(li)#encode the text using the model

    return recursive_split(df, encoded, cat='0')

In [10]:
# Run the interactive labeling session on the sampled tweets.
# df is the dataframe with the "category" and "labels" columns, d is the branch -> answers dictionary.
df, d = labeling(test_df)
df #display the labeled dataframe

Unnamed: 0,Text,category,labels
0,"I browse #Doordash, #Eats, #Postmates #grubhub...",0.0,"(n, y, n, n, n, n)"
1,"DASH Stock Idea\n\n""I would consider entering ...",0.0,"(n, y, n, n, n, n)"
2,Today is the BIG day! 🎉OurDoorDash storefront ...,0.0,"(n, y, n, n, n, n)"
3,@DoorDash your policy of double billing custom...,0.0,"(n, y, n, n, n, n)"
4,"🤠🐴If You Are Tired And Hungry, Count On Rancho...",0.0,"(n, y, n, n, n, n)"
...,...,...,...
995,Happy National CHEESE DAY!\n\nDid you know we ...,0.0,"(n, y, n, n, n, n)"
996,"Ummm #excuse us, we are #now on #doordash #ord...",0.0,"(n, y, n, n, n, n)"
997,My gas prices have gone up $1.60 per gallon in...,0.0,"(n, y, n, n, n, n)"
998,Today was actually a really good day! I went d...,0.0,"(n, y, n, n, n, n)"
