In [1]:
#imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")

#read csv
df = pd.read_csv('cleaned_non-vectorized_serve_data.csv')
df = df[~df['text'].isna()]
df

Unnamed: 0,text,company,created_at,tokens
0,lowertown wine amp spirit delivers 4pm free see,grubhub,Mon Jul 18 15:59:52 +0000 2022,"['lowertown', 'wine', 'amp', 'spirit', 'delive..."
1,maureentiti hi thank contacting 're sorry inco...,grubhub,Mon Jul 18 15:46:14 +0000 2022,"['maureentiti', 'hi', 'thank', 'contacting', ""..."
2,“ …in recent month ride-sharing company operat...,grubhub,Mon Jul 18 15:29:13 +0000 2022,"['“', '…in', 'recent', 'month', 'ride-sharing'..."
3,rt brandnewmcmann guy yet thatsmycatchphrase,grubhub,Mon Jul 18 15:06:53 +0000 2022,"['rt', 'brandnewmcmann', 'guy', 'yet', 'thatsm..."
4,maureentiti hi thank contacting could please d...,grubhub,Mon Jul 18 15:39:58 +0000 2022,"['maureentiti', 'hi', 'thank', 'contacting', '..."
...,...,...,...,...
745,work thing get order n't drink take 10 second ...,doordash,Mon Aug 01 00:53:25 +0000 2022,"['work', 'thing', 'get', 'order', ""n't"", 'drin..."
746,work thing get order n't drink take 10 second ...,ubereats,Mon Aug 01 00:53:25 +0000 2022,"['work', 'thing', 'get', 'order', ""n't"", 'drin..."
747,well wont use cuz get hacked easily ppl take r...,grubhub,Mon Jul 25 00:49:27 +0000 2022,"['well', 'wont', 'use', 'cuz', 'get', 'hacked'..."
748,well wont use cuz get hacked easily ppl take r...,doordash,Mon Jul 25 00:49:27 +0000 2022,"['well', 'wont', 'use', 'cuz', 'get', 'hacked'..."


In [2]:
# Tuning the parameters for tf-idf
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
ngram_range=(1,1)
max_df =0.8
min_df =1
max_features=None

# create vectorizer
vectorizer =  TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
X

<749x2862 sparse matrix of type '<class 'numpy.float64'>'
	with 8474 stored elements in Compressed Sparse Row format>

In [3]:
from sklearn.decomposition import NMF

#arbitrary number of components
n_components = 10

# factor the vectorized matrix into topic-document matrix and term-topic matrix
nmf = NMF(n_components=n_components, random_state=42)
topic_document_matrix = nmf.fit_transform(X)
term_topic_matrix = nmf.components_

In [4]:
import numpy as np
v = vectorizer.vocabulary_

topic_words=[]
for i in range(n_components):
    topic = term_topic_matrix[i] #extract the topic
    s = sorted(topic, reverse=True)[:10] #get highest scores
    top_words=[]
    for val in s: #loop though highest scores
        max_position = np.where(topic == val)[0][0] #position of term with highest score
        top_word = list(v.keys())[list(v.values()).index(max_position)] #get the top word
        top_words.append(top_word) #append top word
    print("The top 5 words in topic", i , "is", top_words)
    topic_words.append(top_words)

The top 5 words in topic 0 is ['order', 'get', 'food', 'driver', 'use', 'first', '10', 'refund', 'never', 'take']
The top 5 words in topic 1 is ['amp', 'donut', 'guy', 'area', 'la', 'wwe_mandyrose', 'cream', 'cooky', 'tried', 'available']
The top 5 words in topic 2 is ['ran', 'allegedly', 'allegedly', 'allegedly', 'allegedly', 'allegedly', 'allegedly', 'allegedly', 'allegedly', 'jersey']
The top 5 words in topic 3 is ['amazon', 'amid', 'amid', 'amid', 'amid', 'amid', 'amid', 'earnings', 'must', 'marketwatch']
The top 5 words in topic 4 is ['maxehrich', 'late', 'sleep', 'sleep', 'lol', 'amp', 'rt', 'night', 'love', 'upscalebutta']
The top 5 words in topic 5 is ['please', 'dm', 'number', 'email', 'account', 'address', 'look', 'phone', 'sorry', 'send']
The top 5 words in topic 6 is ['yes', 'age', 'age', 'impressive', 'instant', 'etc', 'generalwarphare', 'rt', 'today', 'money']
The top 5 words in topic 7 is ['prime', 'free', 'fee', 'eligible', 'delivery', 'grubhub', 'one', 'member', 'year'

In [5]:
topic_list=[] #Topic0, Topic1, ...
for i in range(10):
    topic_list.append('Topic'+str(i))

term_list=[] #Term1, Term2, ...
for i in range(1,11):
    term_list.append('Term'+str(i))

In [6]:
topic_df = pd.DataFrame(topic_words, index=topic_list, columns=term_list)
topic_df

Unnamed: 0,Term1,Term2,Term3,Term4,Term5,Term6,Term7,Term8,Term9,Term10
Topic0,order,get,food,driver,use,first,10,refund,never,take
Topic1,amp,donut,guy,area,la,wwe_mandyrose,cream,cooky,tried,available
Topic2,ran,allegedly,allegedly,allegedly,allegedly,allegedly,allegedly,allegedly,allegedly,jersey
Topic3,amazon,amid,amid,amid,amid,amid,amid,earnings,must,marketwatch
Topic4,maxehrich,late,sleep,sleep,lol,amp,rt,night,love,upscalebutta
Topic5,please,dm,number,email,account,address,look,phone,sorry,send
Topic6,yes,age,age,impressive,instant,etc,generalwarphare,rt,today,money
Topic7,prime,free,fee,eligible,delivery,grubhub,one,member,year,favorite
Topic8,ever,like,lmao,coldest,coldest,coldest,coldest,coldest,line,twitter
Topic9,rt,ailsaforshaw,chang,got,would,done,worker,man,nice,fired


In [7]:
topic_df.to_csv('topic_terms.csv')

### Add topics back into original df and do sampling  for each topic

In [8]:
topics=[]
for doc in topic_document_matrix:
    topic = np.argmax(doc)#get topic
    topics.append(topic)
    
df['topic']=topics
df

Unnamed: 0,text,company,created_at,tokens,topic
0,lowertown wine amp spirit delivers 4pm free see,grubhub,Mon Jul 18 15:59:52 +0000 2022,"['lowertown', 'wine', 'amp', 'spirit', 'delive...",1
1,maureentiti hi thank contacting 're sorry inco...,grubhub,Mon Jul 18 15:46:14 +0000 2022,"['maureentiti', 'hi', 'thank', 'contacting', ""...",5
2,“ …in recent month ride-sharing company operat...,grubhub,Mon Jul 18 15:29:13 +0000 2022,"['“', '…in', 'recent', 'month', 'ride-sharing'...",2
3,rt brandnewmcmann guy yet thatsmycatchphrase,grubhub,Mon Jul 18 15:06:53 +0000 2022,"['rt', 'brandnewmcmann', 'guy', 'yet', 'thatsm...",9
4,maureentiti hi thank contacting could please d...,grubhub,Mon Jul 18 15:39:58 +0000 2022,"['maureentiti', 'hi', 'thank', 'contacting', '...",5
...,...,...,...,...,...
745,work thing get order n't drink take 10 second ...,doordash,Mon Aug 01 00:53:25 +0000 2022,"['work', 'thing', 'get', 'order', ""n't"", 'drin...",0
746,work thing get order n't drink take 10 second ...,ubereats,Mon Aug 01 00:53:25 +0000 2022,"['work', 'thing', 'get', 'order', ""n't"", 'drin...",0
747,well wont use cuz get hacked easily ppl take r...,grubhub,Mon Jul 25 00:49:27 +0000 2022,"['well', 'wont', 'use', 'cuz', 'get', 'hacked'...",0
748,well wont use cuz get hacked easily ppl take r...,doordash,Mon Jul 25 00:49:27 +0000 2022,"['well', 'wont', 'use', 'cuz', 'get', 'hacked'...",0


In [9]:
df.to_csv('tweets_with_topics.csv')