In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
# NOTE(review): this silences every warning for the whole session, including
# any convergence or deprecation warnings raised by later cells. Consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Load the cleaned dataset and keep only the rows that have text to vectorize.
# The original index labels are kept, so they are not contiguous after the
# filter.
df = pd.read_csv('cleaned_non-vectorized_data.csv')
# .copy() makes the filtered frame independent of the frame that was read, so
# adding a column to `df` later is a plain assignment rather than a write into
# a slice (which pandas flags with SettingWithCopyWarning).
df = df[~df['Text'].isna()].copy()
df

Unnamed: 0,Text,tokens,disgust,joy,anger,surprised,sad,fear,neutral
0,come mert ’ today let u take care lunch enjoy ...,"['come', 'mert', '’', 'today', 'let', 'u', 'ta...",0,0,0,0,0,0,1
1,nxt gt lay 20 staff tech 's latest cutback rb_...,"['nxt', 'gt', 'lay', '20', 'staff', 'tech', ""'...",0,0,0,0,0,0,1
2,layoff 20 workforce 100 employee sf bay area,"['layoff', '20', 'workforce', '100', 'employee...",0,0,0,0,0,0,1
3,today ’ lunch special smoked pork sausage onio...,"['today', '’', 'lunch', 'special', 'smoked', '...",0,0,0,0,0,0,1
4,come mert ’ today grab salmon cake two home co...,"['come', 'mert', '’', 'today', 'grab', 'salmon...",0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
9277,traik01 cdc people warmed u 2 year ago .... sa...,"['traik01', 'cdc', 'people', 'warmed', 'u', '2...",0,0,0,0,1,0,0
9278,sorry ’ promo code share lately 😭 promos autom...,"['sorry', '’', 'promo', 'code', 'share', 'late...",0,0,0,0,1,0,0
9279,poor lad,"['poor', 'lad']",0,0,0,0,1,0,0
9280,one day able bill order tmobile bill sadly tod...,"['one', 'day', 'able', 'bill', 'order', 'tmobi...",0,0,0,0,1,0,0


In [2]:
# Tuning the parameters for tf-idf
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
ngram_range=(1,1)    # unigrams only
max_df =0.8          # ignore terms that occur in more than 80% of the documents
min_df =1            # keep terms that occur in at least one document
max_features=None    # no cap on the vocabulary size

# create vectorizer
# The tuning values above must be passed explicitly; a bare TfidfVectorizer()
# ignores them and runs with the library defaults.
vectorizer = TfidfVectorizer(
    ngram_range=ngram_range,
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
)
# Learn the vocabulary and IDF weights from the text column and build the
# sparse document-term matrix (one row per row of df).
X = vectorizer.fit_transform(df['Text'])
X

<9226x14176 sparse matrix of type '<class 'numpy.float64'>'
	with 107101 stored elements in Compressed Sparse Row format>

In [3]:
# Factorize the TF-IDF matrix X with non-negative matrix factorization (NMF)
# to extract topics.
from sklearn.decomposition import NMF

# Number of topics to extract; later cells loop over range(n_components).
n_components = 10
# random_state is fixed so the factorization is repeatable between runs.
nmf = NMF(n_components=n_components, random_state=42)
# Fit the model on X and return the transformed data: per the scikit-learn
# docs this has one row per row of X (document) and one column per component,
# i.e. the weight of each topic in each document.
topic_document_matrix = nmf.fit_transform(X)
# Fitted components: one row per topic, one column per vocabulary term
# (row i is indexed as "topic i" by the cell that prints the top words).
term_topic_matrix = nmf.components_

In [4]:
import numpy as np

# vocabulary_ maps term -> column index of the TF-IDF matrix. Invert it once
# so each column index can be turned back into its term with a dict lookup,
# instead of rebuilding key/value lists and scanning them for every word.
v = vectorizer.vocabulary_
index_to_term = {column: term for term, column in v.items()}

# Print the five highest-weighted terms of every topic.
for i in range(n_components):
    topic = term_topic_matrix[i]
    # Rank columns by descending weight. Working with column indices (rather
    # than looking weights up again by value) means tied weights map to
    # distinct terms instead of repeating the first match; the stable sort
    # keeps tied columns in ascending index order.
    top_columns = np.argsort(-topic, kind="stable")[:5]
    top_words = [index_to_term[int(column)] for column in top_columns]
    print("The top 5 words in topic", i , "is", top_words)

The top 5 words in topic 0 is ['done', 'motus', 'sa_adco', 'motoleksa', 'sa_acd']
The top 5 words in topic 1 is ['driver', 'order', 'food', 'delivery', 'tip']
The top 5 words in topic 2 is ['gazania', 'floral', 'abstract', 'pattern', 'flower']
The top 5 words in topic 3 is ['best', 'content', 'lifestyle', 'download', 'day']
The top 5 words in topic 4 is ['online', 'come', 'order', 'mert', 'today']
The top 5 words in topic 5 is ['code', 'promo', '2022', 'eats', 'delivery']
The top 5 words in topic 6 is ['eats', 'get', 'delivered', 'sanantoniofood', 'safood']
The top 5 words in topic 7 is ['ferobrakes', 'following', 'motus', 'sa_acd', 'liked']
The top 5 words in topic 8 is ['cbsclutch', 'motus', 'done', 'following', 'cbs']
The top 5 words in topic 9 is ['amazon', 'free', 'prime', 'year', 'member']


In [5]:
# Label every document with its dominant topic: the column holding the largest
# weight in that document's row of the NMF output. One vectorized argmax over
# axis 1 replaces the per-row Python loop.
# Note: argmax returns the first index on ties, so a row whose weights are all
# equal (e.g. all zeros) is labelled topic 0.
topics = np.argmax(topic_document_matrix, axis=1).tolist()

# Rows of topic_document_matrix are in the same order as the rows of df that
# were vectorized, so the labels line up positionally.
df['topic']=topics
df

Unnamed: 0,Text,tokens,disgust,joy,anger,surprised,sad,fear,neutral,topic
0,come mert ’ today let u take care lunch enjoy ...,"['come', 'mert', '’', 'today', 'let', 'u', 'ta...",0,0,0,0,0,0,1,4
1,nxt gt lay 20 staff tech 's latest cutback rb_...,"['nxt', 'gt', 'lay', '20', 'staff', 'tech', ""'...",0,0,0,0,0,0,1,5
2,layoff 20 workforce 100 employee sf bay area,"['layoff', '20', 'workforce', '100', 'employee...",0,0,0,0,0,0,1,5
3,today ’ lunch special smoked pork sausage onio...,"['today', '’', 'lunch', 'special', 'smoked', '...",0,0,0,0,0,0,1,4
4,come mert ’ today grab salmon cake two home co...,"['come', 'mert', '’', 'today', 'grab', 'salmon...",0,0,0,0,0,0,1,4
...,...,...,...,...,...,...,...,...,...,...
9277,traik01 cdc people warmed u 2 year ago .... sa...,"['traik01', 'cdc', 'people', 'warmed', 'u', '2...",0,0,0,0,1,0,0,1
9278,sorry ’ promo code share lately 😭 promos autom...,"['sorry', '’', 'promo', 'code', 'share', 'late...",0,0,0,0,1,0,0,5
9279,poor lad,"['poor', 'lad']",0,0,0,0,1,0,0,1
9280,one day able bill order tmobile bill sadly tod...,"['one', 'day', 'able', 'bill', 'order', 'tmobi...",0,0,0,0,1,0,0,4


In [6]:
# Show five example texts per topic as a sanity check of the topic labels.
# random_state pins the draw so the displayed examples are the same on every
# run. Sampling is without replacement, so this raises if a topic has fewer
# than five documents.
df.groupby(['topic']).sample(5, random_state=42).loc[:,['Text','topic']]

Unnamed: 0,Text,topic
7609,ferobrakes done n done motus,0
9079,motoleksa done♥️ motus,0
5474,pastakeith happy recipient 😁 amp pastakeith wi...,0
7650,sa_acd done💯❤️,0
7563,sa_adco done motus,0
7799,1/3 winnipeg deliverydrivers skipthedishes tir...,1
2661,care customer ’ give shit dashers doordashdown...,1
5735,know audience another cheap gimmicky marketing...,1
3745,never forget abomination… abomination obamasna...,1
6031,`` “ sometimes ’ get hour sometimes ’ get le k...,1
