In [1]:
from collections import Counter

import transformers
import pandas as pd
from lbl2vec import Lbl2TransformerVec

In [6]:
# Load the processed reviews and pull out the partially cleaned review text.
data = pd.read_csv("../data/processed/reviews.csv")
text = list(data["partially_cleaned_text"])

# Seed keywords for each topic, one keyword list per topic.
labels = [
    ['drinks', 'tea', 'coffee', 'juice', 'soda'],
    ['snacks', 'nuts', 'chips', 'crackers', 'cereal', 'candy'],
    ['ingredients', 'sugar', 'salt', 'oil', 'seasoning', 'spices'],
    ['baked goods', 'pastries', 'cookies', 'cocoa', 'chocolate'],
    ['peanut butter', 'almond butter', 'apple butter'],  # TODO: maybe add sauces?
    ['noodles',  'pasta', 'ramen', 'udon'],
    ['pet', 'dog', 'cat'],
]

# Display name per topic index; the keys line up with the positions in `labels`.
labels_num = dict(enumerate([
    'drinks',
    'snacks',
    'ingredients',
    'baked goods',
    'spreads',
    'noodles & pasta',
    'pets',
]))

# Lbl2Vec



In [3]:
# Take the documents straight from the DataFrame instead of the module-level
# name `text`: a later cell runs `from sklearn.feature_extraction import text`,
# which rebinds `text` to a module. Re-running this cell after that import
# would otherwise pass the module, not the reviews, as `documents`.
model = Lbl2TransformerVec(
    keywords_list=labels,
    documents=list(data["partially_cleaned_text"]),
)

# train model
model.fit()

# Predictions for the documents the model was fitted on; the columns
# 'most_similar_label' and 'highest_similarity_score' are read from it below.
preds = model.predict_model_docs()

2023-03-23 14:37:40,101 - Lbl2TransformerVec - INFO - Compute keyword embeddings
2023-03-23 14:37:40,620 - Lbl2TransformerVec - INFO - Compute document embeddings
2023-03-23 14:43:20,179 - Lbl2TransformerVec - INFO - Train label embeddings
2023-03-23 14:43:21,646 - Lbl2TransformerVec - INFO - Get document embeddings from model
2023-03-23 14:43:21,647 - Lbl2TransformerVec - INFO - Calculate document<->label similarities


In [10]:
# Copy the best-matching label and its similarity score onto the reviews frame.
for target_column, source_column in (
    ("pred_topic", 'most_similar_label'),
    ("score", 'highest_similarity_score'),
):
    data[target_column] = preds[source_column]

In [11]:
# Translate "label_<n>" into its display name. The index is parsed from
# everything after the last underscore rather than from the final character
# only, so topic indices of 10 and above resolve to the right entry.
data["pred_topic_label"] = data["pred_topic"].apply(
    lambda x: labels_num[int(x.rsplit("_", 1)[-1])]
)

In [12]:
# Show the reviews together with the predicted topic, its score and the topic name.
data

Unnamed: 0,date,partially_cleaned_text,sentiment,cleaned_text,pred_topic,score,pred_topic_label
0,18/6/21,This is a very healthy dog food. Good for thei...,1,healthy dog food good digestion also good smal...,label_2,0.388768,ingredients
1,7/7/21,I've been very pleased with the Natural Balanc...,1,pleased natural balance dog food dogs issues d...,label_2,0.352458,ingredients
2,18/6/21,"Before I was educated about feline nutrition, ...",1,educated feline nutrition allowed cats become ...,label_5,0.418897,noodles & pasta
3,7/7/21,"My holistic vet recommended this, along with a...",1,holistic vet recommended along brands tried ca...,label_6,0.473010,pets
4,1/7/21,I bought this coffee because its much cheaper ...,1,bought coffee much cheaper ganocafe organic re...,label_5,0.463735,noodles & pasta
...,...,...,...,...,...,...,...
5439,26/2/21,"This is an okay gift box, only if you like med...",0,okay gift box like mediocre cheese summer saus...,label_5,0.332107,noodles & pasta
5440,18/12/19,It looks llike I just walked into a raw deal. ...,0,looks llike walked raw deal item intolerably s...,label_2,0.409528,ingredients
5441,19/1/20,Thank god that i tasted the metal before i swa...,0,thank god tasted metal swallowed even get got ...,label_3,0.509871,baked goods
5442,13/9/20,This product was very good when I began buying...,0,product good began buying lately terrible tast...,label_0,0.494100,drinks


In [13]:
# Save the labelled reviews as an Excel workbook (path is relative to the working directory).
data.to_excel(excel_writer="predictions_lbl2vec_2.xlsx")

In [51]:
import itertools

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text


def get_top_words(corpus, n_words=6, stop_words=None):
    """Return the terms with the largest summed TF-IDF weight in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorise.
    n_words : int, default 6
        Number of top-ranked terms to keep.
    stop_words : list of str, optional
        Words to exclude from the vocabulary. When omitted, the module-level
        ``my_stop_words`` list is used (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        Columns ``"top words"`` and ``"count"``, ordered by descending weight.
        Note that ``"count"`` holds the summed TF-IDF weight of each term,
        not a raw occurrence count. The frame is empty when ``corpus``
        contains no documents.
    """
    result_columns = ["top words", "count"]

    # Materialise once so one-shot iterables work and emptiness can be checked.
    documents = list(corpus)
    if not documents:
        # TfidfVectorizer cannot build a vocabulary from zero documents;
        # return an empty result instead of failing the whole figure.
        return pd.DataFrame(columns=result_columns)

    if stop_words is None:
        stop_words = my_stop_words

    vec = TfidfVectorizer(stop_words=stop_words)
    # fit_transform learns the vocabulary and vectorises in a single pass.
    bag_of_words = vec.fit_transform(documents)
    # Column sums: one total weight per vocabulary term (1 x n_terms matrix).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

    return pd.DataFrame(words_freq[:n_words], columns=result_columns)

def visualise_top_words(df, n_topics):
    """Build a grid of horizontal bar charts showing the top words per topic.

    Topics ``0 .. n_topics - 1`` are looked up in the module-level
    ``labels_num`` mapping. For each topic, the rows of ``df`` whose
    ``"pred_topic_label"`` equals the topic name are selected and their
    ``"cleaned_text"`` is passed to ``get_top_words``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"pred_topic_label"`` and ``"cleaned_text"``.
    n_topics : int
        Number of topics (subplots) to draw.

    Returns
    -------
    plotly figure
        The figure created by ``make_subplots``, four subplots per row.
    """
    columns = 4
    rows = int(np.ceil(n_topics / columns))
    topic_names = [labels_num[topic] for topic in range(n_topics)]
    palette = itertools.cycle(
        ["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"]
    )

    fig = make_subplots(
        rows=rows,
        cols=columns,
        shared_xaxes=False,
        horizontal_spacing=.1,
        vertical_spacing=.4 / rows if rows > 1 else 0,
        subplot_titles=topic_names,
    )

    for position, (topic_name, bar_color) in enumerate(zip(topic_names, palette)):
        # Fill the grid left to right, top to bottom (plotly cells are 1-based).
        grid_row, grid_col = divmod(position, columns)

        in_topic = df["pred_topic_label"] == topic_name
        freq_df = get_top_words(df[in_topic]["cleaned_text"])

        bars = go.Bar(
            x=freq_df["count"],
            y=freq_df["top words"],
            orientation='h',
            marker_color=bar_color,
        )
        fig.add_trace(bars, row=grid_row + 1, col=grid_col + 1)

    # A single-row figure gets a little extra height.
    figure_height = 250 * rows if rows > 1 else 250 * 1.3
    title_settings = dict(
        text="Top Words",
        x=.5,
        xanchor='center',
        yanchor='top',
        font=dict(size=22, color="Black"),
    )
    hover_settings = dict(bgcolor="white", font_size=16, font_family="Rockwell")

    fig.update_layout(
        template="plotly_white",
        showlegend=False,
        title=title_settings,
        width=1000,
        height=figure_height,
        hoverlabel=hover_settings,
    )

    return fig

# Extra words to exclude on top of scikit-learn's built-in English stop words.
custom_sw = [
    'great', 'taste', 'good', 'like', 'product', 'flavor',
    'love', 'really', 'buy', 'tastes', 'better', 'best',
    'tried', 'use', 'eat', 'food', 'make',
]
# Combined stop-word list read by get_top_words.
my_stop_words = list(text.ENGLISH_STOP_WORDS | set(custom_sw))

In [52]:
# Draw one subplot per entry in labels_num; deriving the count from the
# mapping keeps the figure in sync if topics are added or removed.
visualise_top_words(data, len(labels_num))