Here we provide an overview of the models that we will use for the project.
The goal with this notebook is to play with the data and see how the models perform.

The main goal is to provide a pipeline that can be used to analyze the reviews, to be able to answer our first research question. Tasks are : 
1) what is the sentiment of the review
2) what styles of beers exist
3) what emotions a beer is triggering

## Sentiment analysis

In [8]:
from src.models.sentiment_analysis_model import SentimentAnalysisPipeline

sentiment_analysis = SentimentAnalysisPipeline()
print(sentiment_analysis.predict("I love this beer"))
print(sentiment_analysis.predict("I hate this beer"))

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'POS', 'score': 0.9920633435249329}]
[{'label': 'NEG', 'score': 0.9812145233154297}]


## Topics analysis

In [13]:
from src.models.lda_topics_analysis_model import LDAAnalysis

# sample corpus with two main topics : computer science and health

reviews = ["The computer network relies on secure software for data protection.",
           "A balanced diet and regular exercise are key to good health.",
           "Programming AI systems requires a deep understanding of data.",
           "Medicine and treatment plans are developed by the doctor for wellness.",
           "Internet connectivity is essential for accessing modern software tools.",
           "Exercise improves overall wellness and complements a healthy diet."
           ]

lda_analysis = LDAAnalysis(reviews)
lda_analysis.load_dataset()
lda_analysis.preprocess()
lda_analysis.train_lda()
lda_analysis.print_topics(num_words=2)
print("as we can see the output makes sense we have a first topic about software and a second topic about exercise")

Loaded dataset with 6 reviews.
starting preprocess
preprocessing completed
LDA model training completed.
[(0, '0.816*"software" + 0.184*"exercise"'),
 (1, '0.815*"exercise" + 0.185*"software"')]
as we can see the output makes sense we have a first topic about software and a second topic about exercise


## Emotion analysis (embeddings)

In [14]:
from src.models.emotions_analysis_model import EmotionsAnalysisPipeline

EmotionsAnalysisPipeline().analyse("This beer is very surprising, I didn't expect it to be so good.")

surprise                      : 0.3949018120765686
disgust                       : 0.16275370121002197
happiness                     : 0.11954759806394577
sadness                       : 0.11160004138946533
anger                         : 0.09775044023990631
fear                          : 0.06247026473283768
neutral                       : 0.10074402391910553


In [10]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

import sys
import os
data_path = os.path.abspath('../data')
sys.path.append(data_path)

import reviews_processing

In [2]:
%load_ext autoreload
%autoreload 2

In [19]:
sentiment_path = "/Users/marijazelic/Downloads/reviews2_df.pkl"
users_path = "/Users/marijazelic/Downloads/users.csv"

users_reviews = reviews_processing.Reviews(users_path, sentiment_path)
all_states=False
reviews_style = users_reviews.filter_beer_type()

In [None]:
reviews[reviews['']]

In [26]:
# Group by state, year, style and sentiment_label (positive / negative) and count per each
reviews_style_grouped_by = reviews_style.groupby(by=['state', 'year', 'general_style', 'sentiment_label'], group_keys=True).size().reset_index(name='count')

# Move sentiment_label to columns
pivot = pd.pivot_table(reviews_style_grouped_by, values='count', index=['state', 'year', 'general_style'], columns='sentiment_label').reset_index()

# Fill NaN values with 0
fill_nan = pivot.fillna(value=0)

# Normalize count of positive and negative labels
fill_nan['NEGATIVE'] = fill_nan['NEGATIVE'] / (fill_nan['NEGATIVE'] + fill_nan['POSITIVE'])
fill_nan['POSITIVE'] = fill_nan['POSITIVE'] / (fill_nan['NEGATIVE'] + fill_nan['POSITIVE'])

# Boolean to chose states
if not all_states:
    states = ['New York', 'California', 'New Hampshire', 'Wisconsin', 'Nevada', 'Pennsylvania', 'Virginia', 'Ohio', 'Florida', 'North Carolina', 'Arizona', 'Indiana', 'Georgia', 'Texas', 'South Carolina', 'Iowa', 'Kentucky']
    # Extract reviews coming from these states and from the specified year 
    filter_states = fill_nan[fill_nan['state'].isin(states)]
    
else:
    # Use all states
    filter_states = fill_nan

In [11]:
year_list = list(np.arange(2004, 2017, 1, dtype=int))
per_sentiment_filt = per_sentiment[per_sentiment['year'].isin(year_list)]

In [17]:
per_sentiment_filt.head()

sentiment_label,state,year,general_style,NEGATIVE,POSITIVE
252,Arizona,2004,IPA,0.073394,0.999274
253,Arizona,2004,Lager,0.466667,0.99356
254,Arizona,2004,Other Ale,0.119318,0.999231
255,Arizona,2004,Pale Ale,0.210884,0.998185
256,Arizona,2004,Pilsner,0.285714,0.988701


In [16]:
# Get unique years and beer styles
df = per_sentiment_filt
years = sorted(df['year'].unique())
beer_styles = df['general_style'].unique()

# Create a subplot layout: 4x2 grid
fig = make_subplots(
    rows=4, cols=2, 
    subplot_titles=beer_styles,
    shared_xaxes=True
)

# Create traces for each year for each beer style
# Store traces in a dictionary for easier management
traces = {year: [] for year in years}

for year in years:
    filtered_df = df[df['year'] == year]
    row, col = 1, 1
    for style in beer_styles:
        style_df = filtered_df[filtered_df['general_style'] == style]

        # Add POSITIVE bar
        traces[year].append(go.Bar(
            x=style_df['state'],
            y=style_df['POSITIVE'],
            name='POSITIVE',
            marker_color='green',
            showlegend=True  # Show legend only in the first subplot
        ))

        # Add NEGATIVE bar
        traces[year].append(go.Bar(
            x=style_df['state'],
            y=style_df['NEGATIVE'],
            name='NEGATIVE',
            marker_color='red',
            showlegend=False
        ))

        # Add the traces to the subplot
        fig.add_trace(traces[year][-2], row=row, col=col)
        fig.add_trace(traces[year][-1], row=row, col=col)

        # Update subplot positions
        col += 1
        if col > 2:
            col = 1
            row += 1

# Set up slider steps for each year
steps = []
for year in years:
    step = dict(
        method="update",
        args=[
            {"visible": [False] * len(years) * len(beer_styles) * 2},  # Start with all traces invisible
            {"title": f"Sentiment Analysis by Beer Style for {year}"}
        ],
    )

    # Make current year's traces visible
    for i in range(len(beer_styles) * 2):
        step["args"][0]["visible"][i + (years.index(year) * len(beer_styles) * 2)] = True

    steps.append(step)

# Create slider
sliders = [dict(
    active=years.index(2004),
    currentvalue={"prefix": "Year: "},
    steps=steps
)]

# Update layout
fig.update_layout(
    sliders=sliders,
    barmode='stack',
    height=1000,  # Adjust for 4x2 layout
    template="plotly_white",
    title=f"Sentiment Analysis by Beer Style for {years[0]}"
)

# Show the figure
fig.show()