In [1]:
# Import all necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import sys


In [2]:
# Read only the first N rows of the ratings CSV into a pandas DataFrame
N = 1_000  # number of rows of data to read in
fid = "../../../MatchedBeerData/ratings.csv"
rawdata = pd.read_csv(filepath_or_buffer=fid, nrows=N)

# Preview the top of the frame.
# NOTE: in the preview, row 0 holds the field names (abv, appearance, ...)
# while the column labels themselves are "ba", "ba.1", ... -- the next cell
# promotes row 0 to the header.
print(rawdata.head(n=5))

     ba        ba.1   ba.2     ba.3  \
0   abv  appearance  aroma  beer_id   
1  11.3         4.5    4.5      645   
2   5.0         NaN    NaN    28191   
3   5.0         3.5    3.5    57911   
4   5.0         4.0    3.5    57913   

                                            ba.4        ba.5  \
0                                      beer_name  brewery_id   
1                        Trappistes Rochefort 10         207   
2                             Myanmar Lager Beer        9369   
3  Cantillon Tyrnilambic Baie D’Argousier Lambic         388   
4              Cantillon Pikkulinnun Viskilambic         388   

                             ba.6        ba.7     ba.8    ba.9  ...  \
0                    brewery_name        date  overall  palate  ...   
1          Brasserie de Rochefort  1324810800      5.0     4.5  ...   
2  Myanmar Brewery and Distillery  1322650800      NaN     NaN  ...   
3             Brasserie Cantillon  1344074400      4.0     4.0  ...   
4             Brasserie C

In [3]:
# Promote the first data row to the header and drop it from the body.
# pandas used the file's first line ("ba", "ba.1", ...) as column labels, so the
# field names (abv, appearance, ...) ended up in row 0 of rawdata.
header_row = rawdata.iloc[0]

# Work on an independent copy instead of assigning to rawdata.columns:
#   * rawdata keeps its original labels, so the preview shown by the loading
#     cell stays accurate and rawdata can still be reused as loaded;
#   * data is no longer a slice of rawdata, so later column assignments on
#     data cannot trigger SettingWithCopyWarning.
data = rawdata.iloc[1:].copy()
data.columns = header_row

# NOTE: the promoted labels are not unique (e.g. "text" appears twice), so
# selecting such a column by name returns a DataFrame rather than a Series.
# NOTE(review): the values were parsed together with the header strings, so
# columns are presumably object dtype -- confirm and cast before numeric work.

# Display the first few rows of the dataframe
print(data.head())

0   abv appearance aroma beer_id  \
1  11.3        4.5   4.5     645   
2   5.0        NaN   NaN   28191   
3   5.0        3.5   3.5   57911   
4   5.0        4.0   3.5   57913   
5   6.0        4.0   4.0   81125   

0                                      beer_name brewery_id  \
1                        Trappistes Rochefort 10        207   
2                             Myanmar Lager Beer       9369   
3  Cantillon Tyrnilambic Baie D’Argousier Lambic        388   
4              Cantillon Pikkulinnun Viskilambic        388   
5     Drie Fonteinen Oude Geuze - Armand & Tommy       2216   

0                    brewery_name        date overall palate  ...  \
1          Brasserie de Rochefort  1324810800     5.0    4.5  ...   
2  Myanmar Brewery and Distillery  1322650800     NaN    NaN  ...   
3             Brasserie Cantillon  1344074400     4.0    4.0  ...   
4             Brasserie Cantillon  1344074400     4.0    4.0  ...   
5           Brouwerij 3 Fonteinen  1346234400     4.0    4.

In [4]:
# Drop every row that has a NaN in any of the columns listed below
COLS_TO_DROP_NANS_FROM = ["text"] # Add more columns to this list if you want to drop rows with NaNs in any of those columns

# One dropna call over the whole list removes the same rows as dropping column
# by column (the default how="any" drops a row if any listed column is NaN).
# Passing the list also avoids relying on pandas accepting a bare string for
# `subset`, which older releases did not support.
data = data.dropna(subset=COLS_TO_DROP_NANS_FROM)

# Display the first few rows of the dataframe.
# NOTE: the "text" label occurs twice in data, so data["text"] is a
# two-column DataFrame here, not a Series.
print(data["text"].head())

0                                               text  \
1  Best before 27.07.2016Directly reviewed in com...   
3  Bottle @ One Pint Pub, Helsinki. 2006 vintage....   
4  Originally rated on 16.11.2009, draught @ Pikk...   
5  750ml bottle, originally rated on 18.8.2012.Bo...   
6  375ml bottle @ Pikkulintu, Helsinki. Originall...   

0                                               text  
1   a)  Geruch malzig-schwer-sÃ¼Ã. Riecht schon ...  
3  Bottle @ One Pint Pub, Helsinki. Originally ra...  
4  Draught @Â Pikkulintu, Helsinki, Finland. A pr...  
5  750ml bottleBottling date: 2011/02/17 - Pours ...  
6  375ml bottle @ Pikkulintu, HelsinkiPours orang...  


Possible Strategies for Sentiment Analysis:
- Rule Based:
    - Use lexicons (words/phrases as tokens) to signal GOOD and BAD 
    - Does not pick up on sarcasm, negation, or idiomatic phrases
- Machine Learning:
    - Naive Bayes (Classification)
        - Possibly the best option for understanding sentiment based on word occurrences.
        - Calculates the likelihood that a new review is + or - based on previously scored reviews.
        - Good on smaller datasets and text data (need to look into this)
    - Logistic Regression
        - Gives the probability of a binary outcome (GOOD or BAD) but also gives plenty of interpretability through coefficients.
        - Good on large datasets (may overfit without a large dataset)

__Goal__: Understand whether words like "fruity", "hoppy", or "bitter" are associated positively, negatively, or neutrally with review scores, in order to understand what makes a beer loved, hated, polarizing, or neutral. Then we can compare these words across regions/countries/climates to see how geographical context affects beer tastes. Some deliverables can include word clouds for the first part and heat maps for the second part relating geography to beers with certain descriptors.

__Possible Pipeline__:
- __Preprocess the reviews__: Before using the reviews for training, it’s essential to preprocess them to remove noise and standardize the text. Here are the common preprocessing steps:
    - __Convert to lowercase__: Convert all the reviews to lowercase letters. This step ensures that the model treats words with different cases as the same.
    - __Remove punctuation__: Remove any punctuation marks or special characters from the reviews. Punctuation does not contribute much to sentiment analysis and can be safely removed.
    - __Remove stopwords__: Stopwords are common words such as “the,” “is,” “and,” etc., which do not carry much sentiment. Remove these words from the reviews as they can introduce noise to the model.
    - __Stemming or lemmatization__: Perform stemming or lemmatization to reduce words to their base form. Stemming reduces a word to its root form by removing suffixes, while lemmatization brings words to their dictionary form. This step helps reduce the number of unique words and improves the model’s performance.
    - __Tokenization__: Split the sentences into individual words. This allows you to analyze each word separately and build the word count table later.
    
- __Split the Reviews by Category__:
    - 1–3 stars: Negative sentiment
    - 4–6 stars: Neutral sentiment
    - 7–10 stars: Positive sentiment

- __Visualization of Key Words__:
    - Use word clouds and/or frequency plots to show which words and phrases are most likely to be used in positive and negative reviews in particular countries.

- __Topic Identification and Distribution__: (Machine Learning)
    - Use Latent Dirichlet Allocation (LDA) to group keywords into topics (e.g. fruity flavor profile, malty and sweet flavors, bitter or hoppy profile, light and smooth texture, sour or tart flavors). We can then use these topics and their associated keywords and scores to identify which kinds of flavor profiles are most common, most liked, most hated, etc. in each region.
    



In [None]:
# Example Naive Bayes implementation from : https://www.geeksforgeeks.org/naive-bayes-vs-logistic-regression-in-machine-learning/

# Importing necessary libraries
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fetch the Iris dataset and separate the features from the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit both classifiers on the training split (fit() returns the estimator itself)
nb_model = GaussianNB().fit(X_train, y_train)
lr_model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predict labels for the held-out samples with each model
nb_predictions = nb_model.predict(X_test)
lr_predictions = lr_model.predict(X_test)

# Score the predictions against the true labels
nb_accuracy = accuracy_score(y_test, nb_predictions)
lr_accuracy = accuracy_score(y_test, lr_predictions)
nb_report = classification_report(y_test, nb_predictions)
lr_report = classification_report(y_test, lr_predictions)

# Print both accuracies first, then the per-class reports
print("Naive Bayes Accuracy:", nb_accuracy)
print("Logistic Regression Accuracy:", lr_accuracy)
print("\nNaive Bayes Classification Report:\n", nb_report)
print("\nLogistic Regression Classification Report:\n", lr_report)
