In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re

# Initializing stemmer and Lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Making sure the NLTK data this notebook relies on is installed before it is used.
# stopwords.words() raises a LookupError when the corpus is missing, and the cell that
# downloads it sits further down in the notebook, so a fresh environment would
# otherwise fail right here. Data that is already present is not downloaded again.
for resource_path, package in (('corpora/stopwords', 'stopwords'), ('corpora/wordnet', 'wordnet')):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package)

# Stored as a set so the per-token membership test in preprocess_text is a hash lookup
stop_words = set(stopwords.words('english'))

# Preprocessing the text (incorporating stemming and lemmatization)
def preprocess_text(text):
    """Normalize a review into a space-separated string of stemmed, lemmatized tokens.

    Steps: lowercase, delete every character that is not a-z or whitespace, split on
    whitespace, remove English stop words, then stem and lemmatize each remaining token.

    Args:
        text: Raw review text. Non-string values (e.g. the NaN pandas uses for an
            empty CSV cell, or None) are treated as an empty review.

    Returns:
        The processed tokens joined by single spaces; '' when nothing is left.
    """
    # Missing reviews arrive as float NaN / None, which have no .lower() method
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Characters are deleted rather than replaced by a space, so "well-balanced"
    # collapses into the single token "wellbalanced"
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()

    # Removing the stop words and applying both stemming and lemmatization.
    # Stop words are matched on the raw token, before stemming alters its spelling.
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# TODO: Decide whether to apply stemming and lemmatization at all. For the most basic cosine similarity, a plain
# bag-of-words model with no text normalization might illustrate the point more clearly.

# Loading in the scraped beer reviews
beer_reviews = pd.read_csv('beer_reviews.csv')

# Applying text preprocessing to all reviews.
# Empty cells are read by pandas as NaN (a float, not a string), so they are normalized
# to empty strings before the text processing runs.
# NOTE: this overwrites the raw review text with the processed version, which is also
# what gets written to the CSV below.
beer_reviews['Review'] = beer_reviews['Review'].fillna('').astype(str).apply(preprocess_text)

# The 3 determined important attributes from Part B
attributes = ["dark", "chocolate", "sweet"]

# Preprocessing the attributes the same way as the reviews so their tokens line up
attributes_str = preprocess_text(' '.join(attributes))

# Initializing the Bag-of-Words Model
vectorizer = CountVectorizer()

# Fitting the vocabulary on the reviews only, then mapping the attributes onto that vocabulary
review_vectors = vectorizer.fit_transform(beer_reviews['Review'])
attributes_vector = vectorizer.transform([attributes_str])

# Calculating the cosine similarity between each review and the 3 important attributes
# (flattened to one score per review)
sim_scores = cosine_similarity(review_vectors, attributes_vector).flatten()

# Adding the similarity scores as a column to the DataFrame
beer_reviews['similarity_score'] = sim_scores

# Storing the results as a DataFrame; .copy() keeps it independent of beer_reviews
sim_scores_df = beer_reviews[['Beer', 'Review', 'similarity_score']].copy()

# Saving the results to a CSV output file
sim_scores_df.to_csv('similarity_scores.csv', index=False)

# Printing out the first 5 results (for visualization)
print(sim_scores_df.head())

                          Beer  \
0  Kentucky Brunch Brand Stout   
1  Kentucky Brunch Brand Stout   
2  Kentucky Brunch Brand Stout   
3  Kentucky Brunch Brand Stout   
4  Kentucky Brunch Brand Stout   

                                              Review  similarity_score  
0  sampl breweri bottl version beer pour viscou b...          0.174078  
1  perfect barrel age stout overli sweet nice bar...          0.154303  
2  flirtat mapl come crescendo toppl goliath impe...          0.347960  
3  flirtat mapl come crescendo toppl goliath impe...          0.347960  
4  tap tg part kbb releas day rate version perfec...          0.099015  


In [None]:
# Downloading the NLTK data packages (tokenizer models, WordNet, stop word lists)
for package_name in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(package_name)