# Loading

In [1]:
import pandas as pd
import numpy as np
from langdetect import detect

In [2]:
ba_reviews = pd.read_parquet('../generated/new_ba_reviews.parquet')
rb_reviews = pd.read_parquet('../generated/new_rb_reviews.parquet')

In [3]:
ba_reviews = ba_reviews.drop(['beer_name', 'beer_id', 'brewery_name', 'brewery_id', 'style', 'abv', 'date', 'user_name' ,'appearance', 'aroma', 'palate', 'taste', 'overall', 'rating'], axis= 1)
rb_reviews = rb_reviews.drop(['beer_name', 'beer_id', 'brewery_name', 'brewery_id', 'style', 'abv', 'date', 'user_name' ,'appearance', 'aroma', 'palate', 'taste', 'overall', 'rating'], axis= 1)

In [4]:
ba_users = ba_reviews.groupby('user_id').agg(number_reviews= ('text', 'count')).sort_values(by= 'number_reviews').copy()
rb_users = rb_reviews.groupby('user_id').agg(number_reviews= ('text', 'count')).sort_values(by= 'number_reviews').copy()
ba_experts = ba_users[(ba_users['number_reviews'] >= 500) & (ba_users['number_reviews'] < 501)].reset_index().user_id
rb_experts = rb_users[(rb_users['number_reviews'] >= 500) & (rb_users['number_reviews'] < 501)].reset_index().user_id

In [5]:
ba_reviews_experts = ba_reviews[ba_reviews['user_id'].isin(ba_experts)].copy()
rb_reviews_experts = rb_reviews[rb_reviews['user_id'].isin(rb_experts)].copy()

In [6]:
def detect_language(text):
    try:
        return detect(text)
    except Exception as e:
        return "unknown"

In [7]:
ba_reviews_experts['language'] = ba_reviews_experts['text'].apply(detect_language)

In [8]:
rb_reviews_experts['language'] = rb_reviews_experts['text'].apply(detect_language)

In [9]:
print(ba_reviews_experts.user_id.unique(), ba_reviews_experts.language.unique())
print(ba_reviews_experts.user_id.unique(), rb_reviews_experts.language.unique())

['morey.182874' 'kitch.868336' 'blackop555.1183557' 'ggaughan.18442'
 'clavisaurea.232335'] ['en']
['morey.182874' 'kitch.868336' 'blackop555.1183557' 'ggaughan.18442'
 'clavisaurea.232335'] ['en']


In [10]:
ba_reviews_experts_en = ba_reviews_experts.copy()
rb_reviews_experts_en = rb_reviews_experts.copy()

In [12]:
display(ba_reviews_experts_en)
display(rb_reviews_experts_en)

Unnamed: 0,user_id,text,language
25,morey.182874,Pours reddish-brown in color with nice clarity...,en
2782,kitch.868336,"A 330ml foiled and capped bottle, bottle no. 0...",en
4542,blackop555.1183557,"Pours dark brown. Thick, short, light tan head...",en
6005,blackop555.1183557,"Pours dark brown. Thin, foamy, tan head fades ...",en
6232,blackop555.1183557,"Pours reddish brown. Thick, tall, off white he...",en
...,...,...,...
2581112,clavisaurea.232335,12oz pint glass from The Snow Goose Restaurant...,en
2586909,morey.182874,12oz bottle with no freshness date. Pours a ve...,en
2587972,ggaughan.18442,This beer poured a hazy golden color into a pi...,en
2588259,ggaughan.18442,I was looking forward to trying this after lov...,en


Unnamed: 0,user_id,text,language
5559,14135,The 12 ounce bottle poured a crystal clear lig...,en
7300,14135,"The 12 ounce clear bottle poured a clear, gold...",en
7443,14135,The 12 ounce bottle poured a polished pale yel...,en
13605,14135,"The 12 ounce bottle poured a clear, golden col...",en
26861,14135,The bottle poured a crystal clear pale yellow ...,en
...,...,...,...
7064213,71531,"Golden in colour, pours with a thick pillowy h...",en
7065225,71531,"Amber in colour, highly carbonated. Has a good...",en
7065679,71531,An interesting beer this. Certainly in the fla...,en
7066002,71531,An interesting quitre dark amber Belgian style...,en


In [11]:
del ba_reviews, rb_reviews, ba_reviews_experts, rb_reviews_experts

# Vocabulary Analysis

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import string

# Prétraitement (simplifié pour TF-IDF, sans lemmatization ici)
def preprocess_for_tfidf(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return text

# Appliquer le prétraitement
ba_reviews_experts_en['clean_text'] = ba_reviews_experts_en['text'].apply(preprocess_for_tfidf)
rb_reviews_experts_en['clean_text'] = rb_reviews_experts_en['text'].apply(preprocess_for_tfidf)

# Combine les deux datasets pour le calcul global
all_reviews = pd.concat([
    ba_reviews_experts_en[['clean_text']].assign(platform='BeerAdvocate'),
    rb_reviews_experts_en[['clean_text']].assign(platform='RateBeer')
])

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # 1000 mots les plus importants
tfidf_matrix = tfidf_vectorizer.fit_transform(all_reviews['clean_text'])

# Convertir en DataFrame pour inspection
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=all_reviews.index
)

# Ajouter la plateforme pour différencier
tfidf_df['platform'] = all_reviews['platform'].values

# Résumé
print(tfidf_df.head())


       10  10th   11  112   12  12oz  1st  2008  2011  2012  ...  wrong  year  \
25    0.0   0.0  0.0  0.0  0.0   0.0  0.0   0.0   0.0   0.0  ...    0.0   0.0   
2782  0.0   0.0  0.0  0.0  0.0   0.0  0.0   0.0   0.0   0.0  ...    0.0   0.0   
4542  0.0   0.0  0.0  0.0  0.0   0.0  0.0   0.0   0.0   0.0  ...    0.0   0.0   
6005  0.0   0.0  0.0  0.0  0.0   0.0  0.0   0.0   0.0   0.0  ...    0.0   0.0   
6232  0.0   0.0  0.0  0.0  0.0   0.0  0.0   0.0   0.0   0.0  ...    0.0   0.0   

         years  yeast  yeasty  yellow  yelloworange  youre  zest      platform  
25    0.000000    0.0     0.0     0.0           0.0    0.0   0.0  BeerAdvocate  
2782  0.129324    0.0     0.0     0.0           0.0    0.0   0.0  BeerAdvocate  
4542  0.000000    0.0     0.0     0.0           0.0    0.0   0.0  BeerAdvocate  
6005  0.000000    0.0     0.0     0.0           0.0    0.0   0.0  BeerAdvocate  
6232  0.000000    0.0     0.0     0.0           0.0    0.0   0.0  BeerAdvocate  

[5 rows x 1001 columns]
