In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import time
import os

In [3]:
import fasttext

# Language detection

## BeerAdvocate

In [4]:
# Load the BeerAdvocate reviews table from the generated parquet file.
# The cells below read its "text" column and add a "language" column to it.
ba_reviews = pd.read_parquet('../generated/ba_reviews_experts.parquet')

In [5]:
# CSV that receives the BeerAdvocate reviews together with their detected language.
output_file = "./output_ba_reviews_languages.csv"

# fastText language-identification model file, resolved relative to the
# notebook's working directory.
model_path = "lid.176.bin"

# Keep the loaded model in a module-level name: the detection helper defined
# right after this looks it up as the global `model`.
model = fasttext.load_model(model_path)

def detect_language_fasttext(text):
    """Return the language code fastText predicts for a single review text.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Anything that is not a string (``None``, ``NaN``,
        ``pd.NA``, numbers, ...) is treated as missing.

    Returns
    -------
    str
        The top predicted label with the ``__label__`` prefix removed
        (for example ``"en"``), or ``"unknown"`` when the text is missing,
        blank, or the model call fails.

    Notes
    -----
    Relies on the module-level ``model`` loaded with ``fasttext.load_model``.
    """
    # Missing values and other non-string inputs cannot be classified.
    if not isinstance(text, str):
        return "unknown"

    # fastText's predict() raises ValueError for any string containing "\n"
    # ("predict processes one line at a time"). Without this flattening every
    # multi-line review would be swallowed by the handler below and silently
    # reported as "unknown".
    cleaned = text.replace("\n", " ").strip()
    if not cleaned:
        return "unknown"

    try:
        labels = model.predict(cleaned)[0]
        return labels[0].replace("__label__", "")
    except Exception:
        # Deliberate best-effort: a single review the model cannot handle must
        # not abort the pass over the whole dataset.
        return "unknown"

# Tag each BeerAdvocate review with the language code detected from its text.
ba_reviews["language"] = ba_reviews["text"].map(detect_language_fasttext)

# Persist the tagged reviews (without the DataFrame index) to the CSV path set above.
ba_reviews.to_csv(path_or_buf=output_file, index=False)

In [11]:
# Number of BeerAdvocate reviews per detected language, most frequent first.
language_distribution = ba_reviews["language"].value_counts()

# Turn the counts into a two-column table: language code and review count.
language_distribution_df = (
    language_distribution
    .rename_axis("language")
    .reset_index(name="count")
)

# Share of each language among all reviews, as a percentage rounded to 2 decimals.
total_reviews = language_distribution_df["count"].sum()
language_distribution_df["percentage"] = (
    language_distribution_df["count"].div(total_reviews).mul(100).round(2)
)

display(language_distribution_df)


Unnamed: 0,language,count,percentage
0,en,1250674,99.99
1,fr,122,0.01
2,de,3,0.0
3,it,1,0.0


In [19]:
# Keep only the BeerAdvocate reviews detected as English and save them as CSV.
output_dir = "../generated"
output_file = os.path.join(output_dir, "ba_reviews_experts_en.csv")

# Make sure the destination folder exists before writing into it.
os.makedirs(output_dir, exist_ok=True)

english_reviews = ba_reviews.loc[ba_reviews["language"].eq("en")]
english_reviews.to_csv(path_or_buf=output_file, index=False)

print(f"Dataset with English reviews saved to: {output_file}")


Dataset with English reviews saved to: ../generated/ba_reviews_experts_en.csv


## RateBeer

In [8]:
# Load the RateBeer reviews table from the generated parquet file.
# The cells below read its "text" column and add a "language" column to it.
rb_reviews = pd.read_parquet('../generated/rb_reviews_experts.parquet')

In [9]:
# CSV that receives the RateBeer reviews together with their detected language.
output_file = "./output_rb_reviews_languages.csv"

# fastText language-identification model file, resolved relative to the
# notebook's working directory.
model_path = "lid.176.bin"

# Keep the loaded model in a module-level name: the detection helper defined
# right after this looks it up as the global `model`.
model = fasttext.load_model(model_path)

def detect_language_fasttext(text):
    """Return the language code fastText predicts for a single review text.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Anything that is not a string (``None``, ``NaN``,
        ``pd.NA``, numbers, ...) is treated as missing.

    Returns
    -------
    str
        The top predicted label with the ``__label__`` prefix removed
        (for example ``"en"``), or ``"unknown"`` when the text is missing,
        blank, or the model call fails.

    Notes
    -----
    Relies on the module-level ``model`` loaded with ``fasttext.load_model``.
    """
    # Missing values and other non-string inputs cannot be classified.
    if not isinstance(text, str):
        return "unknown"

    # fastText's predict() raises ValueError for any string containing "\n"
    # ("predict processes one line at a time"). Without this flattening every
    # multi-line review would be swallowed by the handler below and silently
    # reported as "unknown".
    cleaned = text.replace("\n", " ").strip()
    if not cleaned:
        return "unknown"

    try:
        labels = model.predict(cleaned)[0]
        return labels[0].replace("__label__", "")
    except Exception:
        # Deliberate best-effort: a single review the model cannot handle must
        # not abort the pass over the whole dataset.
        return "unknown"

# Tag each RateBeer review with the language code detected from its text.
rb_reviews["language"] = rb_reviews["text"].map(detect_language_fasttext)

# Persist the tagged reviews (without the DataFrame index) to the CSV path set above.
rb_reviews.to_csv(path_or_buf=output_file, index=False)

In [16]:
# Number of RateBeer reviews per detected language, most frequent first.
language_distribution = rb_reviews["language"].value_counts()

# Turn the counts into a two-column table: language code and review count.
language_distribution_df = (
    language_distribution
    .rename_axis("language")
    .reset_index(name="count")
)

# Share of each language among all reviews, as a percentage rounded to 2 decimals.
total_reviews = language_distribution_df["count"].sum()
language_distribution_df["percentage"] = (
    language_distribution_df["count"].div(total_reviews).mul(100).round(2)
)

display(language_distribution_df)

Unnamed: 0,language,count,percentage
0,en,3412968,95.87
1,de,44389,1.25
2,fr,36989,1.04
3,no,18628,0.52
4,sv,11912,0.33
5,pl,8238,0.23
6,it,8084,0.23
7,nl,6990,0.2
8,es,6909,0.19
9,da,4432,0.12


In [20]:
# Keep only the RateBeer reviews detected as English and save them as CSV.
output_dir = "../generated"
output_file = os.path.join(output_dir, "rb_reviews_experts_en.csv")

# Make sure the destination folder exists before writing into it.
os.makedirs(output_dir, exist_ok=True)

english_reviews_rb = rb_reviews.loc[rb_reviews["language"].eq("en")]
english_reviews_rb.to_csv(path_or_buf=output_file, index=False)

print(f"Dataset with English reviews saved to: {output_file}")

Dataset with English reviews saved to: ../generated/rb_reviews_experts_en.csv
