# Libraries

In [1]:
import pandas as pd
import numpy as np
import fasttext

In [2]:
# Location of the fastText model file that is loaded for language detection.
model_path = '../data/bin/lid.176.bin'

# Loading

In [3]:
# fastText model used by the language-detection step.
model = fasttext.load_model(model_path)

# Review tables for both sites; each must provide a 'text' column.
rb_reviews_experts = pd.read_parquet('../generated/rb_reviews_experts.parquet')
ba_reviews_experts = pd.read_parquet('../generated/ba_reviews_experts.parquet')

# Language Detection

In [4]:
def detect_language_fasttext(text, lid_model=None): #utils
    """Return the language code predicted by fastText for ``text``.

    Parameters
    ----------
    text : str or missing value
        The text to classify. ``None``, ``NaN`` and any other non-string
        value are reported as ``"unknown"``.
    lid_model : object, optional
        Any object exposing ``predict(str)`` that returns ``(labels, ...)``.
        When omitted, the notebook-level ``model`` is used, so existing
        calls such as ``series.apply(detect_language_fasttext)`` keep working.

    Returns
    -------
    str
        The top label with its ``"__label__"`` prefix removed, or
        ``"unknown"`` when the text is missing, blank, or cannot be
        classified.

    Raises
    ------
    NameError
        If ``lid_model`` is omitted and no global ``model`` exists. This is
        raised on purpose: a missing model is a setup error and must not be
        silently turned into ``"unknown"`` for every row.
    """
    # Missing values and non-string inputs cannot be classified.
    if not isinstance(text, str):
        return "unknown"

    # fastText's predict() handles one line at a time and raises ValueError
    # when the input contains "\n". Without this replacement every
    # multi-line text would fall into the except branch below and be
    # labelled "unknown".
    cleaned = text.replace("\n", " ").strip()
    if not cleaned:
        return "unknown"

    if lid_model is None:
        lid_model = model

    try:
        labels = lid_model.predict(cleaned)[0]
        return labels[0].replace("__label__", "")
    except Exception:
        # Best effort per row: one text that cannot be classified must not
        # abort the labelling of the whole column.
        return "unknown"

## BeerAdvocate

In [5]:
# Label every BeerAdvocate review with its detected language code.
ba_reviews_experts['language'] = ba_reviews_experts['text'].map(detect_language_fasttext)

In [6]:
# Number of reviews per detected language (one row per language code).
ba_languages = (
    ba_reviews_experts
    .groupby('language')
    .agg(num_languages=('language', 'count'))
)

# Each language's share of all counted reviews, in percent (5 decimals).
ba_total_language = ba_languages['num_languages'].sum()
ba_languages['review_proportion_percentage'] = (
    ba_languages['num_languages'].div(ba_total_language).mul(100).round(5)
)

# Most frequent language first.
ba_languages = ba_languages.sort_values('num_languages', ascending=False)

In [7]:
# Display the per-language review counts and percentages for BeerAdvocate.
ba_languages

Unnamed: 0_level_0,num_languages,review_proportion_percentage
language,Unnamed: 1_level_1,Unnamed: 2_level_1
en,1250674,99.98993
fr,122,0.00975
de,3,0.00024
it,1,8e-05


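**Comment:** Almost all BeerAdvocate expert reviews (99.99%) are detected as English. Only 126 reviews (122 French, 3 German, 1 Italian) are in another language, so keeping only the English reviews discards a negligible share of the data.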

### Saving

In [8]:
# Keep only the reviews detected as English. reset_index() moves the previous
# index into a column and renumbers the rows from 0.
ba_reviews_experts_en = (
    ba_reviews_experts
    .loc[lambda reviews: reviews['language'] == 'en']
    .reset_index()
    .copy()
)

# Persist the English-only subset.
ba_reviews_experts_en.to_parquet('../generated/ba_reviews_experts_en.parquet')

## RateBeer

In [9]:
# Label every RateBeer review with its detected language code.
rb_reviews_experts['language'] = rb_reviews_experts['text'].map(detect_language_fasttext)

In [10]:
# Number of reviews per detected language (one row per language code).
rb_languages = (
    rb_reviews_experts
    .groupby('language')
    .agg(num_languages=('language', 'count'))
)

# Each language's share of all counted reviews, in percent (5 decimals).
rb_total_language = rb_languages['num_languages'].sum()
rb_languages['review_proportion_percentage'] = (
    rb_languages['num_languages'].div(rb_total_language).mul(100).round(5)
)

# Most frequent language first.
rb_languages = rb_languages.sort_values('num_languages', ascending=False)

In [11]:
# Display the per-language review counts and percentages for RateBeer.
rb_languages

Unnamed: 0_level_0,num_languages,review_proportion_percentage
language,Unnamed: 1_level_1,Unnamed: 2_level_1
en,3412968,95.87487
de,44389,1.24695
fr,36989,1.03907
no,18628,0.52329
sv,11912,0.33462
pl,8238,0.23142
it,8084,0.22709
nl,6990,0.19636
es,6909,0.19408
da,4432,0.1245


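**Comment:** About 95.9% of RateBeer expert reviews are detected as English. German (1.25%) and French (1.04%) come next, and every other language shown accounts for roughly 0.5% of the reviews or less. Keeping only the English reviews therefore drops about 4% of the RateBeer reviews, a much larger share than for BeerAdvocate.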

### Saving

In [12]:
# Keep only the reviews detected as English. reset_index() moves the previous
# index into a column and renumbers the rows from 0.
rb_reviews_experts_en = (
    rb_reviews_experts
    .loc[lambda reviews: reviews['language'] == 'en']
    .reset_index()
    .copy()
)

# Persist the English-only subset.
rb_reviews_experts_en.to_parquet('../generated/rb_reviews_experts_en.parquet')