# Libraries

In [1]:
import pandas as pd
import numpy as np
import fasttext

In [2]:
model_path = "../data/bin/lid.176.bin" 

# Loading

In [15]:
ba_reviews_experts = pd.read_parquet('../generated/ba_reviews_experts.parquet')
rb_reviews_experts = pd.read_parquet('../generated/rb_reviews_experts.parquet')
model = fasttext.load_model(model_path)

# Langage Detection

In [11]:
def detect_language_fasttext(text): #utils
    try:
        if pd.isnull(text) or text.strip() == "":
            return "unknown"
        print(text.strip())
        prediction = model.predict(text.strip())
        return prediction[0][0].replace("__label__", "") 
    except Exception as e:
        return "unknown"

## BeerAdvocate

In [16]:
ba_reviews_experts

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2009-02-20 11:00:00,StJamesGate,stjamesgate.163714,3.0,3.50,3.5,4.0,3.50,3.67,Pours pale copper with a thin head that quickl...
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2006-03-13 11:00:00,mdagnew,mdagnew.19527,4.0,3.50,3.5,4.0,3.50,3.73,"500ml Bottle bought from The Vintage, Antrim....."
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-08-30 10:00:00,cypressbob,cypressbob.3708,4.0,4.00,4.0,4.0,4.00,4.00,"500ml bottlePours with a light, slightly hazy ..."
6,Legbiter,19827,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.8,2013-01-09 11:00:00,AgentMunky,agentmunky.409755,4.0,3.75,3.5,3.5,3.75,3.64,Poured from a 12 ounce bottle into a pilsner g...
8,Legbiter,19827,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.8,2012-07-14 10:00:00,OtherShoe2,othershoe2.233642,4.0,3.50,4.0,3.5,4.00,3.68,Pours a rich brownish red with some chill haze...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2589564,Witch's Hair Pumpkin Ale,64097,Twisted Manzanita Ales,23640,Pumpkin Ale,9.5,2012-10-26 10:00:00,mactrail,mactrail.311921,3.5,4.00,3.0,4.0,3.00,3.67,I can't top my review last year of this stuff ...
2589565,Witch's Hair Pumpkin Ale,64097,Twisted Manzanita Ales,23640,Pumpkin Ale,9.5,2012-10-24 10:00:00,srandycarter,srandycarter.91008,3.5,5.00,4.5,4.0,4.00,4.26,"A - Pour medium dark honey, not much head, but..."
2589567,Witch's Hair Pumpkin Ale,64097,Twisted Manzanita Ales,23640,Pumpkin Ale,9.5,2011-11-03 11:00:00,HalfFull,halffull.196628,3.5,3.50,3.0,3.0,3.00,3.15,I pour this rather pricy bomber into a globe g...
2589568,Witch's Hair Pumpkin Ale,64097,Twisted Manzanita Ales,23640,Pumpkin Ale,9.5,2011-10-31 11:00:00,HopHead84,hophead84.109437,3.0,2.50,2.0,2.0,2.00,2.18,"10/30/2011Bottle shared by Charles, thanks!A: ..."


In [12]:
ba_reviews_experts['language'] = ba_reviews_experts['text'].apply(detect_language_fasttext)

Pours pale copper with a thin head that quickly goes. Caramel, golden syrup nose. Taste is big toasty, grassy hops backed by dark fruit, candy corn and brack malts. Clingy. Dries out at the end with more hops. Brave, more going on that usual for this type.
500ml Bottle bought from The Vintage, Antrim...Poured a golden yellow / orange colour... White head poured quite thick and foamy and faded to thin layer...Aroma - Fruity (burnt orange, some apple hints), light maltiness, spicy hops, vanilla, some sea saltiness...Taste - Spicy / peppery hop notes, citrusy, light sweetness, grassy, slight creaminess, some bready notes...Feel - Quite sharp and pretty dry. Light body.... Pretty drinkable...Overall - A pretty good beer.... worth a try...


In [13]:
ba_languages = ba_reviews_experts.groupby('language').agg(num_languages =('language', 'count'))
ba_total_language = ba_languages['num_languages'].sum()
ba_languages['review_proportion_percentage'] = (ba_languages['num_languages'] / ba_total_language * 100).round(5)
ba_languages = ba_languages.sort_values(by= 'num_languages', ascending= False)

In [14]:
ba_languages

Unnamed: 0_level_0,num_languages,review_proportion_percentage
language,Unnamed: 1_level_1,Unnamed: 2_level_1
en,2,100.0


**Comment:**

### Saving

In [8]:
ba_reviews_experts_en = ba_reviews_experts[ba_reviews_experts['language'] == 'en'].reset_index().copy()
ba_reviews_experts_en.to_parquet('../generated/ba_reviews_experts_en.parquet')

# RateBeer

In [9]:
rb_reviews_experts['language'] = rb_reviews_experts['text'].apply(detect_language_fasttext)

In [10]:
rb_languages = rb_reviews_experts.groupby('language').agg(num_languages =('language', 'count'))
rb_total_language = rb_languages['num_languages'].sum()
rb_languages['review_proportion_percentage'] = (rb_languages['num_languages'] / rb_total_language * 100).round(5)
rb_languages = rb_languages.sort_values(by= 'num_languages', ascending= False)

In [11]:
rb_languages

Unnamed: 0_level_0,num_languages,review_proportion_percentage
language,Unnamed: 1_level_1,Unnamed: 2_level_1
en,3412968,95.87487
de,44389,1.24695
fr,36989,1.03907
no,18628,0.52329
sv,11912,0.33462
pl,8238,0.23142
it,8084,0.22709
nl,6990,0.19636
es,6909,0.19408
da,4432,0.1245


**Comment:**

### Saving

In [12]:
rb_reviews_experts_en = rb_reviews_experts[rb_reviews_experts['language'] == 'en'].reset_index().copy()
rb_reviews_experts_en.to_parquet('../generated/rb_reviews_experts_en.parquet')