# **Installation des bibliothèques**

## FastText

In [1]:
!wget -O /tmp/lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
!pip install fasttext

--2022-03-16 13:17:00--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘/tmp/lid.176.bin’


2022-03-16 13:17:07 (20.8 MB/s) - ‘/tmp/lid.176.bin’ saved [131266198/131266198]



## PyCountry

In [2]:
!pip install pycountry



# **Importation des bibliothèques**

In [3]:
import fasttext
import pycountry
import pandas as pd

# **Développement du code**

In [4]:
class Langue:
    """Identify the language of a text with fastText's pretrained lid.176 model.

    Attributes set on every instance:
        name: language name reported by pycountry, or "Aucun" (French for
            "none") when the predicted code cannot be resolved.
        code: ISO 639 alpha-2 or alpha-3 code of the resolved language, or
            "Aucun" when the predicted code cannot be resolved.
        coef: score fastText attached to its top prediction.
        all:  the list ``[name, code, coef]``.
    """

    PRETRAINED_MODEL_PATH = '/tmp/lid.176.bin'

    # The model file is large, so it is loaded once on first use and shared by
    # every instance instead of being re-read from disk for each text.
    _model = None

    def __init__(self, text):
        """Predict the language of ``text`` and resolve it through pycountry.

        Args:
            text: the string to classify; newlines are replaced by spaces
                before prediction.
        """
        model = self._load_model()

        # A single predict() call yields both the labels and their scores.
        # fastText processes one line at a time, hence the newline flattening.
        prediction = model.predict(text.replace('\n', ' '))
        code = prediction[0][0].replace("__label__", '')
        self.coef = prediction[1][0]

        # Look the code up by its length; any other length stays unresolved.
        language = None
        if len(code) == 2:
            language = pycountry.languages.get(alpha_2=code)
        elif len(code) == 3:
            language = pycountry.languages.get(alpha_3=code)

        # pycountry returns None for codes it does not know, so the attributes
        # of `language` are only read once the lookup is known to have succeeded.
        if language is None:
            self.code = "Aucun"
            self.name = "Aucun"
        else:
            self.code = language.alpha_2 if len(code) == 2 else language.alpha_3
            self.name = language.name

        self.all = [self.name, self.code, self.coef]

    @classmethod
    def _load_model(cls):
        """Return the shared fastText model, loading it from disk on first use."""
        if cls._model is None:
            cls._model = fasttext.load_model(cls.PRETRAINED_MODEL_PATH)
        return cls._model

# **Test 1**

In [5]:
# Load the first evaluation dataset (columns shown in the output: Text, Language).
texts = pd.read_csv('../input/language-detection/Language Detection.csv')
# Bare name as the last expression so the notebook renders the frame.
texts

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [6]:
# Number of rows per ground-truth label, to compare against the detected counts.
texts['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [7]:
# Tally how many texts were assigned to each detected language name.
# A dict preserves first-seen order, so the printed result keeps the same layout.
dictionnaire = {}

for text in texts['Text']:
    langue = Langue(text)
    # dict.get replaces the separate membership list and its linear scan.
    dictionnaire[langue.name] = dictionnaire.get(langue.name, 0) + 1

# Distinct detected names in first-seen order (same content the original list held).
langues = list(dictionnaire)

print(dictionnaire)



{'English': 1404, 'Spanish': 824, 'Italian': 698, 'Malayalam': 594, 'Hindi': 63, 'Tamil': 469, 'Portuguese': 729, 'French': 1019, 'Latin': 1, 'Dutch': 530, 'Afrikaans': 3, 'Russian': 690, 'Low German': 2, 'Danish': 391, 'Indonesian': 1, 'Turkish': 472, 'Catalan': 3, 'Lojban': 2, 'Modern Greek (1453-)': 365, 'Icelandic': 1, 'Serbian': 1, 'Norwegian': 25, 'Esperanto': 4, 'German': 471, 'Swedish': 663, 'Romanian': 1, 'Hungarian': 1, 'Estonian': 4, 'Ido': 1, 'Uzbek': 1, 'Norwegian Nynorsk': 1, 'Arabic': 532, 'Egyptian Arabic': 1, 'Uighur': 1, 'Kannada': 369}




# **Test 2**

In [8]:
# Load the second evaluation dataset (columns shown in the output: Text, language).
# NOTE: this rebinds `texts`, replacing the frame loaded for Test 1.
texts = pd.read_csv('../input/language-identification-datasst/dataset.csv')
# Bare name as the last expression so the notebook renders the frame.
texts

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
21995,hors du terrain les années et sont des année...,French
21996,ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...,Thai
21997,con motivo de la celebración del septuagésimoq...,Spanish
21998,年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...,Chinese


In [9]:
# Number of rows per ground-truth label, to compare against the detected counts.
texts['language'].value_counts()

Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: language, dtype: int64

In [10]:
# Tally how many texts were assigned to each detected language name.
# A dict preserves first-seen order, so the printed result keeps the same layout.
dictionnaire = {}

for text in texts['Text']:
    langue = Langue(text)
    # dict.get replaces the separate membership list and its linear scan.
    dictionnaire[langue.name] = dictionnaire.get(langue.name, 0) + 1

# Distinct detected names in first-seen order (same content the original list held).
langues = list(dictionnaire)

print(dictionnaire)



{'Estonian': 961, 'Norwegian': 3, 'Thai': 988, 'Tamil': 990, 'Dutch': 981, 'Japanese': 989, 'Turkish': 989, 'German': 39, 'Urdu': 954, 'Indonesian': 950, 'Portuguese': 950, 'French': 1040, 'Chinese': 1000, 'Korean': 991, 'English': 1330, 'Hindi': 981, 'Spanish': 990, 'Pushto': 934, 'Persian': 1006, 'Romanian': 985, 'Russian': 999, 'Latin': 869, 'Arabic': 1036, 'Swedish': 998, 'Serbian': 1, 'Italian': 15, 'Polish': 1, 'South Azerbaijani': 3, 'Ukrainian': 4, 'Malay (macrolanguage)': 14, 'Basque': 1, 'Croatian': 1, 'Western Panjabi': 2, 'Uighur': 1, 'Kurdish': 1, 'Finnish': 1, 'Esperanto': 1, 'Sindhi': 1}


