# Language Detection
    
> Detecting the language of a feed and its articles (some feeds have articles in multiple languages). 

In [None]:
#| default_exp language_detection 

## Imports

In [None]:
#| export

import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Get Model and Tokenizer Files for the Language Detection Model

We have to download the model identified by `model_name` to the specified `model_path`. For the given `model_name`, the function downloads all the required model and tokenizer files to that path. If the specified path does not exist, the function creates it.

In [None]:
#| export

def download_lang_model(model_path: str, model_name: str):
    """Download a Hugging Face language detection model and tokenizer
    to the specified directory.

    Args:
        model_path: Directory the model and tokenizer files are saved to.
            It is created (including missing parents) if it does not exist.
        model_name: Identifier passed to `from_pretrained` to fetch the
            tokenizer and the sequence-classification model.

    Raises:
        FileExistsError: If `model_path` exists but is not a directory.
    """
    # `exist_ok=True` avoids the check-then-create race of a separate
    # `os.path.exists` test and fails early, before any download, when the
    # path is already taken by a regular file.
    os.makedirs(model_path, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Save the model and tokenizer to the specified directory
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

## Detect Language

### Supported Languages

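The languages currently supported are the ones the language detection model can predict. The order of the list matters: `detect_language` uses the predicted class index to look up the code in this list. Supported language codes are: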

In [None]:
#| export

# Language codes that `detect_language` can return.
# Order matters: the list is indexed with the predicted class id, so entries
# must not be sorted, deduplicated or otherwise rearranged.
language_codes = [
    'ja', 'nl', 'ar', 'pl', 'de',
    'it', 'pt', 'tr', 'es', 'hi',
    'el', 'ur', 'bg', 'en', 'fr',
    'zh', 'ru', 'th', 'sw', 'vi',
]

### Load Model & Tokenizer

We load the model and tokenizer that we previously downloaded. We then pass references to the model and tokenizer to the `detect_language` function, so that we don't have to reload them every time we call it.

In [None]:
#| export

def load_model(model_path: str):
    """Read a saved Hugging Face model/tokenizer pair from `model_path`.

    Returns a `(model, tokenizer)` tuple, in that order.
    """
    # Both artifacts are read from the same directory.
    tok = AutoTokenizer.from_pretrained(model_path)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path)
    return classifier, tok

### Detect Language

In [None]:
#| export

def detect_language(text: str, model, tokenizer) -> str:
    """Detect the language of a given text.

    Args:
        text: Text to classify. Only its first 512 characters are used.
        model: Sequence-classification model, e.g. as returned by
            `load_model`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        The entry of `language_codes` at the index of the highest logit.
    """
    # Cheap character-level cut so long articles are not tokenized in full.
    text = text[:512]

    # A character limit does not bound the token count, so also let the
    # tokenizer truncate to the maximum length it is configured with.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = int(logits.argmax(-1))
    return language_codes[predicted_class]