In [6]:
import pandas as pd
import numpy as np

data = pd.read_csv("ln_dataset.csv", encoding="latin-1")
data.head(10)

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas pÃ¥ eng the jesui...,Swedish
2,à¸à¸à¸à¹à¸à¸£à¸´à¸à¸à¸£à¸¸à¸ à¸­à¸±à¸...,Thai
3,à®µà®¿à®à®¾à®à®ªà¯à®ªà®à¯à®à®¿à®©à®®à¯ ...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
5,ã¨ããè¡ããããã§ãã¹ã«ä¹ã£ã¦ã...,Japanese
6,tsutinalar iÌngilizce tsuutina kanadada alber...,Turkish
7,mÃ¼ller mox figura centralis circulorum doctor...,Latin
8,Ø¨Ø±ÙÛ Ø¨Ø§Ø± electric charge ØªÙØ§Ù Ø²ÛØ...,Urdu
9,ã·ã£ã¼ãªã¼ã»ãã£ã¼ã«ãã¯ããµã³ã...,Japanese


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Text'], data['language'], test_size=0.2, random_state=42)

# create vectorizer with n-grams
vectorizer = CountVectorizer(ngram_range=(1,3))

# fit and transform the vectorizer on the training set
X_train_vectorized = vectorizer.fit_transform(X_train)

# train the classification model
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

MultinomialNB()

In [8]:
# evaluate the model on the testing set
X_test_vectorized = vectorizer.transform(X_test)
y_pred = model.predict(X_test_vectorized)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

      Arabic       0.85      0.97      0.90       179
     Chinese       0.92      0.98      0.95       192
       Dutch       1.00      1.00      1.00       223
     English       0.76      1.00      0.87       195
    Estonian       1.00      0.97      0.98       191
      French       0.97      0.98      0.98       173
       Hindi       1.00      0.97      0.99       215
  Indonesian       1.00      0.96      0.98       200
    Japanese       1.00      0.91      0.95       191
      Korean       1.00      1.00      1.00       204
       Latin       1.00      0.92      0.96       219
     Persian       0.97      0.97      0.97       208
   Portugese       0.99      0.97      0.98       201
      Pushto       0.97      0.94      0.95       197
    Romanian       0.99      0.98      0.98       189
     Russian       1.00      1.00      1.00       200
     Spanish       0.99      0.99      0.99       197
     Swahili       1.00    

In [9]:
import gradio as gr

# modify the predict_language() function to accept a text input, and return the predicted language label
def predict_language(text):
    detected_language = detect_language(text)
    text_vectorized = vectorizer.transform([text])
    language = model.predict(text_vectorized)[0]
    return f"{language} ({detected_language})"

# modify the Gradio interface to accept a text input and display the predicted language label
iface = gr.Interface(
    fn=predict_language,
    inputs=gr.inputs.Textbox(lines=5, label="Input Text"),
    outputs=gr.outputs.Textbox(label="Language Prediction"),
    title="Language Identifier",
    description="Predict the language of a given text."
)



In [10]:
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


