In [16]:
import pandas as pd
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import gradio as gr

# Read the labelled corpus, discard rows with missing values, and renumber
# the index so it is contiguous again after the drop.
df = (
    pd.read_csv('lang_dataset.csv')
    .dropna()
    .reset_index(drop=True)
)

In [17]:
# Bag-of-words features: fit the vectorizer on the "Text" column and encode
# every row in the same call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["Text"])

# Naive Bayes classifier trained on those features against the "language"
# column; fit() hands back the estimator, so build and train in one expression.
model = MultinomialNB().fit(X, df["language"])

# Function to detect language using langdetect
def detect_language(text):
    """Return langdetect's guess for ``text``, or ``'unknown'`` on failure.

    Args:
        text: The string to classify.

    Returns:
        Whatever ``detect(text)`` returns, or the sentinel string
        ``'unknown'`` when ``detect`` raises an ordinary exception.
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort: any detection error maps to the sentinel
        # so the caller can fall back to the trained model. Catching
        # Exception (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return 'unknown'

In [25]:
def predict_language(text):
    """Predict the language of ``text``; used as the Gradio callback.

    ``detect_language`` is tried first. The trained model is consulted only
    when ``detect_language`` reports ``'unknown'``.

    Args:
        text: The statement entered by the user.

    Returns:
        A plain ``str`` label: the ``detect_language`` result when it is not
        ``'unknown'``, otherwise the model's prediction. Empty or
        whitespace-only input (and ``None``) yields ``'unknown'``.
    """
    # Blank input has no content to classify; return the sentinel instead of
    # letting the fallback model emit a label for it.
    if text is None or not str(text).strip():
        return 'unknown'

    # First, try detecting language using langdetect
    lang = detect_language(text)
    if lang != 'unknown':
        return lang

    # If langdetect fails, use the trained model to predict language
    text_vectorized = vectorizer.transform([text])
    # predict() returns an array of labels; take the single entry and
    # normalise it to a built-in str for display.
    return str(model.predict(text_vectorized)[0])

# Build the web UI: a multi-line text input wired to predict_language, with
# the predicted label shown in a text output. The top-level gr.Textbox
# component replaces the deprecated gr.inputs / gr.outputs namespaces.
iface = gr.Interface(
    fn=predict_language,
    inputs=gr.Textbox(lines=5, label="Input Statement(at least 3 words)"),
    outputs=gr.Textbox(label="Language Prediction"),
    title="Language Identifier",
    description="Predict the language of a given statement.")
# Start the local server; the URL is printed below the cell.
iface.launch()



Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.


