## Coding a Language Detection model in Python 

In [3]:
# Install the required packages if they are not already installed
# %pip install transformers torch datasets pandas scikit-learn nltk

### Using the pretrained XLM-RoBERTa base model

Importing Necessary Libraries 

In [4]:
import pandas as pd
import re
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import Dataset, DatasetDict, ClassLabel
import torch
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


Set `file_path` in the cell below to the location of the dataset CSV on your machine.

In [6]:
# Read the language-detection CSV into a DataFrame
file_path = '/Users/apple/Desktop/PG/Summer-24/NLP/nlp-language-detection/dataset/language-detection-full-dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [7]:
# Data Preprocessing
# Lower-case the text and strip every character that is neither a word
# character nor whitespace. The vectorised `.str` accessor passes missing
# values through as NaN, whereas calling `re.sub` row by row raises a
# TypeError on them.
df['Text'] = df['Text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

In [8]:
# Build lookups in both directions between the 'Label' and 'Language' columns
label_language_pairs = df[['Label', 'Language']].drop_duplicates()
label_to_language = label_language_pairs.set_index('Label')['Language'].to_dict()
language_to_label = dict(zip(label_to_language.values(), label_to_language.keys()))

In [9]:
# Lookup table from two-letter language code to the language's English name
language_code_to_name = dict([
    ("af", "Afrikaans"), ("ar", "Arabic"), ("bg", "Bulgarian"), ("bn", "Bengali"),
    ("de", "German"), ("el", "Greek"), ("en", "English"), ("es", "Spanish"),
    ("et", "Estonian"), ("fa", "Persian"), ("fi", "Finnish"), ("fr", "French"),
    ("gu", "Gujarati"), ("he", "Hebrew"), ("hi", "Hindi"), ("hr", "Croatian"),
    ("hu", "Hungarian"), ("id", "Indonesian"), ("it", "Italian"), ("ja", "Japanese"),
    ("kn", "Kannada"), ("ko", "Korean"), ("lt", "Lithuanian"), ("lv", "Latvian"),
    ("ml", "Malayalam"), ("mr", "Marathi"), ("ne", "Nepali"), ("nl", "Dutch"),
    ("no", "Norwegian"), ("pa", "Punjabi"), ("pl", "Polish"), ("pt", "Portuguese"),
    ("ro", "Romanian"), ("ru", "Russian"), ("si", "Sinhala"), ("sk", "Slovak"),
    ("sl", "Slovenian"), ("sq", "Albanian"), ("sv", "Swedish"), ("sw", "Swahili"),
    ("ta", "Tamil"), ("te", "Telugu"), ("th", "Thai"), ("tl", "Tagalog"),
    ("tr", "Turkish"), ("uk", "Ukrainian"), ("ur", "Urdu"), ("vi", "Vietnamese"),
    ("zh", "Chinese"),
])

In [10]:
# Build a text-classification pipeline from a pretrained language-detection checkpoint
model_name = "papluca/xlm-roberta-base-language-detection"
language_detection_pipeline = pipeline(task="text-classification", model=model_name)

In [9]:
# Prediction Function using pretrained model
def predict_language(text):
    """Return the full name of the language the pretrained model predicts for `text`.

    The text is lower-cased and stripped of every character that is neither a
    word character nor whitespace before it is passed to
    `language_detection_pipeline`.

    Args:
        text (str): The text to classify.

    Returns:
        str: The language name looked up in `language_code_to_name`, or
        "Unknown" when the predicted code has no entry in that table.
    """
    normalized = re.sub(r'[^\w\s]', '', text.lower())
    # Truncate to the model's maximum sequence length so that long inputs are
    # classified on their leading tokens instead of failing inside the model.
    prediction = language_detection_pipeline(normalized, truncation=True)
    # The pipeline returns a list with one result dict per input
    predicted_code = prediction[0]['label']
    return language_code_to_name.get(predicted_code, "Unknown")

In [None]:
# Try the prediction function on a sample sentence
sample_text = "Insert Text Here"
predicted_language = predict_language(text=sample_text)
print(f"The predicted language for the input text is: {predicted_language}")

## Model 2: TF-IDF character n-grams + Random Forest

In [None]:


# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases look up for
# word_tokenize; 'punkt' is kept for older releases. nltk.download reports an
# unknown resource id without raising, so requesting both is safe.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Load the CSV file
# NOTE(review): this path differs from `file_path` used in the first section -
# confirm both point at the same dataset.
df = pd.read_csv('/Users/apple/Desktop/PG/Summer-24/NLP/language-detection-full-dataset.csv')

# English stop words, loaded once so that clean_text does not re-read the
# stop-word list and rebuild the set for every row it is applied to.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

# Data cleaning function
def clean_text(text):
    """Normalise `text` for the TF-IDF model.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, tokenizes it with `word_tokenize`, and drops
    tokens that are English stop words.

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token not in _ENGLISH_STOP_WORDS)

# Add a cleaned copy of the 'Text' column
df['cleaned_text'] = df['Text'].apply(clean_text)

# Features are the cleaned text; targets are the 'Label' column
X, y = df['cleaned_text'], df['Label']

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create a pipeline with TF-IDF vectorizer and RandomForestClassifier.
# It is named `tfidf_rf_pipeline` rather than `pipeline` so that it does not
# overwrite the `pipeline` factory imported from transformers, which the
# pretrained-model cell needs whenever it is re-run.
tfidf_rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 3), analyzer='char_wb', max_features=50000)),
    ('clf', RandomForestClassifier(n_jobs=-1, random_state=42))
])

# Define hyperparameters for tuning (the `clf__` prefix targets the 'clf' step)
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5, 10]
}

# Perform grid search with 5-fold cross-validation over every combination above
grid_search = GridSearchCV(tfidf_rf_pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Take the best estimator found by the grid search
best_model = grid_search.best_estimator_

# Score it on the held-out test split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Show the per-class classification report
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

# Label -> language-name lookup, built once instead of filtering `df` on every
# prediction. drop_duplicates keeps the first row per label, which is the same
# row the previous `.iloc[0]` lookup selected.
_rf_label_to_language = (
    df.drop_duplicates(subset='Label').set_index('Label')['Language'].to_dict()
)

# Function to predict language for new text
# NOTE: this reuses the name `predict_language`, so running this cell replaces
# the pretrained-model version defined earlier in the notebook.
def predict_language(text):
    """Return the language that `best_model` predicts for `text`.

    Args:
        text (str): The raw text to classify; it is passed through
            `clean_text` before prediction.

    Returns:
        The 'Language' value of the dataset associated with the predicted
        'Label'.

    Raises:
        KeyError: If the predicted label does not occur in the dataset.
    """
    cleaned = clean_text(text)
    prediction = best_model.predict([cleaned])[0]
    return _rf_label_to_language[prediction]

# Try the model on a handful of short greetings
examples = [
    "Hello, how are you?",
    "Bonjour, comment allez-vous?",
    "Hola, ¿cómo estás?",
    "Ciao, come stai?",
    "Hallo, wie geht es dir?",
]

# Print one prediction line per example sentence
print("\nPredictions for example sentences:")
for example in examples:
    predicted_language = predict_language(text=example)
    print(f"Text: '{example}' - Predicted Language: {predicted_language}")