-----------------

# <center> Language Identifier </center>

------------------------------

# Method 1: Using Python Module

------------------

## 1] langdetect

In [1]:
# Python program to demonstrate langdetect

from langdetect import detect

# Specifying the language for detection
print(detect("Hello and welcome to my language indentifier model"))
print(detect("Здравствуйте и добро пожаловать в мою модель языкового идентификатора"))
print(detect("Ciao e benvenuto nel mio modello di identificatore di lingua"))
print(detect("您好，欢迎来到我的语言标识符模型"))
print(detect("नमस्कार आणि माझ्या भाषा इंडेंटिफायर मॉडेलमध्ये स्वागत आहे"))
print(detect("Bonjour et bienvenue sur mon modèle d'identifiant de langue"))


en
ru
it
zh-cn
mr
fr


## 2] langid

In [2]:
# Python program to demonstrate langid


import langid


L = ["Hello and welcome to my language indentifier model",
     "Здравствуйте и добро пожаловать в мою модель языкового идентификатора", 
     "Ciao e benvenuto nel mio modello di identificatore di lingua", 
     "您好，欢迎来到我的语言标识符模型",
     "नमस्कार आणि माझ्या भाषा इंडेंटिफायर मॉडेलमध्ये स्वागत आहे",
     "Bonjour et bienvenue sur mon modèle d'identifiant de langue"]

for i in L:
    print(langid.classify(i))


('en', -60.289369106292725)
('ru', -1294.1317300796509)
('it', -179.1692442893982)
('zh', -202.62357568740845)
('mr', -370.4250023365021)
('fr', -184.70572137832642)


-----------------------------

# <center> Method 2: Using Machine Learning Algorithm </center>

----------------------

## Step 1: Import the libraries and dataset preprocessing

In [3]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")

In [4]:
d = pd.read_csv("LanguageDetection.csv")
data = pd.DataFrame(d)
print(data)

                                                    Text Language
0       Nature, in the broadest sense, is the natural...  English
1      "Nature" can refer to the phenomena of the phy...  English
2      The study of nature is a large, if not the onl...  English
3      Although humans are part of nature, human acti...  English
4      [1] The word nature is borrowed from the Old F...  English
...                                                  ...      ...
10332  ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...  Kannada
10333  ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...  Kannada
10334  ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...  Kannada
10335  ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...  Kannada
10336  ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...  Kannada

[10337 rows x 2 columns]


In [5]:
#count value for each language
data["Language"].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [6]:
#Separating Independent and Dependent features
X = data["Text"]
y = data["Language"]

## Step 2: Label Encoding and Text Preprocessing

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [8]:
# creating a list for appending the preprocessed text
data_list = []
# iterating through all the text
for text in X:

 # removing the symbols and numbers
 text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', text)
 text = re.sub(r'[[]]', ' ', text)

 # converting the text to lower case
 text = text.lower()

 # appending to data_list
 data_list.append(text)


In [9]:
#Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(data_list).toarray()
X.shape # (10337, 39419)

(10337, 34937)

## Step 3: Train Test Splitting

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y)

## Step 4: Model Training and Prediction

In [11]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train, y_train)

In [12]:
#Predicting the output for the test set.
y_pred = model.predict(x_test)

## Step 5: Model Evaluation

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy is :",ac)

Accuracy is : 0.9756286266924564


## Step 6: Making Predictions

In [14]:
def predict(text):

 x = cv.transform([text]).toarray() # converting text to bag of words model
 lang = model.predict(x) # predicting the language
 lang = le.inverse_transform(lang) # finding the language corresponding the the pr
 print("This langauge is : ",lang[0]) # printing the language

In [15]:
predict("Hey! Welcome to my Language Identifier Model")

This langauge is :  English


In [16]:
predict("ഹേയ്! എൻ്റെ ഭാഷാ ഐഡൻ്റിഫയർ മോഡലിലേക്ക് സ്വാഗതം")

This langauge is :  Malayalam


In [17]:
predict("Γεια σου! Καλώς ορίσατε στο Μοντέλο Αναγνωριστικού Γλώσσας")

This langauge is :  Greek


In [18]:
predict("ஏய்! எனது மொழி அடையாளங்காட்டி மாதிரிக்கு வரவேற்கிறோம்")

This langauge is :  Tamil
