# Importing library


In [75]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn import pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Reading Dataset

In [76]:
data = pd.read_csv("/content/drive/MyDrive/Datasets_PROJECTS/Language Detection.csv")

data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


# Data Preprocessing

Checking Value counts for language column

In [77]:
data['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

Creating a function which will remove all the special character from our text column.

In [79]:
import string
print(string.punctuation)
#In the place of [!@#$(),\n%^&*?\:;~`0-9] we can also use string.punctuation
def remove_symbols(text):
      text = re.sub(r'[!@#$(),\n%^&*?\:;~`0-9]','',text)
      text = re.sub(r'[[]]','',text)
      
      return text.lower()

    

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Creating a function Which will remove the English Alphabets from other languages.

In [80]:
def remEngLet(text):
  text = re.sub(r'[a-zA-Z]+','',text)
  return text.lower()
  

Applying the functions

In [82]:
df = data.apply(lambda x: remEngLet(x.Text) if x.Language == ['Hindi','Kannada','Tamil','Arabic','Malayalam'] else x.Text, axis = 1)

In [83]:
df = df.apply(remove_symbols)


# Modeling

Creating two variables with two column.

In [107]:
y = data['Language']
x = df


Spliting our Dataset to train our model

In [85]:
x_train ,x_test, y_train, y_test = train_test_split(x,y,random_state=42)

Applying Tfidf vectorizer.

In [86]:
vect = TfidfVectorizer(ngram_range=(1,3),analyzer='char')

Applying Pipeline

In [87]:
mod = pipeline.Pipeline([('vectorizer', vect),('clf', LogisticRegression())])

In [88]:
mod.fit(x_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('clf', LogisticRegression())])

# Model Score

In [89]:
prediction = mod.predict(x_test)

In [90]:
print("Accuracy:",accuracy_score(y_test,prediction))

Accuracy: 0.9825918762088974


In [91]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

      Arabic       0.99      1.00      1.00       134
      Danish       0.95      0.97      0.96        97
       Dutch       0.99      0.95      0.97       139
     English       0.96      0.99      0.98       364
      French       0.99      0.99      0.99       269
      German       0.98      0.97      0.97       116
       Greek       1.00      1.00      1.00        86
       Hindi       1.00      1.00      1.00        13
     Italian       0.98      0.97      0.97       180
     Kannada       1.00      1.00      1.00        78
   Malayalam       1.00      0.99      1.00       158
  Portugeese       0.98      0.97      0.97       170
     Russian       1.00      1.00      1.00       171
     Spanish       0.99      0.98      0.98       206
    Sweedish       0.97      0.96      0.97       162
       Tamil       1.00      1.00      1.00       114
     Turkish       0.98      0.99      0.98       128

    accuracy              

# Checking Model

In [112]:
one = x_test[55:56]
one

33    rock units are first emplaced either by deposi...
dtype: object

In [114]:
pred_one = mod.predict(one)
pred_one

array(['English'], dtype=object)

In [116]:
two = x_test[155:156]
two

3055    eu não estou interessado.
dtype: object

In [117]:
pred_two = mod.predict(two)
pred_two

array(['Portugeese'], dtype=object)

In [123]:
three = x_test[1330:1331]
three

9092    لذلك، لا تملي اللجنة محتوى المقالات، على الرغم...
dtype: object

In [124]:
pred_three = mod.predict(three)
pred_three

array(['Arabic'], dtype=object)

In [128]:
four = x_test[1167:1168]
four

1702    ഞാൻ ചോദിക്കുന്നു അവസാനം ഞാൻ പറയുന്നു.
dtype: object

In [129]:
pred_four = mod.predict(four)
pred_four

array(['Malayalam'], dtype=object)

# Creating Dataset

In [92]:
train = pd.DataFrame(x_train,columns=['Text'])
train['Language'] = y_train
train.to_csv('LD_train.csv')

In [93]:
test = pd.DataFrame(x_test,columns=['Text'])
test.to_csv('LD_test.csv')

In [94]:
ans = pd.DataFrame(y_test, columns= ['Language'])
ans['Prediction'] = prediction
ans.to_csv('LD_answer.csv')
