# Importing library

In [48]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn import pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Reading Dataset

In [18]:
data = pd.read_csv("/content/drive/MyDrive/Datasets_PROJECTS/Language Detection.csv")

data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


# Data Preprocessing

In [27]:
data['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [20]:
data['Text'][0]

' Nature, in the broadest sense, is the natural, physical, material world or universe.'

In [24]:
import string
print(string.punctuation)
#In the place of [!@#$(),\n%^&*?\:;~`0-9] we can also use string.punctuation
def remove_symbols(text):
      text = re.sub(r'[!@#$(),\n%^&*?\:;~`0-9]','',text)
      text = re.sub(r'[[]]','',text)
      
      return text.lower()

    

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [25]:
def remEngLet(text):
  text = re.sub(r'[a-zA-Z]+','',text)
  return text.lower()
  

In [38]:
data[data.Language=='Sweedish'].sample(2)

Unnamed: 0,Text,Language
8742,Jag freakade ut en annan fras som modersmål an...,Sweedish
8900,ber någon vänta.,Sweedish


In [39]:
df = data.apply(lambda x: remEngLet(x.Text) if x.Language == ['Hindi','Kannada','Tamil','Arabic','Malayalam',] else x.Text, axis = 1)

In [54]:
df = df.apply(remove_symbols)


# Modeling

In [60]:
y = data['Language']
x = df
y

0        English
1        English
2        English
3        English
4        English
          ...   
10332    Kannada
10333    Kannada
10334    Kannada
10335    Kannada
10336    Kannada
Name: Language, Length: 10337, dtype: object

In [56]:
x_train ,x_test, y_train, y_test = train_test_split(x,y,random_state=42)

In [57]:
from pandas.io.parsers.readers import TextFileReader
vect = TfidfVectorizer(ngram_range=(1,3),analyzer='char')

In [61]:
mod = pipeline.Pipeline([('vectorizer', vect),('clf', LogisticRegression())])

In [62]:
mod.fit(x_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('clf', LogisticRegression())])

# Result

In [63]:
prediction = mod.predict(x_test)

In [65]:
print("Accuracy:",accuracy_score(y_test,prediction))

Accuracy: 0.9825918762088974


In [66]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

      Arabic       0.99      1.00      1.00       134
      Danish       0.95      0.97      0.96        97
       Dutch       0.99      0.95      0.97       139
     English       0.96      0.99      0.98       364
      French       0.99      0.99      0.99       269
      German       0.98      0.97      0.97       116
       Greek       1.00      1.00      1.00        86
       Hindi       1.00      1.00      1.00        13
     Italian       0.98      0.97      0.97       180
     Kannada       1.00      1.00      1.00        78
   Malayalam       1.00      0.99      1.00       158
  Portugeese       0.98      0.97      0.97       170
     Russian       1.00      1.00      1.00       171
     Spanish       0.99      0.98      0.98       206
    Sweedish       0.97      0.96      0.97       162
       Tamil       1.00      1.00      1.00       114
     Turkish       0.98      0.99      0.98       128

    accuracy              

# Creating Dataset

In [70]:
train = pd.DataFrame(x_train,columns=['Text'])
train['Language'] = y_train
train.to_csv('LD_train.csv')

In [72]:
test = pd.DataFrame(x_test,columns=['Text'])
test.to_csv('LD_test.csv')

In [74]:
ans = pd.DataFrame(y_test, columns= ['Language'])
ans['Prediction'] = prediction
ans.to_csv('LD_answer.csv')
