# Project 4: Language Detection Using NLP

## Importing necessary libraries

In [2]:
import os
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Reading dataset

In [3]:
# Location of the dataset. Set the LANGUAGE_DETECTION_CSV environment variable
# to point at your own copy of the file; when it is unset, the original
# hard-coded local path is used so existing behavior is unchanged.
DATA_PATH = os.environ.get(
    "LANGUAGE_DETECTION_CSV",
    "C:\\Users\\adite\\Downloads\\archive\\Language Detection.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


## Text Cleaning

In [4]:
# Display the ASCII punctuation characters that remove_punctuations strips from the text.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
# User-defined helper for cleaning text.

# Built once at definition time: maps every ASCII punctuation character to "delete".
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuations(text):
    """Strip ASCII punctuation from ``text`` and lowercase the result.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        ``text`` with every character in ``string.punctuation`` removed and
        the remainder lowercased. Characters outside that ASCII set
        (including non-ASCII punctuation marks) are kept as they are.
    """
    # One translate() pass replaces 32 successive str.replace() scans of the text.
    return text.translate(_PUNCTUATION_TABLE).lower()

In [12]:
# Run the cleaning helper over every entry of the Text column.
df["Text"] = df["Text"].apply(remove_punctuations)

In [13]:
# Preview the first rows to check the Text column after cleaning.
df.head()

Unnamed: 0,Text,Language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,1 the word nature is borrowed from the old fre...,English


## Splitting Data

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Features: the Text column. Target: the Language label.
X = df["Text"]
y = df["Language"]

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [20]:
# Shapes of the four pieces produced by the split, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((8269,), (2068,), (8269,), (2068,))

## TF-IDF Vectorization

In [21]:
from sklearn import feature_extraction

In [23]:
# Character-level TF-IDF features built from single characters and character pairs.
vec = feature_extraction.text.TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 2),
)
vec

TfidfVectorizer(analyzer='char', ngram_range=(1, 2))

## Building the Model Pipeline

In [28]:
from sklearn import pipeline
from sklearn.linear_model import LogisticRegression

In [30]:
# Chain the vectorizer and the classifier so text can be passed straight to fit/predict.
NLP_model = pipeline.Pipeline(
    steps=[
        ("vec", vec),
        ("clf", LogisticRegression()),
    ]
)
NLP_model

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('clf', LogisticRegression())])

## Model Training and Evaluation

In [32]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split.
NLP_model.fit(X_train,y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('clf', LogisticRegression())])

In [37]:
# Predict on the same data the model was fitted on, used below for the training accuracy.
y_pred_train=NLP_model.predict(X_train)

In [38]:
from sklearn.metrics import accuracy_score as ac,confusion_matrix

In [42]:
# Share of training rows whose predicted language matches the label, as a percentage.
training_accuracy = 100 * ac(y_train, y_pred_train)
training_accuracy

98.59717015358568

In [40]:
# Predict on the held-out split, used below for the testing accuracy.
y_pred_test=NLP_model.predict(X_test)

In [43]:
# Share of held-out rows whose predicted language matches the label, as a percentage.
testing_accuracy = 100 * ac(y_test, y_pred_test)
testing_accuracy

97.96905222437138

## Testing the Model with New Samples

In [44]:
# Spot-check on a hand-written Latin-script sentence.
# NOTE(review): the raw string is passed as-is; remove_punctuations is not applied
# here, unlike the Text column the model was trained on.
NLP_model.predict(["My name is Aditee"])

array(['English'], dtype=object)

In [46]:
# Spot-check on a hand-written Devanagari-script sentence (raw, uncleaned input).
NLP_model.predict(["मेरा नाम आदिती है"])

array(['Hindi'], dtype=object)

In [47]:
# Spot-check on a hand-written Tamil-script sentence (raw, uncleaned input).
NLP_model.predict(["என் பெயர் அதிதி"])

array(['Tamil'], dtype=object)