# Language Detection Using NLP

## Importing necessary libraries

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

## Loading the dataset

In [102]:
# Location of the labelled corpus read below.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so the cell runs on other machines.
DATA_PATH = r"C:\Users\DELL\Desktop\Language-Detection-using-NLP\text.csv"

# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV
# carries a saved index -- confirm, and consider passing index_col=0.
df = pd.read_csv(DATA_PATH)
df.head(50)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
5,"[2] In ancient philosophy, natura is mostly us...",English
6,"[3][4] \nThe concept of nature as a whole, the...",English
7,During the advent of modern scientific method ...,English
8,"[5][6] With the Industrial revolution, nature ...",English
9,"However, a vitalist vision of nature, closer t...",English


## Text Cleaning

## Removing Punctuation

In [103]:
# Display the ASCII punctuation characters defined by the standard library.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [104]:
# User-defined helper for cleaning text.

# Translation table that deletes every ASCII punctuation character. It is built
# once so each call strips punctuation in a single pass over the text instead
# of one full-string replace per punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuations(text):
    """Return ``text`` lowercased with all ASCII punctuation removed.

    Only the characters in ``string.punctuation`` are stripped; punctuation
    outside that set (for example non-ASCII marks) is left untouched.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    return text.translate(_PUNCTUATION_TABLE).lower()

In [107]:
# Add a cleaned copy of every text (punctuation stripped, lowercased).
df["Cleaned_text"] = df["Text"].apply(remove_punctuations)

In [108]:
# Preview the first rows to compare Text with the Cleaned_text column.
df.head()

Unnamed: 0,Text,Language,Cleaned_text
0,"Nature, in the broadest sense, is the natural...",English,nature in the broadest sense is the natural p...
1,"""Nature"" can refer to the phenomena of the phy...",English,nature can refer to the phenomena of the physi...
2,"The study of nature is a large, if not the onl...",English,the study of nature is a large if not the only...
3,"Although humans are part of nature, human acti...",English,although humans are part of nature human activ...
4,[1] The word nature is borrowed from the Old F...,English,1 the word nature is borrowed from the old fre...


## Splitting Data

In [109]:
from sklearn.model_selection import train_test_split

In [112]:
# Features are the cleaned texts; the target is the language label.
X = df["Cleaned_text"]
y = df["Language"]

In [132]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=12,
)

In [133]:
# Show the sizes of the train/test features and labels produced by the split.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((9303,), (1034,), (9303,), (1034,))

## TFIDF and Vectorization

In [134]:
from sklearn import feature_extraction

In [135]:
# Character-level TF-IDF features built from single characters and character pairs.
vec = feature_extraction.text.TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 2),
)
vec

## Building the model pipeline

In [136]:
from sklearn import pipeline
from sklearn.linear_model import LogisticRegression

In [137]:
# Chain the vectorizer and the classifier so raw text can be passed straight
# to fit/predict.
NLP_model = pipeline.Pipeline(
    steps=[
        ("vec", vec),
        ("clf", LogisticRegression()),
    ]
)
NLP_model

## Model Training

In [138]:
# Fit the vectorizer and the classifier on the training split only.
NLP_model.fit(X_train,y_train)

In [139]:
# Predictions on the data the model was fitted on (used for the training score).
y_pred_train = NLP_model.predict(X_train)

In [140]:
from sklearn.metrics import accuracy_score as ac,confusion_matrix

In [141]:
# Training accuracy expressed as a percentage.
training_accuracy = 100 * ac(y_train, y_pred_train)
training_accuracy

98.58110287004193

In [142]:
# Predictions on the held-out split (used for the testing score).
y_pred_test = NLP_model.predict(X_test)

In [143]:
# Held-out accuracy expressed as a percentage.
testing_accuracy = 100 * ac(y_test, y_pred_test)
testing_accuracy

98.25918762088975

## Testing the model with new samples

In [153]:
# Classify a single hand-written sample.
# NOTE(review): the sample is passed as-is, while the model was trained on
# Cleaned_text (lowercased, ASCII punctuation removed). Arbitrary inputs should
# probably go through remove_punctuations first -- confirm.
NLP_model.predict(["hello everyone my name is chetana"])

array(['English'], dtype=object)

In [151]:
# Classify a single sample written in a non-Latin script.
# NOTE(review): the sample is passed as-is, while the model was trained on
# Cleaned_text (lowercased, ASCII punctuation removed). Arbitrary inputs should
# probably go through remove_punctuations first -- confirm.
NLP_model.predict(["छत्रपती शिवाजी महाराज की जय"])

array(['Hindi'], dtype=object)