In [15]:
# Import libraries

import string

import pandas as pd
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import metrics
from sklearn import pipeline
from sklearn.model_selection import train_test_split

In [16]:
# ASCII punctuation characters stripped from every sentence by clean_data.
# Taken from the standard library instead of a hand-typed literal so the set
# cannot drift through a typo; the value is the same 32-character string.
punctuation = string.punctuation

In [17]:
def read_csv(file_name):
    """Load a comma-separated file into a DataFrame.

    Parameters
    ----------
    file_name
        Passed straight through to ``pandas.read_csv`` as the source to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table; the first row of the file supplies the column names.
    """
    frame = pd.read_csv(file_name, header=0, sep=",")
    return frame

In [18]:
def clean_data(data):
    """Normalise one sentence: drop punctuation, then lower-case it.

    Parameters
    ----------
    data : str
        The raw sentence.

    Returns
    -------
    str
        ``data`` with every character found in the module-level
        ``punctuation`` string removed and the remainder lower-cased.
    """
    # A single translate() pass deletes all punctuation characters at once,
    # instead of rescanning the string once per character with replace().
    strip_table = str.maketrans("", "", punctuation)
    return data.translate(strip_table).lower()

In [19]:
def get_col(data, col_num):
    """Print ``data[col_num]`` to standard output.

    Despite the name, nothing is returned; the selected item is only printed.
    """
    selected = data[col_num]
    print(selected)

In [20]:
# Load the dataset and normalise its "Text" column with clean_data
# (punctuation removed, lower-cased).
data = read_csv("Language Detection.csv").assign(
    Text=lambda frame: frame["Text"].apply(clean_data)
)

In [21]:
# Positional split of the frame: first column holds the sentences,
# second column holds their language labels.
x, y = data.iloc[:, 0], data.iloc[:, 1]

In [22]:
# Create training and testing data
# A fixed seed makes the 80/20 split, and therefore the accuracy reported
# below, identical on every Restart & Run All. Without it the shuffle
# differs on each run.
SPLIT_SEED = 42
train_test = train_test_split(x, y, test_size=.2, random_state=SPLIT_SEED)
# train_test_split returns [x_train, x_test, y_train, y_test] in that order.
x_train, x_test, y_train, y_test = train_test

In [23]:
# Vectorise and create a model pipeline
# Features are TF-IDF weights over character unigrams and bigrams; the
# classifier is a logistic regression with default settings.
vector = feature_extraction.text.TfidfVectorizer(analyzer="char", ngram_range=(1, 2))
model_pipe = pipeline.Pipeline(
    steps=[
        ("vec", vector),
        ("clf", linear_model.LogisticRegression()),
    ]
)

In [24]:
# Fit the whole pipeline (vectoriser, then classifier) on the training split.
model_pipe.fit(x_train, y_train)

In [25]:
# Predict a language label for every sentence in the held-out test split.
predicted = model_pipe.predict(x_test)

In [26]:
# Share of test sentences whose predicted label equals the true label.
metrics.accuracy_score(y_test, predicted)

0.973404255319149

In [43]:
# Predict the language of each sentence in sentence.txt (one sentence per line)
# The file is only read, so it is opened read-only rather than "r+".
# An explicit encoding keeps non-ASCII text from depending on the platform
# default. NOTE(review): assumes sentence.txt is UTF-8 — confirm.
with open("sentence.txt", "r", encoding="utf-8") as file:
    sentences = file.readlines()

for sentence in sentences:
    sentence = sentence.strip()  # drop the trailing newline left by readlines()
    if not sentence:
        continue  # a blank line has no text to classify
    # The model was trained on clean_data-normalised text, so the input must
    # get the same treatment before it reaches the vectoriser.
    print(model_pipe.predict([clean_data(sentence)]))

['Portugeese']
['French']
['English']
['Dutch']
['Spanish']


In [14]:
# Class labels the fitted pipeline can predict.
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)