In [50]:
import pandas as pd
import numpy as np
import plotly.express as px

In [56]:
file_path = "B:\_GITHUB\Data-Science-Projects\language detection\dataset\data.csv"
data = pd.read_csv(file_path)
# data.rename(columns = {'Text' : 'text'}, inplace = True, errors = 'raise')
data.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [57]:
# checking for null values
data.isnull().sum()

Text        0
language    0
dtype: int64

In [58]:
# checking data count
data['language'].value_counts()

Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: language, dtype: int64

In [59]:
# checking total language number
unique_language = data['language'].unique()
len(unique_language)

22

In [61]:
# visualizing data count
value = data['language'].value_counts()
data_index = value.index
fig = px.pie(data, values = value, names = data_index, hole = 0.5, width = 750, height = 500)
fig.update_layout(title = 'Language data distribution ', title_x = 0.5)
fig.show()


#### **Model Selection**

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

x = np.array(data['Text'])
y = np.array(data['language'])

cv = CountVectorizer()
X = cv.fit_transform(x)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.33, random_state = 25)

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

In [65]:
# initiatialize models
bernouilli_model = BernoulliNB()
decision_tree_model = DecisionTreeClassifier()
kneighbors_model = KNeighborsClassifier()
logistic_regression_model = LogisticRegression()

# fitting models
bernouilli_fit = bernouilli_model.fit(xtrain, ytrain)
decision_tree_fit = decision_tree_model.fit(xtrain, ytrain)
kneighbors_fit = kneighbors_model.fit(xtrain, ytrain)
logistic_regression_fit = logistic_regression_model.fit(xtrain, ytrain)

model_performance = {

                    "Models" : ['Bernouilli Classifier', 'decision_tree', 'kneighbors', 'logistic_regression'],
                    "Performance" : [bernouilli_fit.score(xtest, ytest), decision_tree_fit.score(xtest, ytest), kneighbors_fit.score(xtest, ytest), 
                                      logistic_regression_fit.score(xtest, ytest)]
                    }

models_df = pd.DataFrame(model_performance)
models_df.sort_values(by=["Performance"], ascending=True)
# print(models_df.to_markdown())
models_df

Unnamed: 0,Models,Performance
0,Bernouilli Classifier,0.903719
1,decision_tree,0.893664
2,kneighbors,0.523967
3,logistic_regression,0.946143


In [71]:
# user testing
user_input = input("Enter your text: ")
user_data = cv.transform([user_input]).toarray()
models = [bernouilli_fit, decision_tree_fit, kneighbors_fit, logistic_regression_fit]
output = [model.predict(user_data) for model in models]
for idx, out in enumerate(output):
    print(f" {models[idx]} : {out}")

 BernoulliNB() : ['Chinese']
 DecisionTreeClassifier() : ['Dutch']
 KNeighborsClassifier() : ['Japanese']
 LogisticRegression() : ['Japanese']


### **conclusion**
We implemented various Machine Learning algorithms, in order to detect languages based on the language.
As we can see, the models do not perform well. We will design deep learning models. 