In [22]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

import warnings
warnings.simplefilter("ignore")

In [23]:
data = pd.read_csv('./Language Detection.csv')

In [24]:
data.head(10)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
5,"[2] In ancient philosophy, natura is mostly us...",English
6,"[3][4] \nThe concept of nature as a whole, the...",English
7,During the advent of modern scientific method ...,English
8,"[5][6] With the Industrial revolution, nature ...",English
9,"However, a vitalist vision of nature, closer t...",English


In [25]:
# value count for each language
data['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [26]:
X = data['Text']
y = data['Language']

In [27]:
# converting categorical variables to numerical

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

In [28]:
le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

### Text preprocessing

In [29]:
data_list = []

for text in X:
    # 특수문자, 숫자, 공백 등 제거
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    # 대괄호 [] 를 공백으로 대체
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    data_list.append(text)

### Train Test split

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

### Bag of Words

In [33]:
# creating bag of words using countvectorizer

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv.fit(X_train)

x_train = cv.transform(X_train).toarray()
x_test = cv.transform(X_test).toarray()

### Model creation and Prediction

In [34]:
# 베이즈 정리를 기반으로한 확률적인 분류 알고리즘
# Multinomial Naive Bayes 분류기는 텍스트 분류 등에서 자주 사용되며, 단어의 등장 빈도와 같은 이산적인 특성들을 가진 데이터를 처리하는 데 효과적
# 문서 분류, 스팸 분류 등 활용
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train, y_train)

In [35]:
y_pred = model.predict(x_test)

### Evaluation the model

In [37]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

In [38]:
print('Accuracy is :', ac)

Accuracy is : 0.9816247582205029


In [39]:
# classification report
print(cr)

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       113
           1       0.99      0.90      0.94        90
           2       0.99      0.98      0.99       116
           3       0.93      1.00      0.96       297
           4       1.00      0.98      0.99       200
           5       1.00      0.99      0.99       100
           6       1.00      0.99      0.99        83
           7       1.00      1.00      1.00        13
           8       0.98      0.97      0.98       135
           9       1.00      0.97      0.99        75
          10       0.99      1.00      1.00       111
          11       0.99      0.99      0.99       153
          12       1.00      0.99      1.00       128
          13       0.99      0.98      0.98       144
          14       0.95      0.99      0.97       133
          15       1.00      1.00      1.00        91
          16       1.00      0.95      0.98        86

    accuracy              

In [40]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('vectorizer', cv), ('multinomialNB', model)])
pipe.fit(X_train, y_train)

In [41]:
y_pred2 = pipe.predict(X_test)
ac2 = accuracy_score(y_test, y_pred2)
print('Accuracy is : ', ac2)

Accuracy is :  0.9816247582205029


In [42]:
with open('trained_pipeline-0.1.0.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [45]:
text = "Hello, how are you?"
# text = 'Ciao, come stai?'

y = pipe.predict([text])
le.classes_[y[0]], y



('English', array([3]))