In [48]:
# !pip install pandas
# !pip install numpy 
# !pip install scikit-learn


In [49]:
import pandas as pd
import numpy as np
import tqdm

In [50]:
#import dataset
data = pd.read_csv('language_dataset.csv')

In [67]:
#check first 10 rows
data.head(10)

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
5,エノが行きがかりでバスに乗ってしまい、気分が悪くなった際に助けるが、今すぐバスを降りたいと運...,Japanese
6,tsutinalar i̇ngilizce tsuutina kanadada albert...,Turkish
7,müller mox figura centralis circulorum doctoru...,Latin
8,برقی بار electric charge تمام زیرجوہری ذرات کی...,Urdu
9,シャーリー・フィールドは、サン・ベルナルド・アベニュー沿い市民センターとrtマーティン高校に...,Japanese


In [66]:
#check for the shape of dataset and missing values
data.shape
data.isnull().sum()


Text        0
language    0
dtype: int64

In [52]:
#drop unnecessary column
data.drop(columns='Unnamed: 0', inplace=True)
data.columns

Index(['Text', 'language'], dtype='object')

In [None]:
#check for number of observations per language
data['language'].value_counts()

#### Feature Extraction

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [68]:
#separate X and Y features
x_data = data[['Text']]
y_data = data[['language']]
#extract X features and check new shape
vectorizer = CountVectorizer(analyzer = 'char' , ngram_range=(1,3))
x_data_vectorized = vectorizer.fit_transform(x_data['Text'])
x_data_vectorized.shape



(22000, 655714)

In [71]:
#check vocabulary list
print(vectorizer.vocabulary_)

{'k': 37835, 'l': 38801, 'e': 30980, 'm': 40205, 'n': 41616, 't': 49591, ' ': 0, 'g': 33633, 'o': 43366, 'w': 53120, 'a': 25612, 'd': 29762, 'i': 35608, 's': 47829, 'u': 51036, 'r': 46088, 'h': 34654, 'p': 44878, 'j': 37271, 'ä': 58204, 'v': 52307, 'z': 55049, 'í': 59351, '–': 163877, 'b': 27659, 'kl': 38199, 'le': 39146, 'em': 31793, 'me': 40551, 'en': 31843, 'nt': 42591, 't ': 49592, ' g': 828, 'go': 34120, 'ot': 44329, 'tt': 50363, 'tw': 50477, 'wa': 53205, 'al': 26372, 'ld': 39104, 'di': 30177, 'i ': 35609, ' s': 1413, 'su': 48838, 'ur': 51729, 'rn': 46843, 'nu': 42662, 'uk': 51485, 'ke': 38048, 'eh': 31607, 'ha': 34813, 'a ': 25613, ' p': 1291, 'pa': 44989, 'ls': 39574, 'sa': 48132, 'am': 26436, 'ee': 31498, 'er': 32007, 'ri': 46661, 'it': 36666, 'ti': 50060, ' n': 1199, 'ni': 42278, 'in': 36380, 'ng': 42188, 'g ': 33634, 'ai': 26256, 'ig': 36116, 'gu': 34262, 'ut': 51849, 'ta': 49815, 'at': 26796, ' m': 1129, 'ma': 40420, 'au': 26859, 'us': 51787, 'so': 48585, 'ol': 43909, 'eu': 

In [72]:
#split X and Y data with balanced number of observations in target variable
x_train, x_test, y_train, y_test = train_test_split(x_data_vectorized, y_data,
                                                     test_size= 0.3, 
                                                     random_state=34, 
                                                     stratify = y_data
                                                     )

#### Model Training

In [73]:
#import all modules
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report,f1_score
from tqdm import tqdm

In [None]:
#hyperparameter tuning for RF classifier
param_grid = {'max_depth': [5, 8, None],
              'max_features' : [5, 7, "auto"],
              'n_estimators': [50, 100, 150],
              'min_samples_split': [2, 8]
              }

new_rf = RandomForestClassifier(random_state= 12)
cv = GridSearchCV(estimator=new_rf, 
                  param_grid= param_grid,
                    cv= 5)
cv.fit(x_train, y_train['language'])
cv.best_estimator_

In [74]:
#specify model dictionary
models = {
     'Logistic Regression' : LogisticRegression(random_state=45,
                                                solver = 'saga'),
     'Multinomial NB' : MultinomialNB(),
     'SVC' : SVC(random_state=21),
     'RF Classifier' : RandomForestClassifier(max_features=5,
                                              min_samples_split=8,
                                              n_estimators=150,
                                              random_state=12)
}



In [75]:
for i in tqdm(range(len(list(models)))) :
    #fit model
    model_object = list(models.values())[i]
    model = model_object.fit(x_train, y_train['language'])
    

    #predict and collect metrics
    y_predicted = model.predict(x_test)
    model_accuracy = accuracy_score(y_test, y_predicted)
    model_precision = precision_score(y_test, y_predicted, average= "weighted")
    model_recall = recall_score(y_test, y_predicted, average= "weighted")
    model_f1_score = f1_score(y_test, y_predicted, average= "weighted" )
    
    #print metrics
    print(list(models.keys())[i])
    print(f"Accuracy : {model_accuracy}")
    print (f"Precision: {model_precision}")
    print(f"Recall : {model_recall}")
    print(f"F1 score : {model_f1_score}" )
    print('=' *35)
    print('\n')

 25%|██▌       | 1/4 [02:36<07:50, 156.96s/it]

Logistic Regression
Accuracy : 0.9810606060606061
Precision: 0.9825992647532256
Recall : 0.9810606060606061
F1 score : 0.9814505624348416




 50%|█████     | 2/4 [02:38<02:10, 65.32s/it] 

Multinomial NB
Accuracy : 0.9775757575757575
Precision: 0.9831285046483718
Recall : 0.9775757575757575
F1 score : 0.9789381501711459




 75%|███████▌  | 3/4 [06:08<02:11, 131.36s/it]

SVC
Accuracy : 0.9760606060606061
Precision: 0.9792461687781926
Recall : 0.9760606060606061
F1 score : 0.976890736561886




100%|██████████| 4/4 [07:38<00:00, 114.55s/it]

RF Classifier
Accuracy : 0.9827272727272728
Precision: 0.9853415339291794
Recall : 0.9827272727272728
F1 score : 0.9833582887561794







### Fit final model

In [77]:
#fit MultinomialNB model
MNB = MultinomialNB()
MNB_model = MNB.fit(x_train, y_train['language'])
MNB_lang_predict = MNB_model.predict(x_test)
MNB_lang_performance = classification_report(y_test, MNB_lang_predict)
print(accuracy_score(y_test, MNB_lang_predict))
print(MNB_lang_performance)

0.9775757575757575
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       300
     Chinese       0.99      0.98      0.98       300
       Dutch       0.98      0.98      0.98       300
     English       0.71      1.00      0.83       300
    Estonian       1.00      0.95      0.97       300
      French       0.98      1.00      0.99       300
       Hindi       1.00      0.97      0.98       300
  Indonesian       0.99      0.96      0.98       300
    Japanese       1.00      0.98      0.99       300
      Korean       1.00      0.99      0.99       300
       Latin       1.00      0.90      0.95       300
     Persian       1.00      1.00      1.00       300
   Portugese       1.00      0.95      0.97       300
      Pushto       1.00      0.95      0.97       300
    Romanian       1.00      0.98      0.99       300
     Russian       0.99      0.99      0.99       300
     Spanish       1.00      0.98      0.99       300
     Swe

In [78]:
#fit Logistic Regression model
LR_model = LogisticRegression(random_state=45, solver = 'saga').fit(x_train, y_train['language'])
LR_lang_predict = LR_model.predict(x_test)
LR_lang_performance = classification_report(y_test, LR_lang_predict)
print(accuracy_score(y_test, LR_lang_predict))
print(LR_lang_performance)



0.9810606060606061
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       300
     Chinese       0.99      0.98      0.99       300
       Dutch       0.98      0.99      0.98       300
     English       0.82      0.98      0.89       300
    Estonian       0.99      0.96      0.98       300
      French       0.99      1.00      0.99       300
       Hindi       1.00      0.97      0.98       300
  Indonesian       0.98      0.98      0.98       300
    Japanese       1.00      0.98      0.99       300
      Korean       1.00      0.99      0.99       300
       Latin       0.93      0.93      0.93       300
     Persian       1.00      1.00      1.00       300
   Portugese       0.98      0.95      0.96       300
      Pushto       0.99      0.96      0.97       300
    Romanian       1.00      0.98      0.99       300
     Russian       1.00      0.99      0.99       300
     Spanish       0.99      0.98      0.98       300
     Swe

In [79]:
#fit Random Forest Classifier model
RF_model = RandomForestClassifier(max_features=5,
                                  min_samples_split=8,
                                  n_estimators=150,
                                  random_state=12)
RF_model.fit(x_train, y_train['language'])
RF_lang_predict = RF_model.predict(x_test)
RF_lang_performance = classification_report(y_test, RF_lang_predict)
print(accuracy_score(y_test, RF_lang_predict))
print(RF_lang_performance)

0.9827272727272728
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       300
     Chinese       0.99      0.98      0.99       300
       Dutch       0.98      0.99      0.99       300
     English       0.78      0.99      0.88       300
    Estonian       1.00      0.96      0.98       300
      French       0.98      1.00      0.99       300
       Hindi       1.00      0.97      0.99       300
  Indonesian       1.00      0.98      0.99       300
    Japanese       1.00      0.98      0.99       300
      Korean       1.00      0.99      0.99       300
       Latin       0.97      0.93      0.95       300
     Persian       1.00      1.00      1.00       300
   Portugese       0.99      0.95      0.97       300
      Pushto       1.00      0.96      0.98       300
    Romanian       1.00      0.98      0.99       300
     Russian       0.99      0.99      0.99       300
     Spanish       0.99      0.98      0.99       300
     Swe

In [80]:
#fit Support Vector Classifier model 
SVC_model = SVC(random_state=21)
SVC_model.fit(x_train, y_train['language'])
SVC_lang_predict = SVC_model.predict(x_test)
SVC_lang_performance = classification_report(y_test, SVC_lang_predict)
print(accuracy_score(y_test, SVC_lang_predict))
print(SVC_lang_performance)

0.9760606060606061
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       300
     Chinese       0.98      0.97      0.97       300
       Dutch       0.97      0.99      0.98       300
     English       0.76      0.98      0.86       300
    Estonian       0.99      0.95      0.97       300
      French       0.99      0.99      0.99       300
       Hindi       1.00      0.97      0.98       300
  Indonesian       1.00      0.95      0.97       300
    Japanese       1.00      0.97      0.99       300
      Korean       1.00      0.99      0.99       300
       Latin       0.90      0.94      0.92       300
     Persian       1.00      1.00      1.00       300
   Portugese       0.98      0.94      0.96       300
      Pushto       1.00      0.96      0.98       300
    Romanian       1.00      0.98      0.99       300
     Russian       0.99      0.99      0.99       300
     Spanish       0.99      0.97      0.98       300
     Swe

### Testing model on new dataset

In [81]:
#function to predict a dataframe of text using MNB model
def predict_dataset_MNB(dataset):
 data_detected = False
 try:
  #try to read as csv
  new_test_data = pd.read_csv(dataset + '.csv', delimiter= ",")
  data_detected = True
 except FileNotFoundError:
   try:
   #try to read as txt
     new_test_data = pd.read_csv(dataset + '.txt', delimiter= ",")
     data_detected = True

   finally:
     if data_detected == True:
      new_data_x = new_test_data['text']
      new_data_y = new_test_data['language']
      new_data_vectorized = vectorizer.transform(new_data_x)
      new_predictions_NB = MNB_model.predict(new_data_vectorized)
      accuracy = accuracy_score(new_data_y, new_predictions_NB)
      classification_report_NB = classification_report(new_data_y, new_predictions_NB, zero_division = 0.0)
      print(accuracy)
      print(classification_report_NB)


In [82]:
#function to predict a dataframe of text using LR model
def predict_dataset_LR(dataset):
 data_detected = False
 try:
  #try to read as csv
  new_test_data = pd.read_csv(dataset + '.csv', delimiter= ",")
  data_detected = True
 except FileNotFoundError:
   try:
   #try to read as txt
     new_test_data = pd.read_csv(dataset + '.txt', delimiter= ",")
     data_detected = True

   finally:
     if data_detected == True:
      new_data_x = new_test_data['text']
      new_data_y = new_test_data['language']
      new_data_vectorized = vectorizer.transform(new_data_x)
      new_predictions_LR = LR_model.predict(new_data_vectorized)
      accuracy = accuracy_score(new_data_y, new_predictions_LR)
      classification_report_LR = classification_report(new_data_y, new_predictions_LR, zero_division = 0.0)
      print(accuracy)
      print(classification_report_LR)


In [83]:
#function to predict a dataframe of text using RF model
def predict_dataset_RF(dataset):
 data_detected = False
 try:
  #try to read as csv
  new_test_data = pd.read_csv(dataset + '.csv', delimiter= ",")
  data_detected = True
 except FileNotFoundError:
   try:
   #try to read as txt
     new_test_data = pd.read_csv(dataset + '.txt', delimiter= ",")
     data_detected = True

   finally:
     if data_detected == True:
      new_data_x = new_test_data['text']
      new_data_y = new_test_data['language']
      new_data_vectorized = vectorizer.transform(new_data_x)
      new_predictions_RF = RF_model.predict(new_data_vectorized)
      accuracy = accuracy_score(new_data_y, new_predictions_RF)
      classification_report_RF = classification_report(new_data_y, new_predictions_RF, zero_division = 0.0)
      print(accuracy)
      print(classification_report_RF)


In [84]:
#function to predict a dataframe of text using SVC model
def predict_dataset_SVC(dataset):
 data_detected = False
 try:
  #try to read as csv
  new_test_data = pd.read_csv(dataset + '.csv', delimiter= ",")
  data_detected = True
 except FileNotFoundError:
  try:
   #try to read as txt
   new_test_data = pd.read_csv(dataset + '.txt', delimiter= ",")
   data_detected = True

  finally:
   if data_detected == True:
    new_data_x = new_test_data['text']
    new_data_y = new_test_data['language']
    new_data_vectorized = vectorizer.transform(new_data_x)
    new_predictions_SVC = SVC_model.predict(new_data_vectorized)
    accuracy = accuracy_score(new_data_y, new_predictions_SVC)
    classification_report_SVC = classification_report(new_data_y, new_predictions_SVC, zero_division = 0.0)
    print(accuracy)
    print(classification_report_SVC)

In [85]:
#predict new dataset with MNB
predict_dataset_MNB('multilingual-100')



0.9545454545454546
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00         6
     Chinese       1.00      1.00      1.00         6
       Dutch       1.00      1.00      1.00         6
     English       1.00      1.00      1.00         6
    Estonian       1.00      1.00      1.00         6
      French       1.00      1.00      1.00         6
       Hindi       1.00      1.00      1.00         6
  Indonesian       1.00      1.00      1.00         6
    Japanese       1.00      1.00      1.00         6
      Korean       1.00      1.00      1.00         6
       Latin       1.00      1.00      1.00         6
     Persian       1.00      1.00      1.00         6
   Portugese       0.00      0.00      0.00         0
  Portuguese       0.00      0.00      0.00         6
      Pushto       1.00      1.00      1.00         6
    Romanian       1.00      1.00      1.00         6
     Russian       1.00      1.00      1.00         6
     Spa

In [86]:
#predict new dataset with LR
predict_dataset_LR('multilingual-100')


0.9090909090909091
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00         6
     Chinese       1.00      1.00      1.00         6
       Dutch       1.00      1.00      1.00         6
     English       1.00      0.67      0.80         6
    Estonian       1.00      1.00      1.00         6
      French       1.00      0.83      0.91         6
       Hindi       1.00      1.00      1.00         6
  Indonesian       1.00      1.00      1.00         6
    Japanese       1.00      1.00      1.00         6
      Korean       1.00      1.00      1.00         6
       Latin       0.62      0.83      0.71         6
     Persian       0.75      1.00      0.86         6
   Portugese       0.00      0.00      0.00         0
  Portuguese       0.00      0.00      0.00         6
      Pushto       1.00      0.67      0.80         6
    Romanian       1.00      1.00      1.00         6
     Russian       1.00      1.00      1.00         6
     Spa

In [87]:
#predict new dataset with RF
predict_dataset_RF('multilingual-100')

0.8560606060606061
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00         6
     Chinese       1.00      0.17      0.29         6
       Dutch       1.00      1.00      1.00         6
     English       1.00      0.83      0.91         6
    Estonian       0.55      1.00      0.71         6
      French       1.00      1.00      1.00         6
       Hindi       1.00      1.00      1.00         6
  Indonesian       1.00      0.83      0.91         6
    Japanese       0.55      1.00      0.71         6
      Korean       1.00      1.00      1.00         6
       Latin       0.83      0.83      0.83         6
     Persian       0.86      1.00      0.92         6
   Portugese       0.00      0.00      0.00         0
  Portuguese       0.00      0.00      0.00         6
      Pushto       1.00      0.83      0.91         6
    Romanian       1.00      1.00      1.00         6
     Russian       1.00      1.00      1.00         6
     Spa

In [88]:
#predict new dataset with SVC
predict_dataset_SVC('multilingual-100')

0.09090909090909091
              precision    recall  f1-score   support

      Arabic       0.00      0.00      0.00         6
     Chinese       0.05      1.00      0.09         6
       Dutch       0.00      0.00      0.00         6
     English       0.00      0.00      0.00         6
    Estonian       1.00      0.17      0.29         6
      French       0.00      0.00      0.00         6
       Hindi       0.00      0.00      0.00         6
  Indonesian       0.00      0.00      0.00         6
    Japanese       0.00      0.00      0.00         6
      Korean       0.00      0.00      0.00         6
       Latin       0.00      0.00      0.00         6
     Persian       0.00      0.00      0.00         6
  Portuguese       0.00      0.00      0.00         6
      Pushto       0.00      0.00      0.00         6
    Romanian       0.00      0.00      0.00         6
     Russian       0.00      0.00      0.00         6
     Spanish       0.00      0.00      0.00         6
     Sw

In [89]:
#function to predict a single text
def predict_text_MNB(text):
    text = vectorizer.transform([text])
    prediction = MNB_model.predict(text)
    print(prediction)

In [90]:
def predict_text_LR(text):
    text = vectorizer.transform([text])
    prediction = LR_model.predict(text)
    print(prediction)

In [91]:
def predict_text_RF(text):
    text = vectorizer.transform([text])
    prediction = RF_model.predict(text)
    print(prediction)

In [92]:
def predict_text_SVC(text):
    text = vectorizer.transform([text])
    prediction = SVC_model.predict(text)
    print(prediction)

In [93]:
text = "如"
predict_text_LR(text)
predict_text_MNB(text)
predict_text_RF(text)
predict_text_SVC(text)


['Korean']
['Chinese']
['Dutch']
['Chinese']
