In [None]:
# Dependencies used by this notebook. Uncomment to install them; `%pip` installs
# into the environment of the running kernel.
# %pip install pandas
# %pip install numpy
# %pip install scikit-learn
# %pip install tqdm


In [28]:
import pandas as pd
import numpy as np
import tqdm

In [17]:
# Load the labelled text corpus from the CSV file that sits next to this notebook.
data = pd.read_csv(filepath_or_buffer="language_dataset.csv")

MINOR EDA (EXPLORATORY DATA ANALYSIS)

In [18]:
#check the shape of the dataset and the number of missing values per column
# A notebook cell only displays its last expression, so the shape is printed
# explicitly; as a bare expression on an earlier line it was evaluated and discarded.
print(data.shape)
data.isnull().sum()

Unnamed: 0    0
Text          0
language      0
dtype: int64

In [19]:
#drop unnecessary column
# Reassign rather than mutate in place, and ignore a missing label, so the cell can
# be re-run safely: with inplace=True a second run raised KeyError because the
# column was already gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.columns

Index(['Text', 'language'], dtype='object')

In [20]:
# Class balance: count how many rows carry each label in the 'language' column.
language_counts = data['language'].value_counts()
language_counts

language
Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: count, dtype: int64

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


FEATURE ENGINEERING

In [22]:
#create the vectorizer that turns each text into token counts
# CountVectorizer is constructed with no arguments, i.e. with the library defaults.
# NOTE(review): the default analyzer is word-based, and the dataset includes Chinese,
# Japanese and Thai, which are written without spaces between words. Character
# n-grams (analyzer='char') may suit this task better — confirm before changing,
# since it alters every downstream result.
vectorizer = CountVectorizer()


In [23]:
# Separate the input text from the target label. Double brackets keep each
# selection as a single-column DataFrame rather than a Series.
x_data, y_data = data[['Text']], data[['language']]

In [24]:
#extract X features and check new shape
# fit_transform learns the vocabulary from every row of x_data['Text'] and returns
# the token-count matrix for those same rows.
# NOTE(review): this fit runs before the train/test split further down, so the
# vocabulary also contains tokens that occur only in rows that end up in the test
# set. Consider splitting first and fitting on the training rows only — confirm
# whether this affects the reported metrics.
x_data_vectorized = vectorizer.fit_transform(x_data['Text'])
x_data_vectorized.shape

(22000, 277720)

In [25]:
# Hold out 30% of the rows for evaluation. Stratifying on the target keeps the
# label proportions the same in both partitions; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data_vectorized,
    y_data,
    test_size=0.3,
    random_state=34,
    stratify=y_data,
)

In [30]:
#import all modules
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix,f1_score
from tqdm import tqdm

In [27]:
#specify model dictionary
# Keys are the display names printed by the evaluation loop; values are unfitted
# estimators. All use library defaults except the random forest, which is seeded
# (same seed as the train/test split) so that re-running the notebook reproduces
# its metrics instead of producing slightly different numbers on every run.
models = {
    'Logistic Regression': LogisticRegression(),
    'Multinomial NB': MultinomialNB(),
    'SVC': SVC(),
    'RF Classifier': RandomForestClassifier(random_state=34),
}


In [36]:
# Fit every candidate model on the training split and print weighted metrics
# computed on the held-out split.
# The labels are pulled out once as 1-D Series so that fit() and the metric
# functions all receive the same kind of object (previously the metrics were
# handed the single-column DataFrame).
y_train_labels = y_train['language']
y_test_labels = y_test['language']

# Iterate over (name, estimator) pairs directly instead of indexing into freshly
# built lists of keys/values on every iteration.
for model_name, model_object in tqdm(models.items()):
    #fit model; `model` is the fitted estimator returned by fit()
    model = model_object.fit(x_train, y_train_labels)

    #predict and collect metrics
    y_predicted = model.predict(x_test)
    model_accuracy = accuracy_score(y_test_labels, y_predicted)
    # "weighted" averages the per-class scores weighted by each class's support.
    model_precision = precision_score(y_test_labels, y_predicted, average="weighted")
    model_recall = recall_score(y_test_labels, y_predicted, average="weighted")
    model_f1_score = f1_score(y_test_labels, y_predicted, average="weighted")

    #print metrics
    print(model_name)
    print(f"Accuracy : {model_accuracy}")
    print(f"Precision: {model_precision}")
    print(f"Recall : {model_recall}")
    print(f"F1 score : {model_f1_score}")
    print('=' * 35)
    print('\n')

 25%|██▌       | 1/4 [00:29<01:29, 29.82s/it]

Logistic Regression
Accuracy : 0.9466666666666667
Precision: 0.9597950007946767
Recall : 0.9466666666666667
F1 score : 0.9490172242528225




 50%|█████     | 2/4 [00:30<00:25, 12.55s/it]

Multinomial NB
Accuracy : 0.9554545454545454
Precision: 0.9643046553216775
Recall : 0.9554545454545454
F1 score : 0.9552240505289885




 75%|███████▌  | 3/4 [01:29<00:34, 34.07s/it]

SVC
Accuracy : 0.894090909090909
Precision: 0.9196356565331157
Recall : 0.894090909090909
F1 score : 0.9003272492240816




100%|██████████| 4/4 [04:31<00:00, 67.80s/it]

RF Classifier
Accuracy : 0.9198484848484848
Precision: 0.9519753493117074
Recall : 0.9198484848484848
F1 score : 0.9182399733194789







TESTING MODELS ON A NEW TEXT

In [None]:

# Sample sentence used to spot-check the models on text outside the dataset.
# The cell previously held only commented-out code, so `text` and `new_text`
# were never defined when the notebook was run from a fresh kernel.
text = '나는 센 강을 따라 걸으며 파리의 아름다움을 감상한다. 도시의 불빛이 밤에 별처럼 반짝인다'
# Use transform (not fit_transform) so the sentence is encoded with the vocabulary
# the vectorizer has already learned.
new_text = vectorizer.transform([text])

In [60]:
#multinomial naive bayes
# NOTE(review): the original line called a bare `predict(new_text)` without naming
# an estimator. The estimator fitted in the training loop is stored under
# models['Multinomial NB']; uncomment once `new_text` is defined.
#models['Multinomial NB'].predict(new_text)

array(['Korean'], dtype='<U10')

In [None]:
#random forest
# NOTE(review): the original line used `RF_model`, which is not defined anywhere in
# the visible notebook. The estimator fitted in the training loop is stored under
# models['RF Classifier']; uncomment once `new_text` is defined.
#models['RF Classifier'].predict(new_text)

array(['Korean'], dtype=object)

In [67]:
#multinomial logistic regression
# NOTE(review): the original line used `multi_log`, which is not defined anywhere in
# the visible notebook. The estimator fitted in the training loop is stored under
# models['Logistic Regression']; uncomment once `new_text` is defined.
#models['Logistic Regression'].predict(new_text)

array(['Japanese'], dtype=object)

In [68]:
#support vector classifier
# NOTE(review): the original line used `SVC_model`, which is not defined anywhere in
# the visible notebook. The estimator fitted in the training loop is stored under
# models['SVC']; uncomment once `new_text` is defined.
#models['SVC'].predict(new_text)

array(['Japanese'], dtype=object)