In [30]:
# Optional dependency installation, left commented out; uncomment a line only
# if the corresponding package is missing from the environment.
# !pip install pandas
# !pip install numpy
# !pip install scikit-learn
# Shell escape (not Python): asks the shell which `python` executables it can
# find. NOTE(review): `where` is presumably the Windows command here (the
# recorded output shows Windows paths) - confirm before running elsewhere.
!where python


c:\Users\LENOVO\anaconda3\python.exe
C:\Users\LENOVO\AppData\Local\Microsoft\WindowsApps\python.exe


In [3]:
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns  
import matplotlib.pyplot as plt 

#### DATASET OVERVIEW

In [9]:
#import dataset
# Remove the row limit so displayed frames/series are never truncated
# (this option stays in effect for every later cell as well).
pd.set_option('display.max_rows', None)
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated path only resolved on Windows.
data = pd.read_csv('datasets/language_dataset.csv')
data.head(10)

Unnamed: 0.1,Unnamed: 0,Text,language
0,0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,4,de spons behoort tot het geslacht haliclona en...,Dutch
5,5,エノが行きがかりでバスに乗ってしまい、気分が悪くなった際に助けるが、今すぐバスを降りたいと運...,Japanese
6,6,tsutinalar i̇ngilizce tsuutina kanadada albert...,Turkish
7,7,müller mox figura centralis circulorum doctoru...,Latin
8,8,برقی بار electric charge تمام زیرجوہری ذرات کی...,Urdu
9,9,シャーリー・フィールドは、サン・ベルナルド・アベニュー沿い市民センターとrtマーティン高校に...,Japanese


In [10]:
#check for the shape of dataset and missing values
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly; a bare `data.shape` in the middle of the cell is discarded.
print(f"Shape : {data.shape}")
# Number of missing values per column
data.isnull().sum()

Unnamed: 0    0
Text          0
language      0
dtype: int64

In [11]:
#drop unnecessary columns
# Remove every auto-named 'Unnamed: ...' column (the recorded head() output
# shows both 'Unnamed: 0.1' and 'Unnamed: 0'), not just a single hard-coded one.
unnamed_columns = [col for col in data.columns if str(col).startswith('Unnamed')]
# Re-binding `data` instead of using inplace=True keeps the cell idempotent:
# on a second run the list is empty and nothing raises KeyError.
data = data.drop(columns=unnamed_columns)
data.columns

Index(['Text', 'language'], dtype='object')

In [13]:
#check for number of observations per language
# Correct the misspelled 'Portugese' value before counting, so it is reported
# under the properly spelled label.
data = data.replace(to_replace='Portugese', value='Portuguese')
# Rows per language label
data['language'].value_counts()

language
Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portuguese    1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: count, dtype: int64

#### FEATURE EXTRACTION

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [15]:
#separate X and Y features
# Raw text is the input; the language label is the target.
x_data, y_data = data['Text'], data['language']

#split X and Y data with balanced number of observations in target variable
# 30% of the rows are held out for testing; stratifying on the label keeps the
# per-language proportions the same in both splits, and the fixed seed makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=34,
    stratify=y_data,
)



In [16]:

#extract X features and check new shape
# Count character n-grams of length 1 to 4. The vocabulary is learned from the
# training split only and reused (via transform) for any other data.
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
)
x_train_vectorized = vectorizer.fit_transform(x_train)
# (number of training texts, number of distinct n-grams)
x_train_vectorized.shape

(15400, 1305683)

In [None]:
#check vocabulary list
# The vocabulary has one entry per feature column (1,305,683 in the recorded
# run), so printing the whole dict floods the notebook. Report its size and
# preview a small sample instead.
from itertools import islice

vocabulary = vectorizer.vocabulary_
print(f"Vocabulary size : {len(vocabulary)}")
# First 20 (n-gram -> column index) pairs, taken lazily without copying the dict
dict(islice(vocabulary.items(), 20))

#### MODEL TRAINING

In [17]:
#import all modules
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix,f1_score
from tqdm import tqdm

In [12]:
#specify model dictionary
# Maps a display name to an unfitted estimator; the evaluation loops fit and
# score each entry in insertion order.
models = {
     'Logistic Regression' : LogisticRegression(),
     'Multinomial NB' : MultinomialNB(),
     'SVC' : SVC(),
     # Seeded (same seed as the train/test split) so the forest's random
     # sampling, and therefore its reported metrics, are reproducible.
     'RF Classifier' : RandomForestClassifier(random_state=34)
}


In [None]:
#Train and evaluate train metrics
print('TRAINING SET METRICS')
# Iterate over the (name, estimator) pairs directly instead of rebuilding
# list(models.values()) / list(models.keys()) and indexing them on every pass.
for model_name, model_object in tqdm(models.items()) :
    #fit model
    model = model_object.fit(x_train_vectorized, y_train)

    #predict and collect metrics
    # Predictions on the same data the model was fitted on, so these numbers
    # describe fit quality on the training set, not generalisation.
    y_train_predicted = model.predict(x_train_vectorized)

    # Multi-class scores are averaged with weights proportional to class support
    model_train_accuracy = accuracy_score(y_train, y_train_predicted)
    model_train_precision = precision_score(y_train, y_train_predicted, average= "weighted")
    model_train_recall = recall_score(y_train, y_train_predicted, average= "weighted")
    model_train_f1_score = f1_score(y_train, y_train_predicted, average= "weighted" )

    #print metrics
    print(model_name)
    print(f"Accuracy : {model_train_accuracy:.4f}")
    print (f"Precision: {model_train_precision:.4f}")
    print(f"Recall : {model_train_recall:.4f}")
    print(f"F1 score : {model_train_f1_score:.4f}" )
    print('=' *35)
    print('\n')

In [None]:
#train and evaluate test metrics
print('TESTING SET METRICS')
# The vectorised test set does not depend on the model, so build it once
# instead of re-transforming x_test inside every loop iteration.
x_test_vectorized = vectorizer.transform(x_test)

# Iterate over the (name, estimator) pairs directly instead of rebuilding
# list(models.values()) / list(models.keys()) and indexing them on every pass.
for model_name, model_object in tqdm(models.items()) :
    #fit model
    # Each model is fitted here again, so this cell does not rely on the
    # training-metrics cell having been run first.
    model = model_object.fit(x_train_vectorized, y_train)

    #predict and collect metrics
    y_test_predicted = model.predict(x_test_vectorized)

    # Multi-class scores are averaged with weights proportional to class support
    model_test_accuracy = accuracy_score(y_test, y_test_predicted)
    model_test_precision = precision_score(y_test, y_test_predicted, average= "weighted")
    model_test_recall = recall_score(y_test, y_test_predicted, average= "weighted")
    model_test_f1_score = f1_score(y_test, y_test_predicted, average= "weighted" )

    #print metrics
    print(model_name)
    print(f"Accuracy : {model_test_accuracy:.4f}")
    print (f"Precision: {model_test_precision:.4f}")
    print(f"Recall : {model_test_recall:.4f}")
    print(f"F1 score : {model_test_f1_score:.4f}" )
    print('=' *35)
    print('\n')

#### TRAIN FINAL MODEL AND EVALUATE ON NEW DATASETS

In [18]:
#function to plot confusion matrix
def confusion(predicted_y, y):
  """Plot a confusion-matrix heatmap of predicted labels against actual labels.

  Parameters
  ----------
  predicted_y : array-like
      Labels predicted by the model.
  y : array-like
      Actual labels, in the same order as ``predicted_y``.

  Returns
  -------
  None
      The figure is rendered with ``plt.show()``.

  The axes cover every label found in either input. Taking labels from ``y``
  alone would make ``confusion_matrix`` ignore samples whose predicted label
  never occurs in ``y``, hiding those errors from the plot.
  """
  plt.figure(figsize=(15,10))
  # Sorted union of actual and predicted labels
  languages = np.unique(np.concatenate([np.asarray(y), np.asarray(predicted_y)]))
  cm2 = confusion_matrix(y, predicted_y, labels= languages)
  sns.heatmap(cm2, annot=True, fmt='d', cmap='Blues')
  # Heatmap cells are one unit wide, so +0.5 centres each tick on its cell
  plt.xticks(np.arange(len(languages)) + 0.5, languages, rotation= 45)
  plt.yticks(np.arange(len(languages)) + 0.5, languages, rotation = 360)
  plt.xlabel('Predicted')
  plt.ylabel('Actual')
  # Run the layout pass after the axis labels exist so they are accounted for
  plt.tight_layout()
  plt.show()

In [None]:
#train and evaluate MNB model
# Encode the held-out texts with the vocabulary learned from the training split
x_test_vectorized = vectorizer.transform(x_test)

# Fit Multinomial Naive Bayes on the training n-gram counts
MNB_object = MultinomialNB()
MNB_model = MNB_object.fit(x_train_vectorized, y_train)

# Score the held-out split, then plot where the predictions go wrong
mnb_predicted = MNB_model.predict(x_test_vectorized)
mnb_accuracy = accuracy_score(y_true=y_test, y_pred=mnb_predicted)
print(f"Accuracy : {mnb_accuracy:.4f}")
confusion(predicted_y=mnb_predicted, y=y_test)

In [43]:
#function to check the accuracy and confusion matrix of the model on a dataframe of text
# NOTE: the definition below is wrapped in a triple-quoted string, so running
# this cell only evaluates a string literal - predict_dataset_MNB is NOT defined.
# As written, the quoted code would try to read datasets\<dataset>.csv and then
# datasets\<dataset>.txt, print 'File not found' and return if neither exists,
# and otherwise print the accuracy of MNB_model on the file's 'text' /
# 'language' columns and plot the confusion matrix.
'''def predict_dataset_MNB(dataset):
  new_test_data = None
  for ext in ['.csv', '.txt']:
    try:
      new_test_data = pd.read_csv("datasets\\" + dataset + ext,   delimiter= ",") 
      break #exit loop if file is found
    except FileNotFoundError:
      pass #continue to check next extension
  if new_test_data is None:   #if file is not found
      print('File not found')
      return #exit function
  new_data_x = new_test_data['text']
  new_data_y = new_test_data['language']
  new_data_vectorized = vectorizer.transform(new_data_x)
  new_predictions_NB = MNB_model.predict(new_data_vectorized)
  accuracy = accuracy_score(new_data_y, new_predictions_NB)
  print(f"Accuracy : {accuracy:.4f}")
  confusion(new_predictions_NB, new_data_y) '''

In [25]:
#function to predict a single text
# NOTE: the definition below is wrapped in a triple-quoted string, so running
# this cell only evaluates a string literal - predict_text_MNB is NOT defined.
# As written, the quoted code would vectorise one text with the fitted
# vectorizer and print the label predicted by MNB_model.
'''def predict_text_MNB(text):
    text = vectorizer.transform([text])
    prediction = MNB_model.predict(text)
    print(prediction) '''

In [None]:
#use function to check new dataset metrics
# NOTE: these calls are wrapped in a triple-quoted string, so nothing is
# executed here; the cell only evaluates a string literal.
'''predict_dataset_MNB('multilingual-100')
predict_dataset_MNB('multilingual-sentences-dataset')'''

In [None]:
#load new dataset and check metrics
new_data_2 = pd.read_csv('multilingual-sentences-dataset.txt', delimiter=',')
# Bare expressions in the middle of a cell are not rendered, so the preview
# and the shape are shown explicitly (`display` is provided by the notebook).
display(new_data_2.head())

new_data_2_x = new_data_2['text']
new_data_2_y = new_data_2['language']

# Reuse the vocabulary fitted on the training split; n-grams unseen during
# training have no column and are therefore not counted.
new_data_2_x_vectorized = vectorizer.transform(new_data_2_x)
print(f"Vectorized shape : {new_data_2_x_vectorized.shape}")

# Evaluate the already-fitted MNB model on the new data
new_data_2_predicted = MNB_model.predict(new_data_2_x_vectorized)
new_accuracy = accuracy_score(new_data_2_y, new_data_2_predicted)
print(f"Accuracy : {new_accuracy:.4f}")
confusion(new_data_2_predicted, new_data_2_y)

In [None]:
#create dataframe for misclassified languages
# Build the frame on the index of the source data, so each prediction stays
# paired with its own row by position. Concatenating a freshly indexed
# pd.Series of predictions aligns on index labels instead, which is only
# correct while the source frame has a default 0..n-1 index.
new_df = pd.DataFrame(
    {
        'text': new_data_2_x,
        'actual': new_data_2_y,
        'predicted': new_data_2_predicted,
    },
    index=new_data_2_x.index,
)
# Keep only the rows where the predicted language differs from the actual one
errors_df = new_df[new_df['actual'] != new_df['predicted']]
print(f"Misclassified : {len(errors_df)} of {len(new_df)}")
errors_df
