In [48]:
# !pip install pandas
# !pip install numpy 
# !pip install scikit-learn


In [49]:
import pandas as pd
import numpy as np
import tqdm

In [50]:
#import dataset
# Reads the CSV from the current working directory into a DataFrame.
# Later cells rely on its 'Text' and 'language' columns and drop an 'Unnamed: 0' column.
data = pd.read_csv('language_dataset.csv')

In [None]:
#check for the shape of dataset and missing values
# A notebook cell only displays its last expression, so the shape is printed
# explicitly; the per-column null counts remain the cell's displayed result.
print(data.shape)
data.isnull().sum()

In [52]:
#drop unnecessary column
# Reassign instead of dropping in place, and tolerate an already-missing column,
# so re-running this cell does not raise KeyError once 'Unnamed: 0' is gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.columns

Index(['Text', 'language'], dtype='object')

In [None]:
#check for number of observations per language
# Counts the rows for each distinct value of the 'language' column, which is
# the target column the later train/test split is stratified on.
data['language'].value_counts()

#### Feature Extraction

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [55]:
#separate X and Y features
# Double brackets keep both selections as single-column DataFrames (not Series);
# later cells index them by column name, e.g. x_data['Text'] and y_train['language'].
x_data = data[['Text']]
y_data = data[['language']]

In [56]:
#extract X features and check new shape
# Token counts over unigrams and bigrams (ngram_range=(1,2)); the result is a
# documents x vocabulary matrix whose shape is displayed as the cell output.
# NOTE(review): the vocabulary is fit on the full corpus here, before the
# train/test split in the next cell, so test documents contribute to the
# vocabulary. Consider fitting on the training text only.
vectorizer = CountVectorizer(ngram_range=(1,2))
x_data_vectorized = vectorizer.fit_transform(x_data['Text'])
x_data_vectorized.shape

(22000, 5239614)

In [57]:
#split X and Y data with balanced number of observations in target variable
# 70/30 train/test split. stratify=y_data keeps each language's proportion the
# same in both parts; random_state fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_data_vectorized, y_data,
                                                     test_size= 0.3, 
                                                     random_state=34, 
                                                     stratify = y_data
                                                     )

#### Model Training

In [58]:
#import all modules
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report,f1_score
from tqdm import tqdm

In [None]:
#hyperparameter tuning for RF classifier
# "sqrt" replaces the former "auto" option. For classifiers "auto" meant
# sqrt(n_features); it was deprecated in scikit-learn 1.1 and removed in 1.3,
# where passing it raises an invalid-parameter error. The search space is unchanged.
param_grid = {'max_depth': [5, 8, None],
              'max_features' : [5, 7, "sqrt"],
              'n_estimators': [50, 100, 150],
              'min_samples_split': [2, 8]
              }

# Exhaustive search: 3 * 3 * 3 * 2 = 54 combinations, each scored with 5-fold
# cross-validation on the training split only.
new_rf = RandomForestClassifier(random_state= 12)
cv = GridSearchCV(estimator=new_rf, 
                  param_grid= param_grid,
                    cv= 5)
cv.fit(x_train, y_train['language'])
# Displayed as the cell output: the estimator refit with the best-scoring combination.
cv.best_estimator_

In [59]:
#specify model dictionary
# Maps a display name to an unfitted estimator; the evaluation loop in the next
# cell fits each one on the training split and prints its name with its metrics.
# NOTE(review): the RF hyperparameters presumably come from the grid search
# above (cv.best_estimator_) — confirm against that cell's output.
models = {
     'Logistic Regression' : LogisticRegression(random_state=45),
     'Multinomial NB' : MultinomialNB(),
     'SVC' : SVC(random_state=21),
     'RF Classifier' : RandomForestClassifier(max_features=5,
                                              min_samples_split=8,
                                              n_estimators=150,
                                              random_state=12)
}


In [None]:
# Fit every candidate model on the training split and report its metrics on the
# test split. Labels are passed as a 1-D Series, matching y_train['language'].
y_true = y_test['language']

# Iterate the dict directly instead of rebuilding list(models.values()) and
# list(models.keys()) on every pass just to index them by position.
for model_name, model_object in tqdm(models.items()):
    #fit model (fit returns the fitted estimator itself)
    model = model_object.fit(x_train, y_train['language'])

    #predict and collect metrics
    # average="weighted": per-class scores averaged with each class weighted by its support
    y_predicted = model.predict(x_test)
    model_accuracy = accuracy_score(y_true, y_predicted)
    model_precision = precision_score(y_true, y_predicted, average= "weighted")
    model_recall = recall_score(y_true, y_predicted, average= "weighted")
    model_f1_score = f1_score(y_true, y_predicted, average= "weighted" )

    #print metrics
    print(model_name)
    print(f"Accuracy : {model_accuracy}")
    print (f"Precision: {model_precision}")
    print(f"Recall : {model_recall}")
    print(f"F1 score : {model_f1_score}" )
    print('=' *35)
    print('\n')

### Fit final model

In [None]:
# Fit a fresh Multinomial Naive Bayes on the training split as the final model
# and print the classification report for its predictions on the test split.
# MNB_model is the fitted estimator that predict_dataset and predict_text use.
MNB = MultinomialNB()
MNB_model = MNB.fit(x_train, y_train['language'])
MNB_lang_predict = MNB_model.predict(x_test)
MNB_lang_performance = classification_report(y_test, MNB_lang_predict)
print(MNB_lang_performance)

### Testing the model on a new dataset

In [77]:
#function to predict a dataframe of text
def predict_dataset(dataset):
    """Score the fitted Naive Bayes model on a labelled file and print the accuracy.

    Parameters
    ----------
    dataset : str
        Path of the file WITHOUT its extension. ``dataset + '.csv'`` is tried
        first, then ``dataset + '.txt'``; both are parsed as comma-delimited
        text with ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        If neither candidate file exists.

    Notes
    -----
    The file must contain a ``text`` column (inputs) and a ``language`` column
    (true labels). Relies on the notebook-level ``vectorizer`` and
    ``MNB_model`` objects created in earlier cells. The accuracy is printed;
    nothing is returned.
    """
    new_test_data = None
    for extension in ('.csv', '.txt'):
        try:
            new_test_data = pd.read_csv(dataset + extension, delimiter=",")
            break
        except FileNotFoundError:
            continue

    if new_test_data is None:
        raise FileNotFoundError(
            f"No dataset found: tried '{dataset}.csv' and '{dataset}.txt'"
        )

    # Scoring runs for whichever file was loaded. Previously this code sat inside
    # the '.txt' fallback branch, so a dataset found as '.csv' was never evaluated.
    new_data_x = new_test_data['text']
    new_data_y = new_test_data['language']
    # transform (not fit_transform): reuse the vocabulary already learned by the vectorizer
    new_data_vectorized = vectorizer.transform(new_data_x)
    new_predictions_NB = MNB_model.predict(new_data_vectorized)
    accuracy = accuracy_score(new_data_y, new_predictions_NB)
    print(accuracy)
      

In [78]:
#predict new dataset
# The name is passed without an extension; predict_dataset appends '.csv' and
# '.txt' itself when looking for the file.
predict_dataset('multilingual-100')


0.8484848484848485


In [17]:
#function to predict a single text
def predict_text(text):
    """Print the language predicted for one raw string.

    The string is wrapped in a one-element list before being passed to the
    notebook-level ``vectorizer``; the printed value is whatever
    ``MNB_model.predict`` returns for that single row. Nothing is returned.
    """
    features = vectorizer.transform([text])
    print(MNB_model.predict(features))

In [84]:
# Try the single-text helper on one French sentence.
text = "La musique adoucit les mœurs et inspire l'âme"
predict_text(text)

['French']
