In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.preprocessing import LabelEncoder
import re
import spacy
from gensim.models import Word2Vec

In [None]:
from google.colab import drive

# Attach Google Drive at the standard Colab mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the language-detection CSV on the mounted Drive.
dataset_path = "/content/drive/MyDrive/WLIT/data/LanguageDetection.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Languages kept for this experiment; rows carrying any other label are dropped.
# NOTE(review): each entry must match the label spelling used in the CSV exactly.
# "Portugeese" agrees with the key used in `lang_dict` in the pre-processing
# cell; confirm against the dataset's `Language` column.
language_list = ['English', 'French', 'Italian', 'Spanish', 'Portugeese']

# .copy() yields an independent frame, so later column assignments do not
# trigger pandas' SettingWithCopyWarning on a filtered slice.
df = df.loc[df['Language'].isin(language_list)].copy()
df

***Data Wrangling***

In [None]:
# Features are the sentences, targets are their language labels.
x, y = df['Text'], df['Language']

In [None]:
# Encode the language names as integer class ids.
# The encoder is fitted on the DataFrame column rather than on `y` itself so
# the cell is idempotent: re-running it never re-encodes already-encoded
# integers, which would leave `le.classes_` holding ints instead of names.
le = LabelEncoder()
y = le.fit_transform(df['Language'])

In [None]:
# Compiled once; both patterns are applied to every row of the dataset.
_DIGITS_AND_BRACKETS = re.compile(r'[0-9\[\]]')
_LINE_BREAKS = re.compile(r'[\r\n]+')

def removeNonsense(text):
  """Normalise one raw sentence before it is vectorised.

  Digits and square brackets are deleted. Each run of line-break characters
  is replaced by a single space, so the words on either side of the break
  are not fused into one token. The result is lower-cased.

  Args:
    text: the raw sentence as a ``str``.

  Returns:
    The cleaned, lower-cased string.
  """
  text = _DIGITS_AND_BRACKETS.sub('', text)
  text = _LINE_BREAKS.sub(' ', text)
  return text.lower()

# Build a new frame holding the cleaned text instead of assigning into `df`
# in place: `df` may be a filtered slice of the loaded CSV, in which case
# in-place column assignment triggers pandas' SettingWithCopyWarning.
df = df.assign(Text=df["Text"].apply(removeNonsense))
df['Text']



In [None]:

!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm
!python -m spacy download it_core_news_sm
!python -m spacy download pt_core_news_sm
!python -m spacy download es_core_news_sm



***Text Pre-processing***

In [None]:
# One spaCy pipeline per supported language.
nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")
nlp_it = spacy.load("it_core_news_sm")
nlp_pr = spacy.load("pt_core_news_sm")
nlp_sp = spacy.load("es_core_news_sm")

# Maps a value of the `Language` column to the pipeline that should parse it.
# NOTE(review): keys must match the labels in df['Language'] exactly, including
# the dataset's own spelling; a label without an entry raises KeyError below.
lang_dict = {
    "English": nlp_en,
    "French": nlp_fr,
    "Italian": nlp_it,
    "Portugeese": nlp_pr,
    "Spanish": nlp_sp
    }

def lemmatize_doc(doc):
  """Return the lemmas of a parsed spaCy document.

  Stop words and punctuation tokens are skipped. The function has its own
  name so that it does not overwrite the string-cleaning `removeNonsense`
  helper defined in the data-wrangling cell.

  Args:
    doc: a document produced by one of the pipelines in `lang_dict`.

  Returns:
    A list with the `lemma_` of every remaining token, in document order.
  """
  return [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]

# Columns are addressed by label rather than by position, so the lookup does
# not depend on column order or on deprecated positional Series indexing.
# The lemma lists are kept so they can be inspected or reused, instead of
# being printed for every row and then discarded.
lemmatized_texts = [
  lemmatize_doc(lang_dict[language](text))
  for text, language in zip(df["Text"], df["Language"])
]

# Preview only the first few rows to keep the cell output readable.
lemmatized_texts[:5]




***Text Representation, Text -> Vector***

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the `Text` column, then turn the resulting
# bag-of-words counts into a dense array.
# NOTE(review): the dense copy can be large for a big vocabulary.
count_vectorizer = CountVectorizer()
sparse_counts = count_vectorizer.fit_transform(df['Text'])
vect_list = sparse_counts.toarray()

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 60/40 split, and therefore the trained model, is
# reproducible across kernel restarts.
SPLIT_SEED = 42
training_x, testing_x, training_y, testing_y = train_test_split(
    vect_list, y, test_size=0.40, random_state=SPLIT_SEED
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(training_x, training_y)
model

In [None]:
def predict_language(text):
  """Print the language predicted for a single piece of text.

  The text is vectorised with the fitted `count_vectorizer` and classified
  by `model`. The predicted class id is mapped back to its label with `le`.
  Nothing is returned; the label is written to stdout.
  """
  features = count_vectorizer.transform([text]).toarray()
  predicted_ids = model.predict(features)
  predicted_labels = le.inverse_transform(predicted_ids)
  print(predicted_labels[0])

# Saving the model for future use


In [35]:
!pip install joblib
import joblib

joblib.dump(model, '/content/drive/MyDrive/WLIT/model/multinomial_nb_model.pkl')




['/content/drive/MyDrive/WLIT/model/multinomial_nb_model.pkl']

Also save the Label Encoder and Count Vectorizer, because these will be used for pre-processing and prediction.

In [36]:
# Persist the fitted label encoder and vectoriser next to the model.
label_encoder_path = '/content/drive/MyDrive/WLIT/model/label_encoder.pkl'
vectorizer_path = '/content/drive/MyDrive/WLIT/model/count_vectorizer.pkl'

joblib.dump(le, label_encoder_path)
joblib.dump(count_vectorizer, vectorizer_path)

['/content/drive/MyDrive/WLIT/model/count_vectorizer.pkl']

In [37]:
# Reload the three persisted artifacts, replacing the in-memory objects.
# NOTE(review): joblib.load unpickles the file; only load artifacts you trust.
model, le, count_vectorizer = [
  joblib.load(artifact_path)
  for artifact_path in (
    '/content/drive/MyDrive/WLIT/model/multinomial_nb_model.pkl',
    '/content/drive/MyDrive/WLIT/model/label_encoder.pkl',
    '/content/drive/MyDrive/WLIT/model/count_vectorizer.pkl',
  )
]


In [40]:
# Write the versions of all installed packages to a file on Drive.
# NOTE(review): the file name is spelled "requirments.txt"; confirm whether
# "requirements.txt" was intended before renaming, since other tooling may
# already reference this exact path.
!pip freeze > '/content/drive/MyDrive/WLIT/requirments.txt'