In [4]:
import numpy as np
import pickle
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
import re

In [5]:
# Inputs for this run: the sentence to translate, then the source and target
# language codes (upper-case; the source code is also used to build a file name).
sent = "I lost my passport;*?"
from_language, to_language = "EN", "PT"


In [6]:
# Load the sentence table for the source language from its per-language pickle
# (../data/sentences_<from_language>.pkl). The context manager closes the file
# even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('../data/sentences_' + from_language + '.pkl', 'rb') as pkl_file:
    from_language_sentences = pickle.load(pkl_file)

# Normalise the query sentence: strip punctuation, trim surrounding whitespace,
# and lower-case it. The pattern is a raw string so the regex escapes are not
# interpreted by Python first (the non-raw form relied on invalid escape
# sequences such as "\{", which newer Python versions warn about).
sent = re.sub(r"""[!@#$%;:.?()"'’,^{}\[\]|\\/<>=`~*&]""", '', sent).strip().lower()

# Corpus layout: index 0 is the query sentence; indices 1.. are the stored
# sentences of the source language, in the table's row order.
corpus = [sent] + from_language_sentences['text'].tolist()


In [7]:
# Fit a TF-IDF model on the whole corpus (query at row 0 included) and
# vectorise every entry; the printed shape is (rows, columns) of that matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
print(tfidf_matrix.shape)


(204, 319)


In [8]:
# Find the stored sentence closest to the query by cosine similarity of the
# TF-IDF vectors.
from sklearn.metrics.pairwise import cosine_similarity

# Row 0 of the matrix is the query; rows 1.. are the stored sentences.
# cosine_similarity returns a 2-D array with a single row here, so flatten it
# into a 1-D vector holding one score per stored sentence.
c_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).ravel()

# Position of the best score. The corpus index is shifted by one because the
# query itself sits at corpus[0].
index_max = int(np.argmax(c_similarity))
print("The closest sentence match is: " + corpus[index_max+1])

# corpus[1:] was built from from_language_sentences['text'] in row order, so
# the same position selects the matching row directly. Looking the row up by
# text instead would yield several IDs when a sentence text is duplicated,
# and int() on a multi-element array raises.
sentence_id = int(from_language_sentences['sentence_id'].iloc[index_max])
print("That sentence is sentence ID: " + str(sentence_id))

# Format as a percentage with two decimals. Rounding first and multiplying by
# 100 afterwards can print floating-point noise (e.g. 57.74000000000001%).
percent_match = "{:.2%}".format(c_similarity[index_max])
print("The value of the cosine similarity for the sentences is: " + percent_match)

The closest sentence match is: i have lost my passport
That sentence is sentence ID: 801
The value of the cosine similarity for the sentences is: 91.84%


In [9]:
# Load the translations table from its pickle. The context manager closes the
# file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('../data/translations.pkl', 'rb') as pkl_file:
    translations_df = pickle.load(pkl_file)

# Preview the first rows to show the available columns.
translations_df.head()

Unnamed: 0,input_sentence_id,input_language_key,input_text,output_text,output_sentence_id,output_language_key
0,801,EN,I have lost my passport,Eu perdi meu passaporte,802,PT
1,801,EN,I have lost my passport,मैंने अपना पासपोर्ट खो दिया है,803,HI
2,801,EN,I have lost my passport,mainne apana paasaport kho diya hai,804,HI
3,802,PT,Eu perdi meu passaporte,मैंने अपना पासपोर्ट खो दिया है,803,HI
4,802,PT,Eu perdi meu passaporte,mainne apana paasaport kho diya hai,804,HI


In [10]:
# Keep only the rows that translate the matched sentence into the requested
# target language. Each condition is built as its own boolean mask so the
# combined filter reads plainly.
is_target_language = translations_df['output_language_key'] == to_language
is_matched_sentence = translations_df['input_sentence_id'] == sentence_id
translations = translations_df.loc[is_target_language & is_matched_sentence]
translations

Unnamed: 0,input_sentence_id,input_language_key,input_text,output_text,output_sentence_id,output_language_key
0,801,EN,I have lost my passport,Eu perdi meu passaporte,802,PT


In [11]:
# Source-language text of the first matching translation row.
translations['input_text'].iloc[0]

'I have lost my passport'

In [12]:
# Target-language text of the first matching translation row.
translations['output_text'].iloc[0]

'Eu perdi meu passaporte'

In [13]:
# Ask googletrans for a machine translation of the input sentence, presumably
# as a cross-check against the stored translation shown above.
# NOTE(review): `sent` was reassigned in an earlier cell to its
# punctuation-stripped, lower-cased form, so that cleaned text is what gets
# translated here (hence a lower-case result).
# NOTE(review): this presumably needs network access, and the language codes
# are passed in upper case -- confirm both against the installed googletrans version.
from googletrans import Translator
translator = Translator()
translated = translator.translate(sent, src=from_language, dest=to_language)
translated.text

'eu perdi meu passaporte'