
##### https://www.tutorialspoint.com/python_text_processing/python_synonyms_and_antonyms.htm

In [7]:
pip install translate

Collecting translate
  Downloading https://files.pythonhosted.org/packages/85/b2/2ea329a07bbc0c7227eef84ca89ffd6895e7ec237d6c0b26574d56103e53/translate-3.5.0-py2.py3-none-any.whl
Collecting tox
[?25l  Downloading https://files.pythonhosted.org/packages/c6/36/b0e6016724ae598ba30168618a91d3ee2d7510c20cb05deb8cdc3f7d53b3/tox-3.15.2-py2.py3-none-any.whl (137kB)
[K     |████████████████████████████████| 143kB 8.3MB/s 
[?25hCollecting pre-commit
[?25l  Downloading https://files.pythonhosted.org/packages/50/38/1e3387cf2621e560c59364f01d3c0b8443ee0c07a4c003502f07a8f372df/pre_commit-2.5.1-py2.py3-none-any.whl (171kB)
[K     |████████████████████████████████| 174kB 12.0MB/s 
Collecting six>=1.14.0
  Downloading https://files.pythonhosted.org/packages/ee/ff/48bde5c0f013094d729fe4b0316ba2a24774b3ff1c52d924a8a4cb04078a/six-1.15.0-py2.py3-none-any.whl
Collecting virtualenv!=20.0.0,!=20.0.1,!=20.0.2,!=20.0.3,!=20.0.4,!=20.0.5,!=20.0.6,!=20.0.7,>=16.0.0
[?25l  Downloading https://files.pythonhos

In [8]:
# Mount Google Drive at /content/drive so the data folder is reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
from bson.objectid import ObjectId
import pandas as pd
import os

In [10]:
# Folder on the mounted Drive that holds the input text and receives the generated CSV.
data_path = '/content/drive/My Drive/data/'

In [11]:
class Word():
  """One distinct word of a book together with its annotations.

  All fields are created on the *instance* in ``__init__``, so
  ``vars(instance)`` / ``instance.__dict__`` always exposes the complete set
  of fields, even for a freshly constructed ``Word()``.

  Attributes:
    word: the word itself.
    stemming: stem of ``word``.
    pos_tag: part-of-speech tag of ``word``.
    tag_desc: human-readable description of ``pos_tag``.
    count: number of occurrences of ``word``.
    book_id: identifier of the book the word came from (``None`` until set).
    translate: translation of ``word``.
    is_training: boolean flag, ``False`` by default.
    is_learned: boolean flag, ``False`` by default.
  """

  def __init__(self, word='', stemming='', pos_tag='', tag_desc='', count=0,
               book_id=None, translate='', is_training=False, is_learned=False):
    # The assignment order fixes the key order of ``__dict__`` and therefore
    # the column order of any DataFrame built from it.
    self.word = word
    self.stemming = stemming
    self.pos_tag = pos_tag
    self.tag_desc = tag_desc
    self.count = count
    self.book_id = book_id
    self.translate = translate
    self.is_training = is_training
    self.is_learned = is_learned

  def __repr__(self):
    return f'{self.word} - {self.count}\nstemming: {self.stemming}\npos_tag: {self.pos_tag}\ntag_desc: {self.tag_desc}\n\n'

### Work with human language data (NLTK)

In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tag import pos_tag

from translate import Translator

In [13]:
# Fetch the NLTK resources used below: tokenizer models, stop-word lists and the POS tagger.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

Define the constants used for text processing: POS-tag descriptions, stop words and punctuation.

In [14]:
# Human-readable descriptions for Penn Treebank-style part-of-speech tags.
# The '' and '.' keys map "no tag" and sentence punctuation to 'none'.
tags_desc = {
    'CC': 'coordinating conjunction',
    'CD': 'cardinal digit',
    'DT': 'determiner',
    'EX': 'existential there (like: “there is” … think of it like “there exists”)',
    'FW': 'foreign word',
    'IN': 'preposition/subordinating conjunction',
    'JJ': 'adjective ‘big’',
    'JJR': 'adjective, comparative ‘bigger’',
    'JJS': 'adjective, superlative ‘biggest’',
    'LS': 'list marker 1)',
    'MD': 'modal could, will',
    'NN': 'noun, singular ‘desk’',
    'NNS': 'noun plural ‘desks’',
    'NNP': 'proper noun, singular ‘Harrison’',
    'NNPS': 'proper noun, plural ‘Americans’',
    'PDT': 'predeterminer ‘all the kids’',
    'POS': 'possessive ending parent’s',
    'PRP': 'personal pronoun I, he, she',
    'PRP$': 'possessive pronoun my, his, hers',
    'RB': 'adverb very, silently,',
    'RBR': 'adverb, comparative better',
    'RBS': 'adverb, superlative best',
    'RP': 'particle give up',
    # SYM was missing from the table, so symbol tokens got an empty description.
    'SYM': 'symbol',
    'TO': 'to go ‘to’ the store.',
    'UH': 'interjection, errrrrrrrm',
    'VB': 'verb, base form take',
    'VBD': 'verb, past tense took',
    'VBG': 'verb, gerund/present participle taking',
    'VBN': 'verb, past participle taken',
    'VBP': 'verb, sing. present, non-3d take',
    'VBZ': 'verb, 3rd person sing. present takes',
    'WDT': 'wh-determiner which',
    'WP': 'wh-pronoun who, what',
    'WP$': 'possessive wh-pronoun whose',
    # Fixed typo: this used to read 'wh-abverb'.
    'WRB': 'wh-adverb where, when',
    '': 'none',
    '.': 'none',
}

In [15]:
# English stop words from the NLTK corpus (needs the 'stopwords' resource downloaded first).
stop_words = stopwords.words('english')

In [16]:
# Tokens that are pure punctuation or symbols; used to filter the token stream.
punctuations = [
    '(', ')', ';', ':', '[', ']', '!', '?', ',', '!',
    '=', '==', '<', '>', '@', '#', '$', '%', '^', '&',
    '*', '.', '//', '{', '}', '...', '``', '+', "''",
]

### Read the file into a string

In [17]:
# Text file to analyse. os.path.join keeps the path valid whether or not
# data_path ends with a path separator.
file_name = 'The Rules of Work.txt'
file_path = os.path.join(data_path, file_name)

In [18]:
# Read the whole book into memory. The encoding is stated explicitly so the
# result does not depend on the platform's default locale.
# NOTE(review): assumes the text file is UTF-8 encoded — confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    data = file.read()

### Parse the text into `Word` objects

In [19]:
def encode_word(word):
    """Lower-case ``word`` and drop every character outside the ASCII range."""
    lowered = word.lower()
    ascii_bytes = lowered.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')


In [20]:
# Split the raw text into tokens with NLTK's word tokenizer.
tokens = word_tokenize(data)

In [21]:
# Normalise every token *before* filtering it. Previously the checks ran on the
# raw token, so capitalised stop words ("The", "And") were kept because the
# stop-word list is lower-case, and tokens that shrank or became empty once
# non-ASCII characters were stripped still passed the length test.
# Sets make each membership test O(1) instead of scanning a list.
stop_word_set = set(stop_words)
punctuation_set = set(punctuations)
tokens = [
    word
    for word in (encode_word(raw) for raw in tokens)
    if word not in stop_word_set
    and word not in punctuation_set
    and not word.isnumeric()
    and len(word) > 2
]

In [22]:
# Count how often each remaining token occurs and list the result as (word, count) pairs.
fdist = FreqDist(tokens)
most_common = fdist.most_common()

In [23]:
def stemming_word(word):
    """Return the Porter stem of ``word``."""
    return PorterStemmer().stem(word)

In [24]:
def freq_to_obj(word, freq, book_id, translator=None):
    """Build a ``Word`` record for one entry of the frequency table.

    Args:
        word: the word text; a falsy value yields an untouched ``Word()``.
        freq: number of occurrences of ``word``.
        book_id: identifier of the book the word belongs to.
        translator: optional object exposing ``translate(text)``. When omitted,
            a ``Translator(to_lang="uk")`` is created for this call, which is
            the previous behaviour. Passing one in lets callers reuse a single
            translator for many words.

    Returns:
        A ``Word`` carrying the text, stem, POS tag, tag description, count,
        book id and translation; ``is_training`` and ``is_learned`` are False.
    """
    if not word:
        return Word()

    # pos_tag expects a *sequence of tokens*. Given a bare string it iterates
    # over the characters and tags each letter, so the stored tag used to be
    # the tag of the word's first letter. Wrap the word in a one-element list
    # and tag it once.
    tag = pos_tag([word])[0][1]
    desc = tags_desc.get(tag, '')
    stemming = stemming_word(word)

    if translator is None:
        translator = Translator(to_lang="uk")
    translate = translator.translate(word)

    wordObj = Word()
    wordObj.word = word
    wordObj.stemming = stemming
    wordObj.pos_tag = tag
    wordObj.tag_desc = desc
    wordObj.count = freq
    wordObj.book_id = book_id
    wordObj.translate = translate
    wordObj.is_training = False
    wordObj.is_learned = False

    return wordObj

In [25]:
# Identifier attached to every word extracted from this book.
book_id = 1

In [26]:
# Turn each (word, count) pair into a Word record. freq_to_obj translates every
# word — presumably one remote request each — so expect this cell to be slow.
words = [freq_to_obj(word, freq, book_id) for word, freq in most_common]

### Convert data into dataframe and save it

In [27]:
# One row per Word; the columns are the instance attributes of each record.
df = pd.DataFrame([vars(entry) for entry in words])

In [28]:
# Name the CSV after the source text: same data folder, same stem, '.csv' extension.
# os.path.join keeps the path valid whether or not data_path ends with a separator.
base = os.path.basename(file_path)
csv_name = os.path.splitext(base)[0]
csv_path = os.path.join(data_path, csv_name + '.csv')

In [29]:
# Write the word table to the CSV path derived from the book's file name.
df.to_csv(csv_path)

In [30]:
#pip install translate