##### Установка сторонних модулей через pip

In [280]:
pip install nltk numpy pymorphy2

Note: you may need to restart the kernel to use updated packages.


##### Импорт библиотек

In [281]:
import json
import re
import nltk
import pymorphy2
import pickle
import pandas as pd

nltk.download('punkt');
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [282]:
PATH = './data/offers/data_original.json'
PATH_RESULT = './data/offers/data.json'
LANGUAGE = 'russian'
    
morph = pymorphy2.MorphAnalyzer()
stopwords = nltk.corpus.stopwords.words(LANGUAGE)
stemmer = nltk.stem.snowball.SnowballStemmer(LANGUAGE)

##### Удаление HTML тегов

In [283]:
def removeHtml(html):
    teg_re = re.compile(r'<[^>]+>')
    return teg_re.sub(' ', html)

##### Удаление demoji

In [284]:
def removeEmoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

##### Удаление всего кроме букв русских

In [285]:
def removeSpecialChar(text):
    teg_re = re.compile(r'[^А-Яа-я]+')
    return teg_re.sub(' ', text)

##### Несет ли смысловую нагрузку слово

In [286]:
def isMeaningWord(string):
    p = morph.parse(val)[0]
    if p.tag.POS == 'INTJ' or p.tag.POS == 'PRCL' or p.tag.POS == 'CONJ' or p.tag.POS == 'PREP' or p.tag.POS == 'PRED' or p.tag.POS == 'NPRO':
        return False
    return True

In [287]:
def tokenizeText(text):
    morph = pymorphy2.MorphAnalyzer()
    text = removeHtml(text)
    text = removeEmoji(text)
    text = removeSpecialChar(text)
    sentences = nltk.word_tokenize(text, language=LANGUAGE)
    tokens = []
    for val in sentences:
        val = val.lower()
        if val in stopwords:
            continue
        if isMeaningWord(val) == False:
            continue
        val = stemmer.stem(val)
        tokens.append(val)
    return ' '.join(tokens)

In [288]:
%%time
tagsFile = []
file = open(PATH, 'r')
line = file.readline()
while line:
    try:
        loaded_json = json.loads(line)
        tags = loaded_json['Tags']
        if tags:
            for val in tags:
                tagsFile.append(val)
    except KeyError as e:
        tagsFile.append('Unknown')
    line = file.readline()
file.close()

CPU times: user 103 ms, sys: 2.08 ms, total: 105 ms
Wall time: 110 ms


In [289]:
uniqueTags = set(tagsFile)
uniqueTags
print("Список тегов: " + str(uniqueTags))

Список тегов: {'Stylist', 'Visagiste', 'Unknown', 'Skilled', 'Advertising', 'Clip', 'Мale', 'Fashionshow', 'Videoshooting', 'Male', 'Photographer', 'Model', 'TVShow', 'Female', 'Photoshooting', 'Casting'}


In [290]:
%%time
dataFile = []
file = open(PATH, 'r')
line = file.readline()
while line:
    append = {}
    for val in uniqueTags:
        append[val] = 0
        
    loaded_json = json.loads(line)
    description = loaded_json['Description']
    description = tokenizeText(description)
    append['Text'] = description
    append['Unknown'] = 1
    try:
        tags = loaded_json['Tags']
        if tags:
            for val in tags:
                append[val] = 1
                append['Unknown'] = 0
    except KeyError as e:
        pass   
    dataFile.append(append)
    line = file.readline()
file.close()

CPU times: user 3min 44s, sys: 11.4 s, total: 3min 55s
Wall time: 3min 55s


In [291]:
with open(PATH_RESULT, 'w') as out:
    out.write( json.dumps(dataFile, ensure_ascii=False))

In [275]:
dataRead = []
with open(PATH_RESULT, 'r', encoding='utf-8') as jsonfile:    
    dataRead = json.load(jsonfile)