In [9]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
import pymorphy2
import gensim
from gensim.utils import simple_preprocess
from gensim import corpora
import spacy
from gensim.models import LdaModel, LdaMulticore

def remove_stopwords(words):
    return [word for word in words if word not in stopwords.words('english')]

def prepare_text(text):
    return [remove_stopwords(gensim.utils.simple_preprocess(str(sentence), deacc=True)) \
               for sentence in text]

In [4]:
task_text = """
- Hi Jacob, how are you? 
- I'm fine, Nathan, how are you, Joseph? 
- I'm all good, Jacob, how are you, Natalie? 
- I'm okay, Joseph, how are you, Nikolas?
- Nothing is wrong, Natalie, how are you, Grian?
- Everything is pretty good, Nikolas,  how are you, Oliver?
- Fantastic, Grian! How are you, Mathew?
- It's never been better, Oliver, how are you, Minerva?
- Great as always, Mathew, how are you, Oleg?
"""

task_text = [gensim.utils.simple_preprocess(str(sentence), deacc=True) for sentence in nltk.sent_tokenize(task_text)]
task_bigram = gensim.models.phrases.Phrases(task_text, min_count=3, threshold=4)
task_trigram = gensim.models.phrases.Phrases(task_bigram[task_text], min_count=3, threshold=4)
task_text = [task_trigram[task_bigram[sent]] for sent in task_text]
print(task_text)

[['hi', 'jacob', 'how_are_you'], ['fine', 'nathan', 'how_are_you', 'joseph'], ['all', 'good', 'jacob', 'how_are_you', 'natalie'], ['okay', 'joseph', 'how_are_you', 'nikolas'], ['nothing', 'is', 'wrong', 'natalie', 'how_are_you', 'grian'], ['everything', 'is', 'pretty', 'good', 'nikolas', 'how_are_you', 'oliver'], ['fantastic', 'grian'], ['how_are_you', 'mathew'], ['it', 'never', 'been', 'better', 'oliver', 'how_are_you', 'minerva'], ['great', 'as', 'always', 'mathew', 'how_are_you', 'oleg']]


In [5]:
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
nlp = spacy.load('en', disable=['parser', 'ner'])

In [6]:
with open('politics.txt', 'r') as file:
    politics_text = file.read()
politics_text = nltk.sent_tokenize(politics_text)
politics_text = prepare_text(politics_text)
politics_bigram = gensim.models.phrases.Phrases(politics_text, min_count=3, threshold=10)
politics_text_bigram = [politics_bigram[sent] for sent in politics_text]
politics_allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
politics_lemmatized = [[token.lemma_ for token in nlp(" ".join(sentence)) \
                      if token.pos_ in politics_allowed_postags] for sentence in politics_text_bigram]

In [7]:
politics_LDA_dictionary = corpora.Dictionary(politics_lemmatized)
politics_LDA_corpus = [politics_LDA_dictionary.doc2bow(sent) for sent in politics_lemmatized]
politics_LDA_model = LdaMulticore(corpus=politics_LDA_corpus,
                         id2word=politics_LDA_dictionary,
                         num_topics=20, #количество тем
                         passes=10,
                         chunksize=100, # количество подсписков используемых в каждом проходе
                         iterations=100, 
                         gamma_threshold=0.001,
                         per_word_topics=True)
politics_LDA_model.save('politics_LDA_model.model')

In [8]:
politics_LDA_model.print_topics(-1)

[(0,
  '0.091*"police" + 0.078*"call" + 0.041*"hold" + 0.037*"action" + 0.030*"still" + 0.027*"figure" + 0.026*"measure" + 0.025*"rise" + 0.024*"aid" + 0.021*"debate"'),
 (1,
  '0.074*"say" + 0.037*"see" + 0.035*"want" + 0.030*"leader" + 0.030*"return" + 0.027*"government" + 0.021*"give" + 0.020*"however" + 0.019*"meet" + 0.019*"change"'),
 (2,
  '0.068*"say" + 0.036*"problem" + 0.031*"end" + 0.025*"hope" + 0.025*"long" + 0.023*"service" + 0.021*"follow" + 0.021*"limit" + 0.020*"much" + 0.017*"live"'),
 (3,
  '0.077*"election" + 0.048*"school" + 0.032*"go" + 0.030*"happen" + 0.025*"say" + 0.022*"back" + 0.021*"vote" + 0.021*"comment" + 0.020*"state" + 0.019*"labour"'),
 (4,
  '0.053*"year" + 0.038*"say" + 0.033*"world" + 0.033*"question" + 0.032*"concern" + 0.032*"political" + 0.025*"number" + 0.023*"run" + 0.021*"home" + 0.019*"casino"'),
 (5,
  '0.066*"think" + 0.045*"ask" + 0.042*"people" + 0.042*"come" + 0.036*"tell" + 0.035*"game" + 0.035*"force" + 0.028*"say" + 0.024*"well" + 0.0

In [10]:
text = "In my head, a review about the M. Video store chain has long been ripening. Since the very time when I again faced with the problem of returning money for a certificate of the Additional Services Program. I already wrote a review about the work of Eldorado, in fact, faced with the same problem, but for the first time. True, in Eldorado this lure is called the Additional Service Program, but the difference is small. Actually, to this day I think that M-video is a bad store, but due to the small selection of those in the city, sometimes I still get something there. pah-pah, while it works. However, today I want to share my opinion about M. Video. I will tell you about all the problems of this place. This is the second largest electronics store that we have in the city. A wide range of products is presented here, although it is often impossible to find a specific model. The store has promotions, discounts, a system of accumulative bonuses, credit purchases and other rubbish, which is called a service. However, there are many problems with all promotions. Once again, we got into such a bad situation. Firstly, the service in the store was terrible. And the quality of the goods was bad. Two years ago, they threw themselves off, threw themselves in and presented mom with a new TV set that had broken. It was very disappointing that the TV broke down, since we paid a huge amount for it. Not a single visit ended without a problem. However, when they bought Dad, they again drew this one and imposed the acquisition of a certificate of the Additional Services Program. So roughly (this is a copy) this stucco looks and costs decent money. The attraction is that at the end of the certificate validity period, money for unused services will be returned. But all this lies, with a certificate you get even more problems! Do not believe in stocks and do not go to stores with a bad reputation, do not create problems for yourself!"

In [11]:
t = [i for i in nltk.word_tokenize(text) if i not in string.punctuation and i not in stopwords.words('english') ]
nltk.FreqDist(t).most_common(5)

[('I', 6), ('store', 5), ('certificate', 4), ('bad', 4), ('problems', 4)]

In [12]:
Pushkin = '''
Мороз и солнце; день чудесный!
Еще ты дремлешь, друг прелестный —
Пора, красавица, проснись:
Открой сомкнуты негой взоры
Навстречу северной Авроры,
Звездою севера явись!

Вечор, ты помнишь, вьюга злилась,
На мутном небе мгла носилась;
Луна, как бледное пятно,
Сквозь тучи мрачные желтела,
И ты печальная сидела —
А нынче погляди в окно:

Под голубыми небесами
Великолепными коврами,
Блестя на солнце, снег лежит;
Прозрачный лес один чернеет,
И ель сквозь иней зеленеет,
И речка подо льдом блестит.
'''

Mayakovsky = '''
Послушайте!
Ведь, если звезды зажигают —
значит — это кому-нибудь нужно?
Значит — кто-то хочет, чтобы они были?
Значит — кто-то называет эти плевочки

жемчужиной?
И, надрываясь
в метелях полуденной пыли,
врывается к богу,
боится, что опоздал,
плачет,
целует ему жилистую руку,
просит —
чтоб обязательно была звезда! —
клянется —
не перенесет эту беззвездную муку!
А после
ходит тревожный,
но спокойный наружно.
Говорит кому-то:
«Ведь теперь тебе ничего?
Не страшно?
Да?!»
Послушайте!
Ведь, если звезды
зажигают —
значит — это кому-нибудь нужно?
Значит — это необходимо,
чтобы каждый вечер
над крышами
загоралась хоть одна звезда?!
'''

Lermontov = '''
Сквозь волнистые туманы
Пробирается луна,
На печальные поляны
Льет печально свет одна.
По дороге зимней, скучной
Тройка борзая бежит,
Колокольчик однозвучный
Утомительно гремит.
Что-то слышится родное
В долгих песнях ямщика:
То разгулье удалое,
То сердечная тоска.
'''


In [13]:
morph = pymorphy2.MorphAnalyzer()

In [14]:
def word_class(author):
    words = [i for i in nltk.word_tokenize(author) if i not in string.punctuation and i not in stopwords.words('russian') ]
    lst = []
    for i in words:
        word = morph.parse(i)[0]
        lst.append(word.tag.POS) 
    return nltk.FreqDist(lst).most_common(1)

In [15]:
word_class(Lermontov)

[('ADJF', 11)]