## Повторяем все операции с семинара

In [2]:
import os
# Location of the local Mallet installation.
# Adjust both values to match the Mallet directory on your system.
os.environ['MALLET_HOME'] = r'C:/mallet-2.0.8/'
mallet_path = r'C:/mallet-2.0.8/bin/mallet'

In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this


import warnings
# Ignore every warning whose category is DeprecationWarning.
warnings.filterwarnings("ignore",category=DeprecationWarning)

from nltk.corpus import stopwords
# NLTK's English stop-word list plus a few extra tokens to drop from the corpus.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [5]:
# Source of the newsgroups dataset (JSON), read directly over HTTP.
NEWSGROUPS_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
df = pd.read_json(NEWSGROUPS_URL)


['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


In [6]:
# Work on the message bodies as a plain list of strings.
data = df.content.values.tolist()

# The patterns below are raw strings so that regex escapes such as \S and \s
# are not parsed as (invalid) Python string escapes, which newer interpreters
# warn about.

# Remove e-mail addresses (and one trailing whitespace character, if any)
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse new lines and other whitespace runs into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("'", "", sent) for sent in data]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [7]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of documents; each item is converted with ``str()``
            before tokenization.

    Yields:
        The token list produced by ``simple_preprocess`` for each sentence.
    """
    for raw_text in sentences:
        # deacc=True strips accent marks; punctuation is already dropped by the
        # tokenizer itself.
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Materialize the generator: one token list per document.
data_words = [tokens for tokens in sent_to_words(data)]

# Show the tokens of the first document.
print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [8]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [9]:
def remove_stopwords(texts):
    """Re-tokenize each document and drop tokens listed in ``stop_words``.

    Args:
        texts: Iterable of documents; each one is passed through ``str()`` and
            ``simple_preprocess`` before filtering.

    Returns:
        A list with one filtered token list per input document.
    """
    # Build the lookup set once per call: testing membership against the
    # module-level stop_words list would be a linear scan for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to every document.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for each document, in
        input order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document is joined with spaces, run through the module-level spaCy
    pipeline ``nlp``, and reduced to the lemmas of the tokens whose ``pos_``
    is in ``allowed_postags``.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of tokenized documents (iterables of strings).
        allowed_postags: Container of part-of-speech tags to keep. The default
            is an immutable tuple so it cannot be mutated between calls.

    Returns:
        A list with one list of lemmas per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Remove stop words from the tokenized documents
data_words_nostops = remove_stopwords(data_words)

# Merge frequent word pairs into single bigram tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the spaCy 'en_core_web_sm' pipeline with the parser and NER components
# disabled (for efficiency). The model has to be installed beforehand, e.g.:
# python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the lemmas of the first document
print(data_lemmatized[:1])

[['where', 'thing', 'car', 'nntp_poste', 'host', 'park', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [11]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 5), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)]]


## Находим оптимальное количество топиков

In [12]:
def best_model(corpus, id2word, texts=None, start=2, limit=30):
    """Train one Mallet LDA model per topic count and return the most coherent.

    For every ``num_topics`` in ``range(start, limit)`` an ``LdaMallet`` model
    is trained, its ``c_v`` coherence is computed and printed, and the model
    with the highest coherence is kept. The best coherence is printed last.

    NOTE(review): ``gensim.models.wrappers`` is, as far as I know, only
    available in gensim 3.x - confirm the installed version.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaMallet``.
        id2word: Dictionary passed to ``LdaMallet`` and ``CoherenceModel``.
        texts: Tokenized documents used for the coherence score. Defaults to
            the module-level ``data_lemmatized``.
        start: First topic count to try (inclusive).
        limit: Topic count to stop at (exclusive).

    Returns:
        The trained model with the highest coherence score.

    Raises:
        ValueError: If ``range(start, limit)`` is empty, so no model can be
            trained.
    """
    if start >= limit:
        raise ValueError(
            "no topic counts to try: range(%r, %r) is empty" % (start, limit)
        )
    if texts is None:
        texts = data_lemmatized

    # Start below any possible score so the first trained model is always
    # accepted; a seed of 0 would leave the result unbound if no score beat it.
    best_cv = float('-inf')
    best = None
    for num_topics in range(start, limit):
        candidate = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word
        )
        coherence = CoherenceModel(
            model=candidate, texts=texts, dictionary=id2word, coherence='c_v'
        ).get_coherence()
        print(coherence)
        if best is None or coherence > best_cv:
            best_cv = coherence
            best = candidate
    print(best_cv)
    return best
# Keep the candidate model with the highest coherence score.
ldamallet = best_model(corpus,id2word)

0.4749060688249831
0.49751566319703144
0.5174924325861209
0.511863488006274
0.512143276382778
0.4943555338443035
0.49112633373427544
0.5188622338174146
0.5172525517755194
0.5427794608635094
0.5482295047838456
0.5491876168591102
0.5599736398176908
0.5827577578993325
0.5791856061484795
0.5796617005645573
0.5869745531346334
0.5827073440238264
0.5777759728314438
0.5744897836092532
0.5790881892679023
0.5715443700372369
0.5822979659725986
0.5834493793206097
0.5868789182011089
0.5817085342729525
0.5687047801999927
0.5824936915474814
0.5869745531346334


## Вычисляем главный топик каждого документа

In [73]:
def dominant_topics(ldamodel=ldamallet, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document.

    For each document in ``corpus`` the topic with the highest weight in
    ``ldamodel[corpus]`` is selected (the first one wins on ties).

    Args:
        ldamodel: Topic model supporting ``ldamodel[corpus]`` and
            ``show_topic(topic_id)``.
        corpus: Bag-of-words corpus the model is applied to.
        texts: One entry per document; appended as the last column.

    Returns:
        DataFrame with columns ``Dominant_Topic`` (int topic id),
        ``Perc_Contribution`` (weight rounded to 4 digits), ``Topic_Keywords``
        (comma-separated words from ``show_topic``) and an unnamed column
        holding ``texts``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # show_topic() result per topic, computed once
    records = []
    for row in ldamodel[corpus]:
        if not row:
            # Keep a placeholder so that rows stay aligned with `texts`.
            records.append((None, None, None))
            continue
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: growing a DataFrame row by row is quadratic,
    # and DataFrame.append no longer exists in pandas 2.x.
    sent_topics_df = pd.DataFrame(records, columns=column_names)
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
# `texts` is the lemmatized corpus, so the last column holds token lists
# rather than the raw message strings in `data`.
df_topic_sents_keywords = dominant_topics(
   ldamodel=ldamallet, corpus=corpus, texts=texts)

In [75]:
# Turn the positional index into an explicit document-number column and give
# all five columns their final names.
dominant_topic_columns = [
   'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'
]
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = dominant_topic_columns

In [76]:
# Display the per-document dominant-topic table.
df_dominant_topic

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,16.0,0.1318,"car, power, light, high, ground, engine, wire,...","[where, thing, car, nntp_poste, host, park, li..."
1,1,9.0,0.1912,"drive, system, card, problem, bit, driver, wor...","[si, poll, final, summary, final, call, si, cl..."
2,2,14.0,0.1474,"good, make, thing, time, bad, hear, write, lot...","[question, engineering, computer, network, dis..."
3,3,17.0,0.1125,"write, line, article, host, nntp_poste, organi...","[division, line, host, amber, write, write, ar..."
4,4,0.0,0.1283,"find, line, problem, call, time, book, number,...","[question, organization, smithsonian_astrophys..."
...,...,...,...,...,...
11309,11309,10.0,0.3302,"study, drug, result, effect, food, science, pr...","[migraine, city, ny_bis, reply, line, cheap, a..."
11310,11310,9.0,0.2037,"drive, system, card, problem, bit, driver, wor...","[problem, screen, blank, sometimes, minor, phy..."
11311,11311,16.0,0.1513,"car, power, light, high, ground, engine, wire,...","[este, mount, case, organization, mail, group,..."
11312,11312,0.0,0.2989,"find, line, problem, call, time, book, number,...","[line, nntp_poste, host, article, write, boy, ..."


## Считаем TF-IDF

In [78]:
import math
def computeIDF(documents):
    """Compute the inverse document frequency of every word in a collection.

    All documents must be passed at once, because the IDF of a word depends on
    the whole collection: ``idf(word) = log(N / df(word))``, where ``N`` is the
    number of documents and ``df(word)`` is the number of documents containing
    the word.

    Args:
        documents: Sequence of tokenized documents (iterables of hashable
            words).

    Returns:
        Dict mapping each word to its IDF. Empty when ``documents`` is empty.
    """
    doc_count = len(documents)

    # Document frequency: every word is counted at most once per document.
    doc_freq = {}
    for document in documents:
        for word in set(document):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    return {
        word: math.log(doc_count / float(freq))
        for word, freq in doc_freq.items()
    }

In [79]:
def find_tf_idf(doc, idf_dict, top_n=3):
    """Return the ``top_n`` words of a document ranked by TF-IDF.

    Term frequency is the word count divided by the document length; it is
    multiplied by the word's value in ``idf_dict``.

    Args:
        doc: Tokenized document (sequence of words).
        idf_dict: Mapping from word to IDF; must contain every word of ``doc``.
        top_n: Number of highest-scoring words to return.

    Returns:
        List of ``(word, tf_idf)`` tuples sorted by descending score. Words
        with equal scores keep their order of first occurrence in ``doc``, so
        the result does not depend on hash-randomized set ordering.

    Raises:
        KeyError: If a word of ``doc`` is missing from ``idf_dict``.
    """
    word_num = len(doc)

    # Count in first-occurrence order (dicts preserve insertion order).
    counts = {}
    for word in doc:
        counts[word] = counts.get(word, 0) + 1

    tf_idf_dict = {
        word: (count / float(word_num)) * idf_dict[word]
        for word, count in counts.items()
    }
    # sorted() is stable, so ties stay in first-occurrence order.
    sorted_tuples = sorted(tf_idf_dict.items(), key=lambda item: item[1], reverse=True)
    return sorted_tuples[:top_n]


In [80]:
# For every dominant topic, compute IDF over that topic's documents only and
# collect the top TF-IDF words of each document.
# Grouping by the topic column (keys come out in ascending order) avoids
# hard-coding the number of topics and skips topics that have no documents.
fin_list = []
for _, topic_docs in df_dominant_topic.groupby('Dominant_Topic'):
    idf_dict = computeIDF(list(topic_docs['Text']))
    for doc_id, text in zip(topic_docs['Document_No'], topic_docs['Text']):
        fin_list.append([doc_id, find_tf_idf(text, idf_dict)])

In [81]:
# One row per document: its id and its highest-scoring TF-IDF words.
column_names = ['doc_id', 'top_tf_idfs']
fin_df = pd.DataFrame(data=fin_list, columns=column_names)

Результат — датафрейм с тремя наибольшими значениями TF-IDF для каждого документа.

In [83]:
# Display the per-document top TF-IDF table.
fin_df

Unnamed: 0,doc_id,top_tf_idfs
0,4,"[(error, 0.16969958012429018), (warn, 0.121022..."
1,86,"[(ctrltest, 0.11301080502751795), (assumption,..."
2,125,"[(rotfl, 0.17925851831951123), (second, 0.1016..."
3,175,"[(font, 0.2962380488728496), (symptom, 0.25615..."
4,188,"[(phone, 0.19721471733119506), (operator, 0.16..."
...,...,...
11309,11230,"[(burning, 0.4977046781026118), (verdict, 0.41..."
11310,11232,"[(laboratory, 0.42579793682926326), (slow, 0.4..."
11311,11241,"[(business, 0.5303447783699939), (answer, 0.42..."
11312,11256,"[(loopback_connector, 0.44793421029235064), (p..."
