In [3]:
import sys
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Import Dataset
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.shape) 
df['content'].head(10)

(11314, 3)


0        From: lerxst@wam.umd.edu (where's my thing)\nS...
1        From: guykuo@carson.u.washington.edu (Guy Kuo)...
10       From: irwin@cmptrc.lonestar.org (Irwin Arnstei...
100      From: tchen@magnus.acs.ohio-state.edu (Tsung-K...
1000     From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...
10000    From: a207706@moe.dseg.ti.com (Robert Loper)\n...
10001    From: kimman@magnus.acs.ohio-state.edu (Kim Ri...
10002    From: kwilson@casbah.acns.nwu.edu (Kirtley Wil...
10003    Subject: Re: Don't more innocents die without ...
10004    From: livesey@solntze.wpd.sgi.com (Jon Livesey...
Name: content, dtype: object

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

3. Tokenize Sentences and Clean
Removing the emails, new line characters, single quotes



In [4]:
def sent_to_words(sentences):
  print(sentences)
  
  for sent in sentences:
        sent = re.sub('\S*@\S*\s?', '', sent)
        
        sent = re.sub('\s+', ' ', sent)
        sent = re.sub("\'", "", sent)  
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)  


data = df.content.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


4.  lemmatize each word to its root form, keeping only nouns, adjectives, verbs and adverbs.




In [0]:
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    
    texts_out = []
    nlp = spacy.load('en', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

data_ready = process_words(data_words)  



5. Build the Topic Model



In [6]:

# Create Dictionary
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics= 20)

pprint(lda_model.print_topics())

[(0,
  '0.010*"organization" + 0.006*"host" + 0.006*"post" + 0.006*"nntp" + '
  '0.006*"disk" + 0.006*"university" + 0.005*"drive" + 0.005*"cec" + '
  '0.004*"write" + 0.004*"boot"'),
 (1,
  '0.011*"car" + 0.009*"organization" + 0.009*"article" + 0.008*"write" + '
  '0.006*"time" + 0.005*"bike" + 0.004*"thing" + 0.004*"state" + 0.004*"new" + '
  '0.004*"look"'),
 (2,
  '0.010*"people" + 0.008*"god" + 0.007*"write" + 0.005*"organization" + '
  '0.005*"believe" + 0.005*"question" + 0.005*"article" + 0.004*"mean" + '
  '0.004*"give" + 0.004*"israeli"'),
 (3,
  '0.011*"gordon" + 0.010*"bank" + 0.008*"organization" + 0.008*"article" + '
  '0.007*"write" + 0.006*"science" + 0.005*"reply" + 0.005*"helmet" + '
  '0.004*"gary" + 0.004*"dare"'),
 (4,
  '0.019*"space" + 0.005*"organization" + 0.005*"post" + 0.005*"nasa" + '
  '0.005*"scsi" + 0.004*"launch" + 0.004*"write" + 0.004*"mission" + '
  '0.004*"orbit" + 0.004*"article"'),
 (5,
  '0.015*"organization" + 0.012*"post" + 0.010*"university" +

In [0]:
# New Document to be classified

In [0]:
new_doc = "Looping correct email addresses of Shivali and Eshita! On Tue, 14 May 2019, 14:30 Nikita Bafna, <nikitabafna04@gmail.com> wrote: Hi Hannah,Hope you are well.Thank you for the updates.Could you please attach the map of that side of community, it seems you missed it.Also, can you clarify what is a downstairs unit? Regards,Nikita On Tue, May 14, 2019 at 12:48 AM Woodbridge Bloomington <WoodbridgeBloomington@glickco.com> wrote:Hi Nikita, Shivali, Neha and Eshita,I hope youre doing well! I wanted to let you know that weve begun looking over our apartments.Due to the high demand for two bedroom town home units, we only have one available option that fits your preferences.This apartment is located at 759 Woodbridge Drive, this is a downstairs unit end unit, and will be available to move into by July 31st.Ive included a map of that side of our community,  and will have to notify the residents of 759 that we have someone interested in their apartment, as well.Please let us know if this will work for you as soon as possible so that we can move forward with the next step.I look forward to hearing from you!"

In [0]:
def sent_to_words_new(sentences):
  print(sentences)
  sent = re.sub('\S*@\S*\s?', '', sentences) 
  sent = re.sub('\s+', ' ', sent)
  sent = re.sub("\'", "", sent)  
  sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
  yield(sent)  

def process_words_new(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts_out = []
    nlp = spacy.load('en', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out


def running_lda(new_doc):
  data_new_words = list(sent_to_words_new(new_doc))
  data_ready_new = process_words_new(data_new_words) 
  
  id2word_new = corpora.Dictionary(data_ready)
  corpus_new = [id2word.doc2bow(text) for text in data_ready_new]
  
  doc2topic_prob = lda_model.get_document_topics(corpus_new)
 
  topics = []
  prob = []
  for i in list(doc2topic_prob):
    for j in i:
      topics.append(j[0])
      prob.append(j[1])
  doc2topic= pd.DataFrame()
  doc2topic['topics'] = pd.Series(topics)
  doc2topic['prob'] = pd.Series(prob)
 
  topic = doc2topic.sort_values('prob',ascending=False).head(1)
  return(topic)

  
#running_lda(new_doc1)
#print(running_lda(new_doc1), 'dominat topic')



Text summarization:

In [0]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import regex as re 
def read_article(file_name):
    article = file_name.split(".")
    sentences = []

    for sent in article:
        sent = sent.lower()
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        #sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub("-" , " ", sent)
        sentences.append(sent.replace("[^a-zA-Z]", " ").split(" "))
    #sentences.pop() 
    
    return sentences

def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)
 
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix


def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    #print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    print("Summarize Text: \n", ". ".join(summarize_text))

# let's begin
#generate_summary( new_doc, 2)

Example:



In [0]:
new_doc1 = " From abc@def Hockey fever has gripped Bhubaneswar city and the nation at large as 16 hockey playing nations will lock horns for World Cup glory starting November 28. The fever has reached the nook and corner of the country through 'My Heart Beats for Hockey 'campaign'. Not just matches, the global event will witness a star-studded opening ceremony with Bollywood icons Shahrukh Khan, Salman Khan, Madhuri Dixit and music maestro AR Rahman set to rock the stage. The opening ceremoby will start at 5:30 pm on Tuesday evening. Elaborate security arrangement has been made to avoid untoward incidents. Tickets for the opening ceremony and India matches have been sold out. The anticipation and excitement is palpable among hockey aficionados in the country and across the globe and Odisha government is also leaving no stones unturned to use the opportunity to boost tourist footfall in the state. The 14th edition of the mega event will witness 36 matches between 28 November and 16 December with World No 1 Australia as defending champions. The opening match of the tournament will see World No 3 Belgium taking on World No 11 Canada on November 28, 2018, while host nation India will open their campaign on the same day against South Africa.No doubt, Indian national team will be the favourites among the formidable opponents in front of its home crowd. The state has spent Rs 820 million to give build new infrastructure and mount branding and publicity campaigns in the run-up to the event."

In [10]:
generate_summary( new_doc1, 3)

Summarize Text: 
 .  tickets for the opening ceremony and india matches have been sold out.  the fever has reached the nook and corner of the country through my heart beats for hockey campaign


In [11]:
running_lda(new_doc1)

 From abc@def Hockey fever has gripped Bhubaneswar city and the nation at large as 16 hockey playing nations will lock horns for World Cup glory starting November 28. The fever has reached the nook and corner of the country through 'My Heart Beats for Hockey 'campaign'. Not just matches, the global event will witness a star-studded opening ceremony with Bollywood icons Shahrukh Khan, Salman Khan, Madhuri Dixit and music maestro AR Rahman set to rock the stage. The opening ceremoby will start at 5:30 pm on Tuesday evening. Elaborate security arrangement has been made to avoid untoward incidents. Tickets for the opening ceremony and India matches have been sold out. The anticipation and excitement is palpable among hockey aficionados in the country and across the globe and Odisha government is also leaving no stones unturned to use the opportunity to boost tourist footfall in the state. The 14th edition of the mega event will witness 36 matches between 28 November and 16 December with Wo

Unnamed: 0,topics,prob
7,16,0.272225
