# Word Frequency
- input: 
    - data4: dataframe with 'text' columns
- output: 
    - data4: added 'text_clearn' columns which is spacy tokens of 'text' columns

In [2]:
import pandas
data4=pandas.read_csv('../data/5.pulledTweet-deduplicated.csv')
data4.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,index,created_at,id,text,language,shingleSet,signature
219,219,227,251.0,Tue Apr 27 23:58:10 +0000 2021,1.387194e+18,Sometimes you get a third chance Love s Thir...,en,"{1975646848, 3508350977, 2277555713, 187718236...","[18622085, 32776440, 21561465, 191860621, 9160..."
220,220,228,252.0,Tue Apr 27 23:58:10 +0000 2021,1.387194e+18,Check out this Amazon deal The Art of My Neig...,en,"{622155522, 3082850439, 1706091273, 1593029644...","[116119196, 134412392, 15045377, 922378, 36390..."
221,221,229,253.0,Tue Apr 27 23:58:08 +0000 2021,1.387194e+18,First Steps How Upright Walking Made Us Hum...,en,"{2352757504, 1318699267, 463207690, 4013102741...","[77244016, 247586965, 343106086, 653196371, 20..."
222,222,230,254.0,Tue Apr 27 23:58:04 +0000 2021,1.387194e+18,The First Day of Spring by Nancy Tucker B...,en,"{3712246153, 2391777162, 2684049684, 401310274...","[8379671, 18718807, 281343041, 177753045, 3669..."
223,223,231,255.0,Tue Apr 27 23:58:04 +0000 2021,1.387194e+18,ad off Galaxy Star Projector ...,en,"{2473481, 4013102741, 566037670, 2455870758, 1...","[144950540, 122747773, 41607353, 97773221, 829..."


In [3]:
# Generate clean tokens for all documents

import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) #just keep tagger for lemmatization, disable 'parser' and 'ner'

def token_filter(token):
    return not (token.is_punct | token.is_space | token.is_stop | len(token.text) <= 2)

def spacy_tokenizer(text):
    for doc in nlp.pipe([str(text).lower()]):
        tokens = [token.lemma_ for token in doc if token_filter(token)]
    return tokens

data4['token'] = data4['text'].map(spacy_tokenizer) 
data4['text_clean'] = data4.apply(lambda x: " ".join(x['token']), axis=1)

data4.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,index,created_at,id,text,language,shingleSet,signature,token,text_clean
219,219,227,251.0,Tue Apr 27 23:58:10 +0000 2021,1.387194e+18,Sometimes you get a third chance Love s Thir...,en,"{1975646848, 3508350977, 2277555713, 187718236...","[18622085, 32776440, 21561465, 191860621, 9160...","[sometimes, you, get, third, chance, , love,...",sometimes you get third chance love third c...
220,220,228,252.0,Tue Apr 27 23:58:10 +0000 2021,1.387194e+18,Check out this Amazon deal The Art of My Neig...,en,"{622155522, 3082850439, 1706091273, 1593029644...","[116119196, 134412392, 15045377, 922378, 36390...","[check, out, this, amazon, deal, the, art, of,...",check out this amazon deal the art of my neigh...
221,221,229,253.0,Tue Apr 27 23:58:08 +0000 2021,1.387194e+18,First Steps How Upright Walking Made Us Hum...,en,"{2352757504, 1318699267, 463207690, 4013102741...","[77244016, 247586965, 343106086, 653196371, 20...","[ , first, step, how, upright, walking, make,...",first step how upright walking make we huma...
222,222,230,254.0,Tue Apr 27 23:58:04 +0000 2021,1.387194e+18,The First Day of Spring by Nancy Tucker B...,en,"{3712246153, 2391777162, 2684049684, 401310274...","[8379671, 18718807, 281343041, 177753045, 3669...","[ , the, first, day, of, spring, by, nancy, t...",the first day of spring by nancy tucker ...
223,223,231,255.0,Tue Apr 27 23:58:04 +0000 2021,1.387194e+18,ad off Galaxy Star Projector ...,en,"{2473481, 4013102741, 566037670, 2455870758, 1...","[144950540, 122747773, 41607353, 97773221, 829...","[ , off, , galaxy, star, proje...",off galaxy star projector w...


In [7]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_top_phrase(corpus, numPhrases=None, numWords=1):
    
    vec = CountVectorizer(ngram_range=(numWords, numWords), stop_words=nltk.corpus.stopwords.words('english')).fit(corpus.astype(str))
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:numPhrases]

common_words = get_top_phrase(data4['text_clean'], numPhrases=10,numWords=4)
pandas.DataFrame(common_words, columns = ['bigram' , 'count'])

Unnamed: 0,bigram,count
0,infinite wisdom infinite wisdom,9
1,wisdom infinite wisdom fight,9
2,infinite wisdom fight within,6
3,truth win suffice fight,5
4,win suffice fight meet,5
5,suffice fight meet vakeelsaabonprime,5
6,ring light bundle save,4
7,light bundle save promo,4
8,bundle save promo code,4
9,save promo code rfdk,4


# Text summarization using word frequency (extractive approach)

In [11]:
# Join all articles into a big string
big_doc=''
for i in range(len(data4)):
    big_doc+=str(data4.text[i])+'.'

In [13]:
# OPTION 1 #very slow for alot of big documents
from gensim.summarization import summarize 
%time
summarize(big_doc, ratio = 0.02)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.82 µs


'pack of LED RGB Flood Lights            Coupon PLUS Save     with promo code   TKHBLS    .What makes a place   Haunted   Award winning book by Kitty Janusz  Now on Amazon  amp  Kindle  .\nAnimal Crossing  New Horizons   Timmy  amp  Tommy   Nintendo Switch Lite Skin is       on Amazon   ad https .Check out this book   A Very Dangerous Woman  The Lives  Loves and Lies of Russia s Most Seductive Spy  by Deborah  .\nThe truth won t suffice  he will have to fight for it    Meet  VakeelSaabOnPrime     .Grab a   pack of LED Ring Lights  get it for         Save     with promo code   FUL     .Winning After Losing  Building Resilient Teams is an important book  Learn more    .Kenwood FP    Compact Food Processor   Silver And Grey        delivered   Amazon  a     Foodie  Foodies  food  .'

In [15]:
# OPTION 2
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

%time

sentences = sent_tokenize(big_doc)
sentences = [sentence for sentence in sentences if len(sentence) >= 30]
sentences = list(set(sentences))
freq_table = dict(get_top_phrase(data4['text'].astype(str),numWords=1))

def score_sentences(sentences, freq_table):
    sentence_scores = {}
    for sentence in sentences:
        word_count_in_sentence = len(word_tokenize(sentence))
        for word in freq_table:
            if word.lower() in sentence.lower():                
                if sentence in sentence_scores:
                    sentence_scores[sentence] += freq_table[word]
                else:
                    sentence_scores[sentence] = freq_table[word]
        sentence_scores[sentence] = sentence_scores[sentence] // word_count_in_sentence
    return sentence_scores
sentence_scores = score_sentences(sentences, freq_table)


def average_sentence_scores(sentence_scores):
    sum=0
    for key, value in sentence_scores.items():
        sum+=value
    return sum/len(sentence_scores)
print('Average sentence scores is: ',average_sentence_scores(sentence_scores),'. Triple it to use as threahold for summary funciton below?')

CPU times: user 5 µs, sys: 5 µs, total: 10 µs
Wall time: 23.1 µs
Average sentence scores is:  3.2945205479452055 . Triple it to use as threahold for summary funciton below?


In [21]:
def generate_summary(sentences, sentence_scores, threshold): # Use average_sentence_scores as clue for threshold
    sentence_count = 0
    summary = ''
    for sentence in sentences:
        if sentence in sentence_scores and sentence_scores[sentence] > threshold:
            summary += " " + sentence
            sentence_count += 1
    return summary 

generate_summary(sentences, sentence_scores, 6)

' Infinite Wisdom   Infinite Wisdom  II  The Fight Within . Shonen Jump  Undead Unluck Volume   pre orders are up at Rightstuf  Amazon  B amp N . Q  Revenue Growth  YoY   Change    Tesla  TSLA         Apple  AAPL         Facebook  FB         Amazon  AMZN         Goo . King  amp  Prince  CM                    Amazon         net  . Infinite Wisdom   Infinite Wisdom  II  The Fight Wit  . Ring Light Bundle for           Save     with promo code   RFDK           Seller MUST be  . Sonos Playbar Black Works with PS   amp  PS  via TV            Amazon USA  . Reader Ready Award Recommended Read        asmsg  iartg  amreading  bookboost https    . Monster Hunter Chibi Plush pre orders  Amazon    ad        Play Asia          .STEAL      pack of LED Video Photo Lighting Kit            Save     with promo code   FDUL     .'