In [1]:
import pandas as pd
import numpy as np
import nltk
from stop_words import get_stop_words
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import gensim
import pyLDAvis.gensim

In [2]:
df = pd.read_json('https://s3.amazonaws.com/temp-data-pulls/newdump.json')

In [3]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

In [4]:
pd.options.mode.chained_assignment = None
rawpin_blog = df[(df["type"]=="pin") | (df["type"]=="blog post")]
rawpin_blog.drop(["has_spend"], axis = 1, inplace=True)
channel_info = rawpin_blog['channel_info'].apply(pd.Series)
channel_info.columns = ["channel", "info"]
content_info = rawpin_blog['content'].apply(pd.Series)
content_info.drop(['author_email', 'content', 'pinned_from'], axis=1, inplace=True) ## THESE HAVE ONLY NULLS
for x in content_info.columns:
    if "count" in x:
        content_info[x].fillna(np.NaN, inplace = True)
        #content_info[x] = content_info[x].astype(int)
master_pinblog = rawpin_blog.join(channel_info).join(content_info)
master_pinblog.drop(['channel_info', 'content'], axis = 1, inplace = True)
master_pinblog.columns = ['brand', 'engagement', 'uniqueid', 'impact', 'share_token', 'timestamp',
       'type', 'urls', 'channel', 'info', 'author_name', 'comment_count',
       'description', 'fb_likecount', 'fb_sharecount',
       'gplus_count', 'hashtags', 'image_url', 'like_count',
       'link', 'linkedin_sharecount', 'links', 'pin_id', 'pin_url',
       'pin_count', 'post_type', 'repin_count', 'summary',
       'thumbnail_url', 'title', 'tweet_count']

master_pinblog["links_count"] = master_pinblog['links'].str.len()
df_new = master_pinblog

In [5]:
df_new = master_pinblog

In [6]:
df_new.link = df_new.link.astype(str)
df_new.link.isnull().sum()

0

In [7]:
# lemmatizing give you the most basic form of a word
# this function differs from one further along in the code because this one
# keeps common english 
def lemmatizing_titles(pdseries):
    lemmatizer = WordNetLemmatizer()
    #en_stop = get_stop_words('en') if not i in en_stop
    # when a word is lemmatized, contractions are rightfully turned into different lems since 's = is
    # however, in this case contractions shorten a title's lenth, so we don't want to count them separately
    # question marks and the like shouldn't be counted as a separate word in a title
    contractions = ["'s","s","'",".",",","n't","'d","ll","re","ve","``",
                    "''","”","“","’","(",")","?",":","t",";","d","!","-","[","]","w","#","m"]
    #other_words = ["new","get",]
    # list for tokenized documents in loop
    texts = []

    # loop through document list
    post_text = [i for i in pdseries]
    count = 1
    print(f"Initializing tokenizer and lemmatizer ...")
    print("Number of titles tokenized and lemmatized:")
    for i in post_text:
        # clean and tokenize document string
        raw = i.lower()
        tokens = word_tokenize(raw)

        # stem tokens and remove stop words
        lemmed_tokens = [lemmatizer.lemmatize(i) for i in tokens]

        #remove stemmed contractions
        contracted_tokens = [i for i in lemmed_tokens if not i in contractions]

        # add tokens to list
        texts.append(contracted_tokens)
        if count % 5000 == 0:
            print(count)
        count += 1
    print("Lemmatizing Completed.")
    return texts

In [8]:
# create new df called blogs that only contains blogs
blogs = df_new[df_new.type == 'blog post']
blogs.reset_index(inplace = True)

#create new df for pinterest
pins = df_new[df_new.type == 'pin']
pins.reset_index(inplace = True)

In [9]:
print('Initializing title feature extraction...')
print('Initializing word count for title length of **blogs**...')
# need to tokenize and lemmatize to count the length
lemmatized_titles = lemmatizing_titles(blogs.title)
# adding to dataframe
blogs['title_length'] = [len(lemmatized_titles[i]) for i in range(len(lemmatized_titles))]

print('Initializing word count for title length of **pins**...')
# doing the same for pinterest, those are titles in description
lemmatized_descriptions = lemmatizing_titles(pins.description)
# adding to dataframe
pins['title_length'] = [len(lemmatized_descriptions[i]) for i in range(len(lemmatized_descriptions))]

print('Extracting other attributes from titles...')
# the following code is a bunch of different feature extractions for the titles
blogs['title_is_question'] = ['?' in i for i in blogs.title]
pins['title_is_question'] = ['?' in i for i in pins.description]

blogs['title_contains_number'] = [any(x in i for x in ['1','2','3','4','5','6','7','8','9','0']) for i in blogs.title]
pins['title_contains_number'] = [any(x in i for x in ['1','2','3','4','5','6','7','8','9','0']) for i in pins.description]

blogs['title_contains_best'] = ['best' in i.lower() for i in blogs.title]
pins['title_contains_best'] = ['best' in i.lower() for i in pins.description]
blogs['title_contains_sex'] = ['sex' in i.lower() for i in blogs.title]
pins['title_contains_sex'] = ['sex' in i.lower() for i in pins.description]
blogs['title_contains_now'] = ['now' in i.lower() for i in blogs.title]
pins['title_contains_now'] = ['now' in i.lower() for i in pins.description]
blogs['title_contains_new'] = ['new' in i.lower() for i in blogs.title]
pins['title_contains_new'] = ['new' in i.lower() for i in pins.description]
blogs['title_contains_episode'] = ['episode' in i.lower() for i in blogs.title]
pins['title_contains_episode'] = ['episode' in i.lower() for i in pins.description]

print('Title Feature Extraction Complete')

Initializing title feature extraction...
Initializing word count for title length of **blogs**...
Initializing tokenizer and lemmatizer ...
Number of titles tokenized and lemmatized:
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
Lemmatizing Completed.
Initializing word count for title length of **pins**...
Initializing tokenizer and lemmatizer ...
Number of titles tokenized and lemmatized:
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
Lemmatizing Completed.
Extracting other attributes from titles...
Title Feature Extraction Complete


In [10]:
# converts link to string so we can split
blogs.link = blogs.link.astype(str)
# instantiate a new list called new_mag
new_mag = []
# list comprehension that just keeps part before '.com'
# we can use list comprehension because this is true for all values
magazine = [i.split('.com')[0] for i in blogs.link]
# start for loop to get rid of everything before the name of the magazine
for i in magazine:
    if '.' in i:
        new_mag.append(i.split('.')[1])
# if there isn't a '.' it just sends the existing name to the list
    else:
        new_mag.append(i)
# create new column for the blog df with the publications
blogs['pub'] = new_mag

# create a new subset of dataframes based on publication
df_blogs_glamour = blogs[blogs.pub == 'glamour']
#df_pins_glamour = pins[pins.pub == 'glamour']
df_blogs_teenvogue = blogs[blogs.pub == 'teenvogue']
#df_pins_teenvogue = pins[pins.pub == 'teenvogue']
df_blogs_wmagazine = blogs[blogs.pub == 'wmagazine']
#df_pins_wmagazine = pins[pins.pub == 'wmagazine']
df_blogs_allure = blogs[blogs.pub == 'allure']
#df_pins_allure = pins[pins.pub == 'allure']
df_blogs_cntraveler = blogs[blogs.pub == 'cntraveler']
#df_pins_cntraveler = pins[pins.pub == 'cntraveler']
df_blogs_architecturaldigest = blogs[blogs.pub == 'architecturaldigest']
#df_pins_architecturaldigest = pins[pins.pub == 'architecturaldigest']
df_blogs_vogue = blogs[blogs.pub == 'vogue']
#df_pins_vogue = pins[pins.pub == 'vogue']

print('Domains of Blogs:')
print(blogs.pub.unique())

Domains of Blogs:
['glamour' 'teenvogue' 'wmagazine' 'allure' 'cntraveler'
 'architecturaldigest' 'vogue']


In [11]:
def lemmatizing(pdseries):
    lemmatizer = WordNetLemmatizer()
    en_stop = get_stop_words('en')
    # when a word is lemmatized, contractions are rightfully turned into different stems since 's = is
    # however, in reality, all of those words are themselves stop words, so I want to exclude them
    # question marks and the like are not helpful for our purpose of figuring out potential categories
    contractions = ["'s","s","'",".",",","n't","'d","ll","re","ve","``",
                    "''","”","“","’","(",")","?",":","t",";","d","!","-","[","]","w","#","m"]
    #other_words = ["new","get",]
    # list for tokenized documents in loop
    texts = []

    # loop through document list
    post_text = [i for i in pdseries]
    count = 1
    print(f"Initializing tokenizer and lemmatizer ...")
    print("Number of posts tokenized and lemmatized:")
    for i in post_text:
        # clean and tokenize document string
        raw = i.lower()
        tokens = word_tokenize(raw)

        # stem tokens and remove stop words
        lemmed_tokens = [lemmatizer.lemmatize(i) for i in tokens if not i in en_stop]

        #remove stemmed contractions
        contracted_tokens = [i for i in lemmed_tokens if not i in contractions]

        # add tokens to list
        texts.append(contracted_tokens)
        if count % 5000 == 0:
            print(count)
        count += 1
    print("Lemmatizing Completed.")
    return texts

In [None]:
# this stemmer didn't end up being used, but it's good code if you
# want to stem instead of lemmatize
# stemming give you the root of a word
# while lemmatizing give you the most basic form of a word
def stemming(pdseries):
    stemmer = SnowballStemmer("english",ignore_stopwords=True)
    en_stop = get_stop_words('en')
    # when a word is stemmed, contractions are rightfully turned into different stems since 's = is
    # however, in reality, all of those words are themselves stop words, so I want to exclude them
    # question marks and the like are not helpful for our purpose of figuring out potential categories
    contractions = ["'s","s","'",".",",","n't","'d","ll","re","ve","``",
                    "''","”","“","’","(",")","?",":","t",";","d","!","-","[","]","w","#","m"]
    #other_words = ["new","get",]
    # list for tokenized documents in loop
    texts = []

    # loop through document list
    post_text = [i for i in pdseries]
    count = 1
    print(f"Initializing tokenizer and stemmer ...")
    print("Number of posts tokenized and stemmed:")
    for i in post_text:
        # clean and tokenize document string
        raw = i.lower()
        tokens = word_tokenize(raw)

        # stem tokens and remove stop words
        stemmed_tokens = [stemmer.stem(i) for i in tokens if not i in en_stop]

        #remove stemmed contractions
        contracted_tokens = [i for i in stemmed_tokens if not i in contractions]

        # add tokens to list
        texts.append(contracted_tokens)
        if count % 5000 == 0:
            print(count)
        count += 1
    print("Stemming Completed.")
    return texts

In [12]:
glamour = lemmatizing(df_blogs_glamour.title)
teenvogue = lemmatizing(df_blogs_teenvogue.title)
wmagazine = lemmatizing(df_blogs_wmagazine.title)
allure = lemmatizing(df_blogs_allure.title)
cntraveler = lemmatizing(df_blogs_cntraveler.title)
architecturaldigest = lemmatizing(df_blogs_architecturaldigest.title)
vogue = lemmatizing(df_blogs_vogue.title)

Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
10000
Lemmatizing Completed.
Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
10000
15000
20000
25000
Lemmatizing Completed.
Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
Lemmatizing Completed.
Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
Lemmatizing Completed.
Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
Lemmatizing Completed.
Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
Lemmatizing Completed.
Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
10000
Lemmatizing Completed.


In [13]:
# function to identify different groupings of words which count as topics
def topic_modeling(pub,number_of_topics = 5,number_of_words = 30,number_of_passes = 1):
    print("Initializing:...")
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(pub)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in pub]
    # generate LDA model
    print("Generating Model...")
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_of_topics, id2word = dictionary, passes=number_of_passes)
    topics = ldamodel.print_topics(num_topics=number_of_topics, num_words=number_of_words)
    print("Topics\n")
    for i in range(number_of_topics):
        print(f"Topic {topics[i][0]}: \n")
        print(topics[i][1], "\n")
    return ldamodel[corpus]

In [14]:
print("\n***GLAMOUR***")
topic_vector = topic_modeling(glamour,number_of_passes = 3)
print("Adding topic probabilities to DataFrame...")
# a very few number of posts don't contain all 5 topics, which explains the 'if'statement within. Not worth building
# out separate code for so few exceptions
df_blogs_glamour['Topic_0'] = [topic_vector[i][0][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_glamour['Topic_1'] = [topic_vector[i][1][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_glamour['Topic_2'] = [topic_vector[i][2][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_glamour['Topic_3'] = [topic_vector[i][3][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_glamour['Topic_4'] = [topic_vector[i][4][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
print("Number of Missing Values:")
print(df_blogs_glamour.Topic_4.isnull().sum())

print("\n***TEENVOGUE***")
topic_vector = topic_modeling(teenvogue,number_of_passes = 3)
print("Adding topic probabilities to DataFrame...")
df_blogs_teenvogue['Topic_0'] = [topic_vector[i][0][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_teenvogue['Topic_1'] = [topic_vector[i][1][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_teenvogue['Topic_2'] = [topic_vector[i][2][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_teenvogue['Topic_3'] = [topic_vector[i][3][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_teenvogue['Topic_4'] = [topic_vector[i][4][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
print("Number of Missing Values:")
print(df_blogs_teenvogue.Topic_4.isnull().sum())

print("\n***WMAGAZINE***")
topic_vector = topic_modeling(wmagazine,number_of_passes = 3)
print("Adding topic probabilities to DataFrame...")
df_blogs_wmagazine['Topic_0'] = [topic_vector[i][0][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_wmagazine['Topic_1'] = [topic_vector[i][1][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_wmagazine['Topic_2'] = [topic_vector[i][2][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_wmagazine['Topic_3'] = [topic_vector[i][3][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_wmagazine['Topic_4'] = [topic_vector[i][4][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
print("Number of Missing Values:")
print(df_blogs_wmagazine.Topic_4.isnull().sum())

print("\n***ALLURE***")
topic_vector = topic_modeling(allure,number_of_passes = 3)
print("Adding topic probabilities to DataFrame...")
df_blogs_allure['Topic_0'] = [topic_vector[i][0][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_allure['Topic_1'] = [topic_vector[i][1][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_allure['Topic_2'] = [topic_vector[i][2][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_allure['Topic_3'] = [topic_vector[i][3][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_allure['Topic_4'] = [topic_vector[i][4][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
print("Number of Missing Values:")
print(df_blogs_allure.Topic_4.isnull().sum())

print("\n***CNTRAVELER***")
topic_vector = topic_modeling(cntraveler,number_of_passes = 3)
print("Adding topic probabilities to DataFrame...")
df_blogs_cntraveler['Topic_0'] = [topic_vector[i][0][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_cntraveler['Topic_1'] = [topic_vector[i][1][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_cntraveler['Topic_2'] = [topic_vector[i][2][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_cntraveler['Topic_3'] = [topic_vector[i][3][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_cntraveler['Topic_4'] = [topic_vector[i][4][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
print("Number of Missing Values:")
print(df_blogs_cntraveler.Topic_4.isnull().sum())

print("\n***ARCHITECTURALDIGEST***")
topic_vector = topic_modeling(architecturaldigest,number_of_passes = 3)
print("Adding topic probabilities to DataFrame...")
df_blogs_architecturaldigest['Topic_0'] = [topic_vector[i][0][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_architecturaldigest['Topic_1'] = [topic_vector[i][1][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_architecturaldigest['Topic_2'] = [topic_vector[i][2][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_architecturaldigest['Topic_3'] = [topic_vector[i][3][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_architecturaldigest['Topic_4'] = [topic_vector[i][4][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
print("Number of Missing Values:")
print(df_blogs_architecturaldigest.Topic_4.isnull().sum())

print("\n***VOGUE***")
topic_vector = topic_modeling(vogue,number_of_passes = 3)
print("Adding topic probabilities to DataFrame...")
df_blogs_vogue['Topic_0'] = [topic_vector[i][0][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_vogue['Topic_1'] = [topic_vector[i][1][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_vogue['Topic_2'] = [topic_vector[i][2][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_vogue['Topic_3'] = [topic_vector[i][3][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
df_blogs_vogue['Topic_4'] = [topic_vector[i][4][1] if len(topic_vector[i]) == 5 else np.NaN for i in range(len(topic_vector))]
print("Number of Missing Values:")
print(df_blogs_vogue.Topic_4.isnull().sum())


***GLAMOUR***
Initializing:...
Generating Model...
Topics

Topic 0: 

0.017*"woman" + 0.009*"sex" + 0.008*"new" + 0.007*"will" + 0.007*"can" + 0.007*"thing" + 0.007*"day" + 0.007*"work" + 0.007*"season" + 0.007*"6" + 0.007*"get" + 0.006*"year" + 0.005*"make" + 0.005*"job" + 0.005*"7" + 0.005*"life" + 0.005*"5" + 0.005*"watch" + 0.005*"first" + 0.005*"wedding" + 0.005*"game" + 0.005*"much" + 0.005*"help" + 0.005*"beauty" + 0.005*"girl" + 0.005*"4" + 0.004*"gift" + 0.004*"8" + 0.004*"want" + 0.004*"recap" 

Topic 1: 

0.019*"trump" + 0.013*"woman" + 0.011*"donald" + 0.009*"clinton" + 0.008*"hillary" + 0.006*"talk" + 0.005*"show" + 0.005*"sex" + 0.005*"new" + 0.005*"movie" + 0.005*"picture" + 0.005*"netflix" + 0.005*"just" + 0.004*"president" + 0.004*"know" + 0.004*"obama" + 0.004*"hadid" + 0.004*"say" + 0.004*"get" + 0.004*"coming" + 0.004*"will" + 0.003*"'re" + 0.003*"birth" + 0.003*"watch" + 0.003*"first" + 0.003*"abortion" + 0.003*"gigi" + 0.003*"need" + 0.003*"like" + 0.003*"bill" 


Number of Missing Values:
0

***CNTRAVELER***
Initializing:...
Generating Model...
Topics

Topic 0: 

0.021*"flight" + 0.018*"$" + 0.016*"deal" + 0.016*"world" + 0.015*"new" + 0.011*"trip" + 0.008*"best" + 0.008*"u.s." + 0.007*"day" + 0.007*"hotel" + 0.006*"round-trip" + 0.006*"travel" + 0.006*"get" + 0.006*"airport" + 0.005*"fly" + 0.005*"cruise" + 0.005*"vacation" + 0.005*"secret" + 0.005*"road" + 0.005*"park" + 0.005*"..." + 0.004*"year" + 0.004*"now" + 0.004*"paris" + 0.004*"next" + 0.004*"choice" + 0.004*"nyc" + 0.004*"one" + 0.004*"recipe" + 0.004*"change" 

Topic 1: 

0.022*"2016" + 0.017*"travel" + 0.013*"beautiful" + 0.012*"will" + 0.010*"photo" + 0.008*"make" + 0.008*"place" + 0.008*"10" + 0.008*"world" + 0.007*"want" + 0.007*"passport" + 0.006*"plane" + 0.006*"traveler" + 0.006*"best" + 0.005*"hotel" + 0.005*"travelogue" + 0.005*"podcast" + 0.005*"never" + 0.005*"now" + 0.005*"new" + 0.005*"take" + 0.005*"visit" + 0.005*"gold" + 0.005*"japan" + 0.004*"get" + 0.004*"wear" + 0

In [None]:
# the code below helps to create a cool vizualiztion
# just make sure that you rename all of the models, corpuses, and dictionaries
# if you want to be able to see each one
# vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
# pyLDAvis.display(vis)

In [None]:
# the following code creates information about the most frequent words
# or the most frequent words that stand out from the rest
# to add these feature to the dataframe, just merge the dataframes

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tvec20 = CountVectorizer(max_features = 20,stop_words='english')
tfidf20  = pd.DataFrame(tvec20.fit_transform(blogs.title).todense(),columns=tvec20.get_feature_names())
print("Top 20 tf-idf:")
print(tfidf20.sum().sort_values(ascending=False))

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tvec40 = CountVectorizer(max_features = 40,stop_words='english')
tfidf40  = pd.DataFrame(tvec40.fit_transform(blogs.title).todense(),columns=tvec40.get_feature_names())
print("Top 40 tf-idf:")
print(tfidf40.sum().sort_values(ascending=False))

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
cvec40 = CountVectorizer(max_features = 40,stop_words='english')
vec40  = pd.DataFrame(cvec40.fit_transform(blogs.summary).todense(),columns=cvec40.get_feature_names())
print("Top 40 Most Used Words:")
print(vec40.sum().sort_values(ascending=False))