In [2]:
import pandas as pd
import numpy as np
import nltk
from stop_words import get_stop_words
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import gensim
import pyLDAvis.gensim

In [3]:
# Load the raw JSON dump into a DataFrame directly from its S3 URL
# (this cell needs network access every time it is run).
df = pd.read_json('https://s3.amazonaws.com/temp-data-pulls/newdump.json')

In [7]:
# Disable pandas' chained-assignment warning globally; this setting stays in
# effect for every cell executed after this one.
pd.options.mode.chained_assignment = None

# Keep only pins and blog posts, then discard the "has_spend" column.
# Rebinding the name (instead of inplace=True) avoids mutating a filtered frame.
rawpin_blog = df[(df["type"] == "pin") | (df["type"] == "blog post")]
rawpin_blog = rawpin_blog.drop(["has_spend"], axis=1)

# Expand the nested "channel_info" values into their own columns.
# NOTE(review): the rename below requires exactly two resulting columns.
channel_info = rawpin_blog['channel_info'].apply(pd.Series)
channel_info.columns = ["channel", "info"]

# Expand the nested "content" values the same way.
content_info = rawpin_blog['content'].apply(pd.Series)
content_info = content_info.drop(['author_email', 'content', 'pinned_from'], axis=1) ## THESE HAVE ONLY NULLS

# Normalise missing values in every "*count*" column to NaN.
# The result is assigned back explicitly: fillna(inplace=True) on a column
# selection is not guaranteed to update the parent frame (and never does under
# pandas Copy-on-Write). np.nan is used because the np.NaN alias no longer
# exists in NumPy 2.0.
for x in content_info.columns:
    if "count" in x:
        content_info[x] = content_info[x].fillna(np.nan)

# Re-attach the expanded columns (index-aligned joins) and drop the nested originals.
master_pinblog = rawpin_blog.join(channel_info).join(content_info)
master_pinblog = master_pinblog.drop(['channel_info', 'content'], axis=1)

# Positional rename: this relies on the joined frame having exactly these
# columns in exactly this order - verify if the dump's schema ever changes.
master_pinblog.columns = ['brand', 'engagement', 'uniqueid', 'impact', 'share_token', 'timestamp',
       'type', 'urls', 'channel', 'info', 'author_name', 'comment_count',
       'description', 'fb_likecount', 'fb_sharecount',
       'gplus_count', 'hashtags', 'image_url', 'like_count',
       'link', 'linkedin_sharecount', 'links', 'pin_id', 'pin_url',
       'pin_count', 'post_type', 'repin_count', 'summary',
       'thumbnail_url', 'title', 'tweet_count']

# Length of each "links" entry, via the .str accessor.
master_pinblog["links_count"] = master_pinblog['links'].str.len()

In [56]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so changes made through df_new are also visible through master_pinblog.
df_new = master_pinblog

In [57]:
# Cast the "link" column to str so string methods can be used on every row.
df_new["link"] = df_new["link"].astype(str)

In [67]:
def _publication_from_link(link):
    """Guess the publication name from a link string.

    The text before the first '.com' is taken, a leading 'http://' or
    'https://' is removed, and then the second dot-separated part is returned
    when there is one (e.g. 'www.glamour' -> 'glamour'); otherwise the
    remaining text is returned as is.
    """
    host = link.split('.com')[0]
    # Strip the scheme so a bare domain such as "http://glamour.com/..." yields
    # "glamour" rather than "http://glamour". The scheme contains no '.', so
    # links that do have a subdomain give the same answer as before.
    for scheme in ('http://', 'https://'):
        if host.startswith(scheme):
            host = host[len(scheme):]
            break
    return host.split('.')[1] if '.' in host else host


def create_blogs_df(df):
    """Return the blog-post rows of ``df`` with an added 'pub' column.

    Parameters
    ----------
    df : DataFrame with at least a 'type' and a 'link' column.

    Returns
    -------
    A new DataFrame (the input is not modified) containing only rows whose
    'type' equals 'blog post'. The index is reset, so the previous index is
    kept as a regular column, 'link' is cast to str, and 'pub' holds the
    publication name derived from each link.
    """
    # explicit copy: the columns assigned below must not write into a view of df
    blogs = df[df['type'] == 'blog post'].copy()
    blogs = blogs.reset_index()

    # links must be strings before they can be split
    blogs['link'] = blogs['link'].astype(str)
    blogs['pub'] = [_publication_from_link(link) for link in blogs['link']]

    return blogs

In [68]:
def publication_df(df, publication):
    """Return the rows of ``df`` whose 'pub' value equals ``publication``."""
    is_requested_pub = df['pub'].eq(publication)
    return df.loc[is_requested_pub]

In [69]:
def lemmatizing(df, series, stop_words = True):
    """Tokenize and lemmatize every value of ``df[series]``.

    Parameters
    ----------
    df : mapping/DataFrame indexable by ``series``.
    series : key of the text column to process; each value must support ``.lower()``.
    stop_words : when True, English stop words are dropped from the raw tokens
        before they are lemmatized.

    Returns
    -------
    A list with one list of lemmas per input value, in input order. Punctuation
    and contraction fragments are always removed from the lemmas.
    """
    lemmatizer = WordNetLemmatizer()
    # Sets make each per-token membership test a hash lookup instead of a list
    # scan. An empty set disables stop-word filtering.
    en_stop = frozenset(get_stop_words('en')) if stop_words else frozenset()
    # when a word is lemmatized, contractions are rightfully turned into different stems since 's = is
    # however, in reality, all of those words are themselves stop words, so I want to exclude them
    # question marks and the like are not helpful for our purpose of figuring out potential categories
    contractions = frozenset(["'s","s","'",".",",","n't","'d","ll","re","ve","``",
                              "''","”","“","’","(",")","?",":","t",";","d","!","-","[","]","w","#","m"])
    # one list of lemmas per document
    texts = []

    print("Initializing tokenizer and lemmatizer ...")
    print("Number of posts tokenized and lemmatized:")
    for count, post in enumerate(df[series], start=1):
        # clean and tokenize document string
        tokens = word_tokenize(post.lower())

        # drop stop words (if requested), then lemmatize what is left
        lemmed_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in en_stop]

        # remove punctuation and contraction fragments
        texts.append([lemma for lemma in lemmed_tokens if lemma not in contractions])

        # progress report every 5000 documents
        if count % 5000 == 0:
            print(count)
    print("Lemmatizing Completed.")
    return texts

In [70]:
def remove_stop_words(list_ = None):
    """Remove English stop words from tokenized text.

    Parameters
    ----------
    list_ : either a list of token lists (one per document) or a flat list of
        string tokens. When omitted, the module-level ``lemmatized_titles`` is
        looked up at call time.

    Returns
    -------
    For a list of token lists: a list of the same length whose inner lists no
    longer contain stop words (a document may become empty but is never
    dropped, so positions keep matching the source rows). For a flat list of
    tokens: the tokens that are not stop words.
    """
    if list_ is None:
        # Resolved when the function is called. A default written as
        # `list_ = lemmatized_titles` is evaluated once, at definition time,
        # which fails on a fresh kernel and otherwise captures stale data.
        list_ = lemmatized_titles
    print('Removing stop words...')
    en_stop = frozenset(get_stop_words('en'))
    no_stop_words = []
    for item in list_:
        if isinstance(item, str):
            # flat token list: keep the token unless it is a stop word
            if item not in en_stop:
                no_stop_words.append(item)
        else:
            # document (list of tokens): filter inside it. Testing the whole
            # list against the stop words never matches, so nothing was removed.
            no_stop_words.append([token for token in item if token not in en_stop])
    print('Stop Word Removal Complete.')
    return no_stop_words

In [71]:
def title_feature_extraction(df, lem_list = None):
    """Add title-derived feature columns to ``df`` and return it.

    Parameters
    ----------
    df : DataFrame with a 'title' column of strings. It is modified in place.
    lem_list : tokenized titles, one entry per row of ``df`` in row order.
        When omitted, the module-level ``lemmatized_titles`` is looked up at
        call time.

    Returns
    -------
    The same DataFrame object with these columns added: 'title_length' (number
    of tokens left after ``remove_stop_words``), 'title_is_question',
    'title_contains_number' and one 'title_contains_<word>' flag for each of
    best, sex, now, new, episode and how. Keyword flags are case-insensitive
    substring checks, so e.g. 'show' also counts as containing 'how'.
    """
    if lem_list is None:
        # Resolved at call time; a default of `lem_list = lemmatized_titles`
        # would be evaluated once, when the function is defined.
        lem_list = lemmatized_titles
    print('Initializing title feature extraction...')
    print('Initializing word count for title length of **blogs**...')
    # need to tokenize and lemmatize to count the length
    stopped_titles = remove_stop_words(lem_list)
    # adding to dataframe
    df['title_length'] = [len(tokens) for tokens in stopped_titles]

    print('Extracting other attributes from titles...')
    titles = list(df['title'])
    df['title_is_question'] = ['?' in title for title in titles]
    df['title_contains_number'] = [any(char in '0123456789' for char in title) for title in titles]

    # one boolean column per keyword, created in this order
    lowered_titles = [title.lower() for title in titles]
    for keyword in ('best', 'sex', 'now', 'new', 'episode', 'how'):
        df['title_contains_' + keyword] = [keyword in title for title in lowered_titles]

    print('Title Feature Extraction Complete')
    return df

In [74]:
# function to identify different groupings of words which count as topics
def topic_modeling(df = None, lem_list = None,number_of_topics = 5,number_of_words = 30,number_of_passes = 3,random_state = None):
    """Fit an LDA model on tokenized documents and add topic columns to ``df``.

    Parameters
    ----------
    df : DataFrame with one row per document in ``lem_list`` (same order). It
        is modified in place. When omitted, the module-level ``df`` is looked
        up at call time.
    lem_list : list of token lists. When omitted, the module-level
        ``lemmatized_titles`` is looked up at call time.
    number_of_topics : number of LDA topics to fit.
    number_of_words : number of words printed per topic.
    number_of_passes : number of training passes over the corpus.
    random_state : forwarded to gensim's LdaModel; pass an int to make the
        fitted topics reproducible between runs. None keeps gensim's default.

    Returns
    -------
    The same DataFrame with columns 'Topic_0' .. 'Topic_<n-1>'. A row gets the
    topic probabilities only when the model reported every topic for that
    document; otherwise all of its topic columns are NaN.
    """
    # Resolve the fallbacks when the function is called. Defaults written as
    # `df = df` / `lem_list = lemmatized_titles` are evaluated once, at
    # definition time, and so bind whatever those globals held at that moment.
    if df is None:
        df = globals()['df']
    if lem_list is None:
        lem_list = lemmatized_titles

    print("Initializing Topic Modeling...")
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(lem_list)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in lem_list]
    # generate LDA model
    print("Generating Model...")
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_of_topics, id2word = dictionary,
                                               passes=number_of_passes, random_state=random_state)
    topics = ldamodel.print_topics(num_topics=number_of_topics, num_words=number_of_words)
    print("Topics\n")
    for topic_id, topic_terms in topics:
        print(f"Topic {topic_id}: \n")
        print(topic_terms, "\n")

    # Infer each document's topic distribution exactly once. Indexing
    # `ldamodel[corpus][i]` repeats the inference on every access, i.e. twice
    # per document for every topic column.
    topic_vector = [ldamodel[bow] for bow in corpus]

    print("Adding topic probabilities to DataFrame...")
    for j in range(number_of_topics):
        print(f'Adding Topic {j}...')
        # Entries are read by position, so a value is only used when the
        # document's result lists every topic; shorter results become NaN.
        df["Topic_{}".format(j)] = [
            doc_topics[j][1] if len(doc_topics) == number_of_topics else np.nan
            for doc_topics in topic_vector
        ]
    print("Topic Modeling Complete.")
    return df

In [73]:
# Run the whole pipeline for one publication.
# NOTE: `df` is rebound here, so after this cell it no longer refers to the raw
# dump loaded at the top of the notebook.
df = create_blogs_df(df_new)
df = publication_df(df, 'glamour')
# stop words are kept at this stage (stop_words = False)
lemmatized_titles = lemmatizing(df, 'title', stop_words = False)
# Pass the frame and the token lists explicitly instead of relying on the
# functions' default arguments, which are bound when the functions are defined
# and may therefore point at stale or missing objects.
df = title_feature_extraction(df, lemmatized_titles)
df = topic_modeling(df, lemmatized_titles)

Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
10000
Lemmatizing Completed.
Initializing title feature extraction...
Initializing word count for title length of **blogs**...
Removing stop words...
Stop Word Removal Complete.
Extracting other attributes from titles...
Title Feature Extraction Complete
Initializing Topic Modeling...
Generating Model...
Topics

Topic 0: 

0.021*"a" + 0.020*"her" + 0.016*"and" + 0.014*"is" + 0.013*"the" + 0.012*"on" + 0.011*"for" + 0.009*"to" + 0.009*"clinton" + 0.008*"with" + 0.008*"in" + 0.008*"new" + 0.008*"hillary" + 0.007*"just" + 0.006*"obama" + 0.006*"kardashian" + 0.005*"emma" + 0.005*"of" + 0.004*"blake" + 0.004*"ring" + 0.004*"little" + 0.004*"tatum" + 0.004*"engagement" + 0.004*"picture" + 0.004*"ryan" + 0.004*"recap" + 0.004*"she" + 0.004*"lively" + 0.004*"season" + 0.004*"day" 

Topic 1: 

0.046*"the" + 0.032*"and" + 0.021*"of" + 0.016*"best" + 0.015*"summer" + 0.012*"is" + 0.012*"for" + 0.012*"on" + 0