In [2]:
import pandas as pd
import numpy as np
import nltk
from stop_words import get_stop_words
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import gensim
import pyLDAvis.gensim

In [3]:
# Location of the raw JSON dump that every later cell works from.
RAW_DUMP_URL = 'https://s3.amazonaws.com/temp-data-pulls/newdump.json'

# Load the dump straight from the URL into a DataFrame.
df = pd.read_json(RAW_DUMP_URL)

In [7]:
# Later cells assign columns onto filtered frames; keep pandas'
# SettingWithCopyWarning silenced for the session, as the original cell did.
pd.options.mode.chained_assignment = None

# Keep only pins and blog posts and discard the `has_spend` column.
# The drop is chained so a new frame is produced instead of mutating a
# filtered slice of `df` in place.
is_pin_or_blog = (df["type"] == "pin") | (df["type"] == "blog post")
rawpin_blog = df[is_pin_or_blog].drop(["has_spend"], axis=1)

# Expand the nested `channel_info` values into two flat columns.
channel_info = rawpin_blog['channel_info'].apply(pd.Series)
channel_info.columns = ["channel", "info"]

# Expand the nested `content` values into one column per field, then drop
# the three fields the original author flagged as containing only nulls.
content_info = (
    rawpin_blog['content']
    .apply(pd.Series)
    .drop(['author_email', 'content', 'pinned_from'], axis=1)
)

# NOTE: the original cell looped over the "*count*" columns calling
# `content_info[x].fillna(np.NaN, inplace=True)`. Replacing NaN with NaN
# changes nothing, and `np.NaN` no longer exists in NumPy 2.0, so the loop
# was removed. The count columns keep their missing values as NaN.

# Attach the expanded columns and remove the nested originals.
master_pinblog = (
    rawpin_blog
    .join(channel_info)
    .join(content_info)
    .drop(['channel_info', 'content'], axis=1)
)

# Positional rename: this list must stay in the same order as the joined
# columns (pandas raises ValueError if the lengths differ).
master_pinblog.columns = ['brand', 'engagement', 'uniqueid', 'impact', 'share_token', 'timestamp',
       'type', 'urls', 'channel', 'info', 'author_name', 'comment_count',
       'description', 'fb_likecount', 'fb_sharecount',
       'gplus_count', 'hashtags', 'image_url', 'like_count',
       'link', 'linkedin_sharecount', 'links', 'pin_id', 'pin_url',
       'pin_count', 'post_type', 'repin_count', 'summary',
       'thumbnail_url', 'title', 'tweet_count']

# Number of entries in each row's `links` value (NaN where `links` is missing).
master_pinblog["links_count"] = master_pinblog['links'].str.len()

In [154]:
# Alias, not a copy: `df_new` and `master_pinblog` are the same DataFrame object,
# so any change made through `df_new` is also visible through `master_pinblog`.
df_new = master_pinblog

In [155]:
# Cast the `link` column to plain strings so later string operations can be applied to every row.
df_new['link'] = df_new['link'].astype(str)

In [156]:
def _publication_from_link(link):
    """Return the publication name embedded in a blog-post URL string."""
    # Strip the scheme first: without this, a host that has no subdomain
    # ("http://glamour.com/...") produced "http://glamour" instead of "glamour".
    without_scheme = link.split('://', 1)[-1]
    # Keep only what precedes the first ".com".
    host_prefix = without_scheme.split('.com')[0]
    # "www.glamour" -> "glamour"; a prefix without a dot is already the name.
    if '.' in host_prefix:
        return host_prefix.split('.')[1]
    return host_prefix


def create_medium_df(df, medium):
    """Return the rows of ``df`` whose ``type`` equals ``medium``.

    The result is a new frame with a fresh 0..n-1 index; the previous index
    is kept as an ``index`` column. For ``medium == 'blog post'`` the ``link``
    column is cast to ``str`` and a ``pub`` column holding the publication
    name parsed from each link is added. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``type`` column (and a ``link`` column for blog posts).
    medium : str
        Value of ``type`` to keep, e.g. ``'blog post'`` or ``'pin'``.

    Returns
    -------
    pandas.DataFrame
    """
    # reset_index() returns a new frame, so the column assignments below never
    # write into a slice of the caller's DataFrame.
    medium_df = df[df['type'] == medium].reset_index()
    print('Creating {} DataFrame...'.format(medium))

    if medium == 'blog post':
        # Links must be strings before they can be split.
        medium_df['link'] = medium_df['link'].astype(str)
        medium_df['pub'] = [_publication_from_link(link) for link in medium_df['link']]

    return medium_df

In [157]:
def publication_df(df, publication):
    """Return only the rows of ``df`` whose ``pub`` column equals ``publication``."""
    is_requested_publication = df['pub'] == publication
    return df[is_requested_publication]

In [183]:
def lemmatizing(df, series, stop_words = True):
    """Tokenize and lemmatize every entry of ``df[series]``.

    Each entry is lower-cased, tokenized with ``word_tokenize`` and
    lemmatized with ``WordNetLemmatizer``. Contraction fragments and
    punctuation tokens are always removed; English stop words are removed
    only when ``stop_words`` is true.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    series : str
        Name of the text column to process.
    stop_words : bool, default True
        Whether to drop English stop words before lemmatizing.

    Returns
    -------
    list of list of str
        One token list per row, in row order. Entries that are not strings
        (for example NaN) produce an empty token list, so the result always
        has as many items as ``df`` has rows.
    """
    lemmatizer = WordNetLemmatizer()
    # Sets make every membership test O(1); the original scanned lists per token.
    en_stop = set(get_stop_words('en'))
    # when a word is lemmatized, contractions are rightfully turned into different stems since 's = is
    # however, in reality, all of those words are themselves stop words, so I want to exclude them
    # question marks and the like are not helpful for our purpose of figuring out potential categories
    contractions = {"'s","s","'",".",",","n't","'d","ll","re","ve","``",
                    "''","”","“","’","(",")","?",":","t",";","d","!","-","[","]","w","#","m"}
    # one token list per document
    texts = []

    print("Initializing tokenizer and lemmatizer ...")
    print("Number of posts tokenized and lemmatized:")
    for count, post in enumerate(df[series], start=1):
        # Missing or non-string entries (e.g. NaN) have no .lower(); treat them
        # as empty text instead of raising.
        raw = post.lower() if isinstance(post, str) else ''
        tokens = word_tokenize(raw)

        if stop_words:
            # stop words are filtered on the raw token, before lemmatizing
            tokens = [token for token in tokens if token not in en_stop]
        lemmed_tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # remove contraction fragments and punctuation
        texts.append([lemma for lemma in lemmed_tokens if lemma not in contractions])

        # progress report every 5000 documents
        if count % 5000 == 0:
            print(count)
    print("Lemmatizing Completed.")
    return texts

In [159]:
def remove_stop_words(list_):
    """Remove English stop words from a token list or from a list of token lists.

    Parameters
    ----------
    list_ : list
        Either a flat list of tokens, or a list whose items are themselves
        token lists (one per document), as produced by ``lemmatizing``.

    Returns
    -------
    list
        For a flat list: the tokens that are not stop words.
        For nested input: one filtered token list per input list, in the same
        order, so the outer length is preserved.
    """
    print('Removing stop words...')
    en_stop = set(get_stop_words('en'))

    def _is_stop(token):
        # Only strings can be stop words; the isinstance guard also keeps
        # unhashable items away from the set lookup.
        return isinstance(token, str) and token in en_stop

    no_stop_words = []
    for item in list_:
        if isinstance(item, (list, tuple)):
            # A whole document: filter inside it. The original compared the
            # list itself against the stop words, which never matched, so
            # nested input came back unfiltered.
            no_stop_words.append([token for token in item if not _is_stop(token)])
        elif not _is_stop(item):
            no_stop_words.append(item)
    print('Stop Word Removal Complete.')
    return no_stop_words

In [160]:
def title_feature_extraction(df, series, lem_list, word_list):
    """Add title-based feature columns to ``df`` and return it.

    Columns added (``df`` is modified in place and also returned):

    * ``title_length`` - token count of each entry of ``lem_list`` after
      ``remove_stop_words``.
    * ``title_is_question`` - the text contains ``'?'``.
    * ``title_contains_number`` - the text contains an ASCII digit.
    * ``title_contains_celeb`` - the text contains one of the celebrity
      names below as a whole word, ignoring case.
    * ``title_contains_<word>`` - for each entry of ``word_list``, the
      lower-cased text contains that entry as a substring.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    series : str
        Name of the text column (e.g. ``'title'`` or ``'description'``).
    lem_list : list of list of str
        Token lists for the same rows, in the same order as ``df``.
    word_list : list of str
        Lower-case words to create ``title_contains_<word>`` columns for.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns.
    """
    print('Initializing title feature extraction...')
    print('Initializing word count for title length...')
    # need to tokenize and lemmatize to count the length
    stopped_titles = remove_stop_words(lem_list)
    # adding to dataframe
    df['title_length'] = [len(tokens) for tokens in stopped_titles]
    # First and last names of celebrities, lower case.
    # Spelling fixed from the original list: kardashian, scarlett, johnny, combs.
    celeb = ['beyonce','kim','kardashian','taylor','swift','justin','bieber','rihanna','scarlett','johansson','dwayne','johnson',
    'ellen','degeneres','katy','perry','angelina','jolie','drake','brad','pitt','jay','cristiano','ronaldo','jennifer',
    'aniston','oprah','winfrey','adele','johnny','depp','tom','cruise','jennifer','lopez','sean','combs','jennifer','lawrence',
    'leonardo','dicaprio','sandra','bullock','selena','gomez','tom','hanks','julia','roberts','howard','stern','donald',
    'trump','robert','downey','britney','spears','adam','sandler','megan','fox','kylie','jenner','miley','cyrus','jessica',
    'alba','emma','watson','eminem','paris','hilton','vin','diesel','kevin','hart','will','smith','chris','rock',
    'chris','hemsworth','chris','pratt','ben','affleck','matt','damon','denzel','washington']

    print('Extracting other attributes from titles...')
    # Missing text (NaN/None) becomes '' so the checks below yield False
    # instead of raising on a non-string value.
    titles = df[series].fillna('').astype(str)
    lowered_titles = [title.lower() for title in titles]

    # the following code is a bunch of different feature extractions for the titles
    df['title_is_question'] = ['?' in title for title in titles]
    ascii_digits = set('0123456789')
    df['title_contains_number'] = [any(char in ascii_digits for char in title) for title in titles]

    # Whole-word, case-insensitive match. The original did a case-sensitive
    # substring test against these lower-case names, so "Kim" never matched
    # while fragments such as "tom" in "bottom" could.
    # Common words in the list ("will", "rock", "fox") still match as words.
    celeb_pattern = r'\b(?:' + '|'.join(sorted(set(celeb))) + r')\b'
    df['title_contains_celeb'] = titles.str.contains(celeb_pattern, case=False, regex=True).tolist()

    for j in word_list:
        print(f'Creating column for {j}')
        # substring match on the lower-cased text ('new' also matches 'news')
        df["title_contains_{}".format(j)] = [j in title for title in lowered_titles]
    print('Title Feature Extraction Complete')
    return df

In [187]:
# function to identify different groupings of words which count as topics
def topic_modeling(df, lem_list, number_of_topics = 5, number_of_words = 30, number_of_passes = 3,
                   random_state = None):
    """Fit an LDA model on ``lem_list`` and add per-topic probabilities to ``df``.

    Prints the top words of every topic, then adds one ``Topic_<k>`` column
    per topic holding each document's probability for that topic. ``df`` is
    modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per document, in the same order as ``lem_list``.
    lem_list : list of list of str
        Tokenized documents.
    number_of_topics : int, default 5
        Number of LDA topics.
    number_of_words : int, default 30
        Number of words printed per topic.
    number_of_passes : int, default 3
        Training passes over the corpus.
    random_state : int or None, default None
        Passed to ``LdaModel`` so a run can be reproduced; ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``Topic_<k>`` columns added.
    """
    print("Initializing Topic Modeling...")
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(lem_list)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in lem_list]
    # generate LDA model
    print("Generating Model...")
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_of_topics, id2word = dictionary,
                                               passes=number_of_passes, random_state=random_state)
    topics = ldamodel.print_topics(num_topics=number_of_topics, num_words=number_of_words)
    print("Topics\n")
    for topic_id, topic_terms in topics:
        print(f"Topic {topic_id}: \n")
        print(topic_terms, "\n")

    # Infer each document's topic distribution exactly once.
    # The original indexed `ldamodel[corpus]` repeatedly, which re-ran
    # inference for every (document, topic) lookup. It also used the default
    # probability threshold (0.01), which drops low-probability topics and
    # turned the whole row into NaN whenever any topic was dropped.
    # minimum_probability=0.0 asks for every topic; .get(..., np.nan) below
    # covers any topic gensim still omits.
    doc_topic_probs = [
        {topic_id: float(prob)
         for topic_id, prob in ldamodel.get_document_topics(bow, minimum_probability=0.0)}
        for bow in corpus
    ]

    print("Adding topic probabilities to DataFrame...")
    for j in range(number_of_topics):
        print(f'Adding Topic {j}...')
        df["Topic_{}".format(j)] = [probs.get(j, np.nan) for probs in doc_topic_probs]
    print("Percetange Of Observations Missing Topic Values:")
    print(df['Topic_0'].isnull().sum()/df.shape[0]*100)
    return df

In [203]:
def early_pipeline(df_entry, medium_type, publication, list_of_words):
    """Run filtering, lemmatizing, feature extraction and topic modeling for one medium.

    Parameters
    ----------
    df_entry : pandas.DataFrame
        Cleaned master frame containing both pins and blog posts.
    medium_type : str
        ``'blog post'`` (text taken from ``title``) or ``'pin'`` (text taken
        from ``description``).
    publication : str or None
        Publication to keep when ``medium_type`` is ``'blog post'``. ``None``
        keeps every publication. Ignored for pins.
    list_of_words : list of str
        Words to create ``title_contains_<word>`` columns for.

    Returns
    -------
    pandas.DataFrame or None
        The feature-enriched frame. ``None`` (after printing a message) when
        ``medium_type`` is not supported or no rows match the request.
        Any other failure is raised rather than being reported as an
        invalid entry.
    """
    # Which column holds the text for each supported medium.
    text_column_by_medium = {'blog post': 'title', 'pin': 'description'}
    if medium_type not in text_column_by_medium:
        # Validate up front. The original relied on a bare `except:` that
        # printed this same message for every possible error.
        print("Invalid entries.\n Use either 'blog post' or 'pin'")
        return None
    text_column = text_column_by_medium[medium_type]

    df = create_medium_df(df_entry, medium_type)
    word_list = [word.lower() for word in list_of_words]

    # Only blog posts carry a `pub` column to filter on.
    if medium_type == 'blog post' and publication is not None:
        df = publication_df(df, publication)

    if df.empty:
        # Nothing to model; an empty corpus cannot be fitted.
        print("No rows found for medium '{}' and publication '{}'.".format(medium_type, publication))
        return None

    # Tokens with stop words kept feed the title features; tokens with stop
    # words removed feed the topic model.
    lemmatized_titles = lemmatizing(df, text_column, stop_words = False)
    new_titles = lemmatizing(df, text_column, stop_words = True)
    df = title_feature_extraction(df, text_column, lemmatized_titles, word_list)
    df = topic_modeling(df, lem_list = new_titles)
    return df

In [211]:
# Words to create `title_contains_<word>` feature columns for.
words = ['best','sex','now','new','episode','how']
# Run the pipeline on Glamour blog posts.
# The original called `early(...)`, a name that is not defined anywhere in
# this notebook; the pipeline function is `early_pipeline`.
# NOTE(review): this rebinds `df`, which earlier held the raw JSON dump, so
# the cleaning cell cannot be re-run afterwards without reloading the data.
df = early_pipeline(df_new, medium_type = 'blog post', publication = 'glamour', list_of_words = words)

Creating blog post DataFrame...
Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
10000
Lemmatizing Completed.
Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
10000
Lemmatizing Completed.
Initializing title feature extraction...
Initializing word count for title length...
Removing stop words...
Stop Word Removal Complete.
Extracting other attributes from titles...
Creating column for best
Creating column for sex
Creating column for now
Creating column for new
Creating column for episode
Creating column for how
Title Feature Extraction Complete
Initializing Topic Modeling...
Generating Model...
Topics

Topic 0: 

0.021*"best" + 0.017*"2016" + 0.013*"hair" + 0.009*"beauty" + 0.009*"idea" + 0.009*"makeup" + 0.008*"outfit" + 0.008*"2017" + 0.008*"new" + 0.007*"hadid" + 0.006*"celebrity" + 0.006*"red" + 0.006*"look" + 0.006*"trend" + 0.006*"can" + 0.005*"wear" + 0.005*"product" + 0.005*"will" + 0.004*"day" + 0.0

In [150]:
# Same feature words as the blog-post run.
words = ['best', 'sex', 'now', 'new', 'episode', 'how']

# Run the pipeline on pins; no publication is passed for this medium.
df_pin = early_pipeline(
    df_new,
    medium_type='pin',
    publication=None,
    list_of_words=words,
)

Creating pin DataFrame...
Initializing tokenizer and lemmatizer ...
Number of posts tokenized and lemmatized:
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
Lemmatizing Completed.
Initializing title feature extraction...
Initializing word count for title length of **         index   brand  engagement                            uniqueid  \
0            1  137299           0  MTM3Mjk5LTEzNzEzMDIxX3Bpbl8xNzQ4Mw   
1            5  137299           1  MTM3Mjk5LTEzNzEzMDIyX3Bpbl8xNzQ4Mw   
2            7  137299           0  MTM3Mjk5LTEzNzEzMDIzX3Bpbl8xNzQ4Mw   
3           11  137299           0  MTM3Mjk5LTEzNzEzMDI0X3Bpbl8xNzQ4Mw   
4           19  137326           4  MTM3MzI2LTEzNzEzNjQ3X3Bpbl8xNzQ4Mw   
5           21  137299           0  MTM3Mjk5LTEzNzEwMjMyX3Bpbl8xNzQ4Mw   
6           24  137299           0  MTM3Mjk5LTEzNzEwMjMzX3Bpbl8xNzQ4Mw   
7           29  137299    

Stop Word Removal Complete.
Extracting other attributes from titles...
Creating column for best
Creating column for sex
Creating column for now
Creating column for new
Creating column for episode
Creating column for how
Title Feature Extraction Complete
Initializing Topic Modeling...
Generating Model...
Topics

Topic 0: 

0.054*"a" + 0.051*"the" + 0.032*"in" + 0.027*"and" + 0.025*"of" + 0.023*"|" + 0.016*"archdigest.com" + 0.014*"by" + 0.012*"home" + 0.011*"with" + 0.010*"is" + 0.008*"room" + 0.007*"an" + 0.007*"on" + 0.006*"from" + 0.006*"kitchen" + 0.006*"this" + 0.006*"for" + 0.006*"new" + 0.006*"to" + 0.005*"at" + 0.005*"designer" + 0.004*"house" + 0.004*"design" + 0.003*"york" + 0.003*"are" + 0.003*"space" + 0.003*"master" + 0.003*"living" + 0.003*"feature" 

Topic 1: 

0.097*"see" + 0.071*"spring" + 0.068*"fall" + 0.066*"the" + 0.053*"-wmag" + 0.050*"collection" + 0.048*"from" + 0.028*"couture" + 0.016*"2017" + 0.014*"every" + 0.013*"show" + 0.011*"view" + 0.011*"complete" + 0.01

In [None]:
celeb = 
['beyonce','kim','karsashian','taylor','swift','justin','bieber','rihanna','scarlet','johansson','dwayne','johnson',
 'ellen','degeneres','katy','perry','angelina','jolie','drake','brad','pitt','jay','cristiano','ronaldo','jennifer',
 'aniston','oprah','winfrey','adele','jonny','depp','tom','cruise','jennifer','lopez','sean','colms','jennifer','lawrence',
 'leonardo','dicaprio','sandra','bullock','selena','gomez','tom','hanks','julia','roberts','howard','stern','donald',
 'trump','robert','downey','britney','spears','adam','sandler','megan','fox','kylie','jenner','miley','cyrus','jessica',
 'alba','emma','watson','eminem','paris','hilton','vin','diesel','kevin','hart','will','smith','chris','rock',
 'chris','hemsworth','chris','pratt','ben','affleck','matt','damon','denzel','washington']