In [1]:
import spacy
# import sentiment analysis 
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd 

#load vader sentiment

# nltk.download('vader_lexicon') #download if not installed
sentiment_analyzer = SentimentIntensityAnalyzer()

pd.set_option('display.max_columns', None)


In [34]:
 #gets entities
 def get_ents(doc,id):
    # ents_list] =[]
    ents_df = pd.DataFrame()
    for ent in doc.ents:
        this_ent = pd.DataFrame({'text' : [ent.text], 'label' : [ent.label_], 'comment_id': [id]})
        # ents_list.append(this_ent)
        ents_df = ents_df.append(this_ent)
    return ents_df


In [50]:

# gets entities for comments
def parse_comments(df):
    #create doc object
    all_ents = pd.DataFrame()
    for index,row in df.iterrows():
        doc = nlp(row['body'])
        comment_id = row['id']
        #get ents
        ents = get_ents(doc,comment_id)
        #add ids to df
        ents['post_id'] = row['submission_id']
        ents['subreddit']= row['subreddit']
        ents['is_post'] = False

        if  ents.empty ==False:
            all_ents = all_ents.append(ents)
    return all_ents

In [51]:
#gets entities for post titles
def parse_post_titles(df):
    #create doc object
    all_ents = pd.DataFrame()
    for index,row in df.iterrows():
        doc = nlp(row['title'])
        post_id = row['id']
        #get ents
        ents = get_ents(doc,post_id).rename(columns={'comment_id':'post_id'})
        #add ids to df
        ents['comment_id']= ''
        ents['subreddit']= row['subreddit']
        ents['is_post'] = True

        if  ents.empty ==False:
            all_ents = all_ents.append(ents)
    return all_ents

In [52]:
def parse_sentiment(df,type):
    #create doc object
    output = pd.DataFrame()

    for index,row in df.iterrows():
        #add ids to df
        if type == 'post':
            post_id = row['id']
            comment_id = ''
            doc = nlp(row['title'])
            is_post = True
        if type == 'comment':
            post_id = row['submission_id']
            comment_id = row['id']
            doc = nlp(row['body'])
            is_post = False
        
        this_output = pd.DataFrame(doc._.sentiment_score,index=[0])
        this_output['post_id'] = post_id
        this_output['comment_id'] = comment_id
        this_output['subreddit'] = row['subreddit']
        this_output['is_post'] = is_post


        if  this_output.empty == False:
            output = output.append(this_output)
    
    output.reset_index()
    return output



In [38]:
#define sentiment score for nlp pipeline
def sentiment_score(doc):
    def get_sentiment(doc):
        return sentiment_analyzer.polarity_scores(doc.text)

    spacy.tokens.Doc.set_extension('sentiment_score', getter=get_sentiment,force=True)
    return doc 


In [39]:

#load spacy english medium model
nlp = spacy.load("en_core_web_md")
nlp.add_pipe(nlp.create_pipe("merge_entities")) #merges entities so james brown = "james brown" and not "james" "brown"
nlp.add_pipe(sentiment_score, name="sentiment_score", last=True)

In [40]:
#read csvs

commentDF = pd.read_csv('../output/commentDF.csv')  

postsDF = pd.read_csv('../output/postsDF.csv')  



In [68]:
thisPost = postsDF.iloc[[1]]

thisPost

Unnamed: 0.1,Unnamed: 0,id,title,created_utc,score,subreddit,url,num_comments,selftext,stickied,spoiler,subreddit_subscribers,subreddit_type,subreddit_id,subreddit.1,total_awards_received,ups,downs,upvote_ratio,view_count,quarantine,removal_reason,removed_by_category,report_reasons,pinned,permalink,over_18,num_reports,num_duplicates,num_crossposts,num_comments.1,no_follow,media,media_embed,media_only,is_video,is_original_content,gilded,edited,category,banned_at_utc,archived
1,1,fww19w,Extraction | Official Trailer | Netflix - Chri...,2020-04-08 00:18:50,151,trailers,https://www.youtube.com/watch?time_continue=17...,21,,False,False,70132,public,t5_2qog5,trailers,0,151,0,0.96,,False,,,,False,/r/trailers/comments/fww19w/extraction_officia...,False,,0,0,21,False,{'oembed': {'provider_url': 'https://www.youtu...,"{'content': '<iframe width=""600"" height=""338"" ...",False,False,False,0,False,,,False


In [69]:

postCommentsDF = commentDF[commentDF["submission_id"] == thisPost['id'].values[0]]



In [70]:
post_ents_df = parse_post_titles(thisPost)
comment_ents_df = parse_comments(postCommentsDF)

entity_df = post_ents_df.append(comment_ents_df)
entity_df

Unnamed: 0,text,label,comment_id,post_id,subreddit,is_post
0,1 minute,TIME,fmqwx5o,fww19w,trailers,False
0,Netflix,ORG,fmqt2pp,fww19w,trailers,False
0,x200B,MONEY,fmqt2pp,fww19w,trailers,False
0,the Ryan Reynolds Fast,ORG,fmqt2pp,fww19w,trailers,False
0,60 seconds,TIME,fmqt2pp,fww19w,trailers,False
0,Hemsworth,PERSON,fn5yys1,fww19w,trailers,False
0,One,CARDINAL,fn5yys1,fww19w,trailers,False
0,23-25,DATE,fn5yys1,fww19w,trailers,False
0,sec,ORG,fn5yys1,fww19w,trailers,False
0,three,CARDINAL,fn5yys1,fww19w,trailers,False


In [71]:
#get sentiment 
post_sentiment_df = parse_sentiment(thisPost,'post')
comment_sentiment_df = parse_sentiment(postCommentsDF,'comment')
#apppend sentiment
sentiment_df = post_sentiment_df.append(comment_sentiment_df)
sentiment_df

Unnamed: 0,neg,neu,pos,compound,post_id,comment_id,subreddit,is_post
0,0.0,1.0,0.0,0.0,fww19w,,trailers,True
0,0.11,0.698,0.192,0.34,fww19w,fmqwx5o,trailers,False
0,0.0,0.568,0.432,0.7264,fww19w,fmrakzg,trailers,False
0,0.036,0.808,0.156,0.8343,fww19w,fmqt2pp,trailers,False
0,0.123,0.795,0.083,-0.296,fww19w,fmrkab4,trailers,False
0,0.0,1.0,0.0,0.0,fww19w,fms0ayc,trailers,False
0,0.0,1.0,0.0,0.0,fww19w,fmtvqly,trailers,False
0,0.0,1.0,0.0,0.0,fww19w,fmtqxp5,trailers,False
0,0.156,0.768,0.076,-0.6922,fww19w,fn5yys1,trailers,False
0,0.0,1.0,0.0,0.0,fww19w,fncwv4k,trailers,False


#Checking Examples:


In [72]:
list(thisPost['title'])

['Extraction | Official Trailer | Netflix - Chris Hemsworth & David Harbour']

In [74]:
comment_id = 'fmrkab4'
list(postCommentsDF[postCommentsDF['id']==comment_id]['body'])

['Is there anyone who hate the trailers that spoil the whole movie ? Or it’s just me. I feel like watch the movie when the trailer is over.']

In [75]:
doc = nlp(postCommentsDF[postCommentsDF['id']==comment_id]['body'].item())

In [76]:
print(doc.ents)
print(doc._.sentiment_score)

()
{'neg': 0.123, 'neu': 0.795, 'pos': 0.083, 'compound': -0.296}


In [14]:
#sentiment analysis 
#vader vs other:
# https://towardsdatascience.com/sentiment-analysis-beyond-words-6ca17a6c1b54

#spacy + vader 
# https://nlpforhackers.io/complete-guide-to-spacy/