In [1]:
import spacy
import pandas as pd 


#load spacy english medium model
nlp = spacy.load("en_core_web_md")
pd.set_option('display.max_columns', None)

In [2]:
 def get_ents(doc,id):
    # ents_list] =[]
    ents_df = pd.DataFrame()
    for ent in doc.ents:
        this_ent = pd.DataFrame({'text' : [ent.text], 'label' : [ent.label_], 'comment_id': [id]})
        # ents_list.append(this_ent)
        ents_df = ents_df.append(this_ent)
    return ents_df


In [3]:


def parse_comments(df):
    #create doc object
    all_ents = pd.DataFrame()
    for index,row in df.iterrows():
        doc = nlp(row['body'])
        comment_id = row['id']
        #get ents
        ents = get_ents(doc,comment_id)
        #add ids to df
        ents['post_id'] = row['submission_id']
        ents['subreddit']= row['subreddit']
        if  ents.empty ==False:
            all_ents = all_ents.append(ents)
    return all_ents

In [4]:
def parse_post_titles(df):
    #create doc object
    all_ents = pd.DataFrame()
    for index,row in df.iterrows():
        doc = nlp(row['title'])
        post_id = row['id']
        #get ents
        ents = get_ents(doc,post_id).rename(columns={'comment_id':'post_id'})
        #add ids to df
        ents['comment_id']= ''
        ents['subreddit']= row['subreddit']
        if  ents.empty ==False:
            all_ents = all_ents.append(ents)
    return all_ents

In [5]:
#read csvs

commentDF = pd.read_csv('../output/commentDF.csv')  

postsDF = pd.read_csv('../output/postsDF.csv')  



In [6]:
thisPost = postsDF.iloc[[0]]

thisPost

Unnamed: 0.1,Unnamed: 0,id,title,created_utc,score,subreddit,url,num_comments,selftext,stickied,spoiler,subreddit_subscribers,subreddit_type,subreddit_id,subreddit.1,total_awards_received,ups,downs,upvote_ratio,view_count,quarantine,removal_reason,removed_by_category,report_reasons,pinned,permalink,over_18,num_reports,num_duplicates,num_crossposts,num_comments.1,no_follow,media,media_embed,media_only,is_video,is_original_content,gilded,edited,category,banned_at_utc,archived
0,0,ftfwrt,Train to Busan Presents: Peninsula - 4 years a...,2020-04-02 04:31:09,166,trailers,https://www.youtube.com/watch?v=yVucSRLLeIM,11,,False,False,70132,public,t5_2qog5,trailers,0,166,0,0.99,,False,,,,False,/r/trailers/comments/ftfwrt/train_to_busan_pre...,False,,10,0,11,False,"{'type': 'youtube.com', 'oembed': {'provider_u...","{'content': '<iframe width=""600"" height=""338"" ...",False,False,False,0,False,,,False


In [7]:

postCommentsDF = commentDF[commentDF["submission_id"] == thisPost['id'].values[0]]



In [8]:
parse_post_titles(thisPost)

Unnamed: 0,text,label,post_id,comment_id,subreddit
0,South Korea,GPE,ftfwrt,,trailers
0,Jung-seok,PERSON,ftfwrt,,trailers
0,two,CARDINAL,ftfwrt,,trailers


In [9]:
thisPost['title'][0]

'Train to Busan Presents: Peninsula - 4 years after South Korea’s total decimation in Train to Busan, Jung-seok, a soldier who previously escaped the diseased wasteland, relives the horror when assigned to a covert operation with two simple objectives: retrieve and survive.'

In [10]:
test  = parse_comments(postCommentsDF)

In [11]:
test.head(100)

Unnamed: 0,text,label,comment_id,post_id,subreddit
0,the 1st,DATE,fm8a9yb,ftfwrt,trailers
0,L.A,GPE,fm92ox4,ftfwrt,trailers
0,Max,PERSON,fm92ox4,ftfwrt,trailers
0,busan,GPE,fm6x79r,ftfwrt,trailers
0,Busan,GPE,fm9snvi,ftfwrt,trailers
0,Korean,NORP,fm9snvi,ftfwrt,trailers
0,Netflix,PRODUCT,fm9snvi,ftfwrt,trailers
0,Busan,GPE,fm7jqrm,ftfwrt,trailers
0,Aliens,PERSON,fm7jqrm,ftfwrt,trailers


In [12]:
list(thisPost['title'])

['Train to Busan Presents: Peninsula - 4 years after South Korea’s total decimation in Train to Busan, Jung-seok, a soldier who previously escaped the diseased wasteland, relives the horror when assigned to a covert operation with two simple objectives: retrieve and survive.']

In [13]:
list(postCommentsDF[postCommentsDF['id']=='fm7jqrm']['body'])

['I have to admit, it is missing that confined, very personal feel that Train to Busan had. Then again, it could be the Aliens to what Alien was. We will just have to see if they expanded the world while properly executing a good story.']

In [14]:
#sentiment analysis 
#vader vs other:
# https://towardsdatascience.com/sentiment-analysis-beyond-words-6ca17a6c1b54

#spacy + vader 
# https://nlpforhackers.io/complete-guide-to-spacy/

In [15]:
import spacy
from spacy.tokens import Doc
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# nltk.download('vader_lexicon')

sentiment_analyzer = SentimentIntensityAnalyzer()

def polarity_scores(doc):
    return sentiment_analyzer.polarity_scores(doc.text)

Doc.set_extension('polarity_scores', getter=polarity_scores)






[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/devinsmacbookpro/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [52]:
text = postCommentsDF['body'][10]
# list(postCommentsDF[postCommentsDF['id']=='fm7jqrm']['body'])
text



'I have to admit, it is missing that confined, very personal feel that Train to Busan had. Then again, it could be the Aliens to what Alien was. We will just have to see if they expanded the world while properly executing a good story.'

In [50]:
sentiment_analyzer.polarity_scores(text)


{'neg': 0.047, 'neu': 0.853, 'pos': 0.1, 'compound': 0.3612}

In [None]:
# not tagging sentiment very well, will try stanford model in the link below
# https://towardsdatascience.com/sentiment-analysis-beyond-words-6ca17a6c1b54