# Project 3: Reddit NLP - Sentiment Analysis

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import warnings
# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)
# I really like this color, so it will be used for everything
# (hex color string; not referenced by any of the cells visible in this notebook)
DO = '#7D1B7E'

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'fivethirtyeight' style sheet to subsequent plots.
plt.style.use('fivethirtyeight')

Since the logistic regression classifier returned the best metrics and its top features are interpretable, I am going to investigate the sentiments associated with its top features within the context of the post. I will utilize VADER's sentiment analysis to collect the average positive or negative sentiment metric associated with each feature word.

Loading the posts dataframe and the logistic regression top-word table (`log_reg_beta.pkl`):

In [2]:
# Shell escape: list the contents of the notebook's working directory.
! ls

01_Data_Collection.ipynb         03B_Modeling_Naive_Bayes.ipynb
02_EDA.ipynb                     03C_Modeling_Random_Forest.ipynb
03A_Modeling_Log_Reg.ipynb       04_Sentiment_Analysis.ipynb


In [3]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code, so only load files you trust.
df = _load_pickle('../assets/df.pkl')
log_beta = _load_pickle('../assets/log_reg_beta.pkl')

In [4]:
# Keep only the post text ('selftext') and the 'dating' label column, then preview.
df = df.loc[:, ['selftext', 'dating']]
df.head()

Unnamed: 0,selftext,dating
0,The beginning We have been in a monogamous r...,0
1,We have been together months It was a norma...,0
2,Im a year old college student never been o...,1
3,Gf and I both been dating for months and...,0
4,Ok some background Bf and I have been togeth...,0


The following dataframe contains the top words in determining the likelihood of a post being from the dating subreddit:

In [5]:
# Double brackets keep log_beta a DataFrame holding only the 'log_odds' column.
log_beta = log_beta[['log_odds']]

In [6]:
# First 20 rows of log_beta (the same rows .head(20) returns), displayed below.
top_date = log_beta.iloc[:20]
top_date

Unnamed: 0_level_0,log_odds
top_words,Unnamed: 1_level_1
date,1.519875
dating,1.514506
dates,1.491353
christmas,1.347943
women,1.30537
met,1.273777
place,1.26236
guy,1.252815
interested,1.231349
guys,1.228484


The following dataframe contains the top words in determining the likelihood of a post being from the relationship advice subreddit:

In [7]:
# Last 20 rows of log_beta, reordered by ascending 'log_odds', displayed below.
top_relationship = (
    log_beta
    .iloc[-20:]
    .sort_values(by='log_odds', ascending=True)
)
top_relationship

Unnamed: 0_level_0,log_odds
top_words,Unnamed: 1_level_1
my boyfriend,0.703194
my girlfriend,0.796085
started dating,0.807725
sex,0.827784
argument,0.828984
girlfriend,0.829486
cheating,0.829693
husband,0.83296
my bf,0.83662
my wife,0.841608


I am going to instantiate the Sentiment Intensity Analyzer and it will return 4 sentiment metrics when it is applied to each post:

`compound` - a normalized overall score ranging from -1 (most extreme negative) to +1 (most extreme positive)


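`positive` (`pos`) - the proportion of the post's text rated positive (between 0 and 1)


`neutral` (`neu`) - the proportion of the post's text rated neutral (between 0 and 1)


`negative` (`neg`) - the proportion of the post's text rated negative (between 0 and 1); the three proportions sum to 1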

In [8]:
# Notebook-level VADER analyzer (sentiment_analyzer_steroids creates its own instance).
sia = SentimentIntensityAnalyzer()

In [9]:
# Scratch check: flags which column names contain the letter 't'; the result is not stored.
df.columns.str.contains('t')

array([ True,  True])

 I WANT HIGHEST POS 
 I WANT LOWEST NEG
 I WANT TOTAL COUNT OF POSTS
 I WANT AVERAGE METRICS
 
 

In [10]:
# Scratch check of np.mean on a small list; the result is not stored.
np.mean([1,2,3])

2.0

In [11]:
# Scratch cell: list.append mutates the list in place.
# `z` is not used by the analysis cells visible in this notebook.
z = [1,2,3]
z.append(5)

In [12]:
# Display z to confirm the appended value is present.
z

[1, 2, 3, 5]

In [13]:
def sentiment_analyzer_steroids(subreddit=None, word=None):
    """Print average VADER sentiment for one subreddit's posts that contain a word.

    Parameters
    ----------
    subreddit : str, optional
        Either 'dating' or 'relationship_advice'. When omitted, the user is
        prompted for it, which is the original interactive behaviour.
    word : str, optional
        Word or phrase to search for. When omitted, the subreddit's top words
        are printed and the user is prompted for one.

    Returns
    -------
    None
        All results are reported through ``print``.

    Raises
    ------
    ValueError
        If ``subreddit`` is not one of the two supported names, or if ``word``
        is empty (an empty string would match every post).

    Notes
    -----
    Reads the notebook-level ``df``, ``top_date`` and ``top_relationship``.
    Matching uses ``word in post``, a case-sensitive substring test, so
    'date' also matches 'dates' and 'update'.
    """
    # Value of df['dating'] that identifies each subreddit's posts.
    subreddit_labels = {'dating': 1, 'relationship_advice': 0}

    if subreddit is None:
        print('Which subreddit do you want?')
        subreddit = input("You can choose either 'dating' or 'relationship_advice': ").strip()

    # Fail with a clear message instead of a NameError further down.
    if subreddit not in subreddit_labels:
        raise ValueError(
            f"Unknown subreddit {subreddit!r}; choose either 'dating' or 'relationship_advice'"
        )

    if word is None:
        top_words = top_date if subreddit == 'dating' else top_relationship
        print(f"\nThe top {subreddit} words are:")
        print(list(top_words.index))
        word = input("\nWhich word do you want to perform sentiment analysis on? ")

    if not word:
        raise ValueError("word must be a non-empty string")

    # Bug fix: the relationship_advice branch used to iterate df['dating'] == 1,
    # so it scored dating posts while dividing by the relationship_advice count.
    posts = df.loc[df['dating'] == subreddit_labels[subreddit], 'selftext'].values

    sia = SentimentIntensityAnalyzer()

    # Score each matching post once; the result holds all four metrics.
    scores = [sia.polarity_scores(post) for post in posts if word in post]
    post_counter = len(scores)

    # Fraction (0-1) of the subreddit's posts that contain the word.
    percentage = round(post_counter / len(posts), 2) if len(posts) else 0.0

    print(f"\n{post_counter} posts in the {subreddit} subreddit contain the word '{word}'")
    print(f"\n{percentage} of the posts in the {subreddit} subreddit contain the word '{word}'")

    # Averaging empty lists would yield nan plus a RuntimeWarning.
    if not scores:
        print(f"\nNo posts contain the word '{word}', so there are no sentiment metrics to average")
        return

    metric_keys = (
        ('Average Compound', 'compound'),
        ('Average Positive', 'pos'),
        ('Average Neutral', 'neu'),
        ('Average Negative', 'neg'),
    )
    # float() keeps the printed dict free of numpy scalar reprs.
    sentiment_dict = {
        label: round(float(np.mean([score[key] for score in scores])), 2)
        for label, key in metric_keys
    }

    print(f'\nThe sentiment metrics for the word {word}:\n{sentiment_dict}')
                

In [14]:
# Interactive: prompts for a subreddit and a word via input(), then prints the metrics.
sentiment_analyzer_steroids()

Which subreddit do you want?
You can choose either 'dating' or 'relationship_advice': dating

The top dating words are:
['date', 'dating', 'dates', 'christmas', 'women', 'met', 'place', 'guy', 'interested', 'guys', 'shy', 'apps', 'girl', 'tinder', 'bumble', 'emotionally', 'looking', 'single', 'seemed', 'eventually']

Which word do you want to perform sentiment analysis on? shy

399 posts in the dating subreddit contain the word 'shy'

0.05 of the posts in the dating subreddit contain the word 'shy'

The sentiment metrics for the word shy:
{'Average Compound': 0.59, 'Average Positive': 0.15, 'Average Neutral': 0.76, 'Average Negative': 0.09}
