# USC CKIDS Datafest 2022 
## Characterizing Online Attitudes, Expectations, and Concerns about Novel Medical Treatments

**Source of Data:**<br>
Manually searched Reddit for topics regarding male birth control and downloaded the matching post/thread submissions together with their comments. Also downloaded the user history of everyone who commented on them. <br>
**Data files:** <br>
submissions : 74 posts/threads (in .pkl files) <br>
users: 21627 user histories of those who commented on the submissions (in .pkl files) <br>
**Data exploration:** <br>
Histogram of users commenting in more than one Reddit submission. <br>
VADER sentiment analysis of the comments in each submission. <br>
Box plot and trend line of overall sentiment over time. 



In [None]:
import pickle
import os
import matplotlib.pyplot as plt
import numpy as np
from praw.models import MoreComments
from collections import defaultdict, Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import datetime as dt
import pandas as pd
import seaborn as sns
import datetime as dt
from matplotlib.backends.backend_pdf import PdfPages

# Load every pickled Reddit submission (.pkl file) found in the data folder.
# `directory` must end with a path separator because file names are appended to it as-is.
directory = '.../DATAFEST_2022/submissions/' # change directory
submissions = []
count = 0
for fname in os.listdir(directory):
    if not fname.endswith('.pkl'):
        continue  # ignore anything that is not a pickle file
    with open(directory + fname, 'rb') as handle:
        submissions.append(pickle.load(handle))
    count += 1
print('File count:', count)
    

In [None]:
# reddit data in PRAW: The Python Reddit API Wrapper
# https://praw.readthedocs.io/en/stable/index.html

class Submission:
    """Collect the comments of one Reddit submission and score their sentiment.

    Typical call order: ``comments()`` -> ``dataframe()`` ->
    ``VaderSentiment()`` -> ``list_str()`` / ``describe()``.
    """

    def __init__(self, sub):
        self.sub = sub       # the wrapped submission object
        self.posts = []      # one [author pair, creation time, text] entry per comment
        self.pos_str = []    # texts of comments scored >= 0.9
        self.neg_str = []    # texts of comments scored <= -0.9

    def comments(self):
        """Store every comment in ``self.posts`` and return the commenter names.

        Each stored entry is ``[['username:', author], creation datetime, body]``.

        Returns:
            set: ``str(comment.author)`` of every comment.
        """
        commentors = set()
        for entry in self.sub.comments.list():
            record = [
                ['username:', entry.author],
                dt.datetime.utcfromtimestamp(entry.created),
                entry.body,
            ]
            self.posts.append(record)
            commentors.add(str(entry.author))
        return commentors

    def dataframe(self):
        """Turn ``self.posts`` into the DataFrame ``self.dframe`` (one row per comment)."""
        self.dframe = pd.DataFrame(data=self.posts)

    def VaderSentiment(self):
        """Score each comment text (column 2) with VADER.

        Only the 'compound' value of the polarity scores is kept. The scores are
        stored in ``self.sentiment`` and also appended to ``self.dframe`` as its
        last column.

        Returns:
            The Series of compound scores.
        """
        self.analyzer = SentimentIntensityAnalyzer()

        def compound_score(text):
            return self.analyzer.polarity_scores(text)['compound']

        self.sentiment = self.dframe[2].apply(compound_score)
        score_frame = self.sentiment.apply(pd.Series)
        self.dframe = pd.concat([self.dframe, score_frame], axis=1)
        return self.sentiment

    def list_str(self):
        """Append strongly positive / negative comment texts to ``pos_str`` / ``neg_str``.

        The score is read from the last column of ``self.dframe`` and the text
        from column 2; the thresholds are >= 0.9 and <= -0.9.
        """
        for _, row in self.dframe.iterrows():
            score = row.iloc[-1]
            if score >= 0.9:
                self.pos_str.append(row[2])
            if score <= -0.9:
                self.neg_str.append(row[2])

    def describe(self):
        """Store ``self.dframe.describe()`` in ``self.average`` and return it."""
        self.average = self.dframe.describe()
        return self.average



In [None]:
# For every submission that has comments: collect the comments, score them
# with VADER and store the per-submission results in the containers below.

dates = list()      # not filled in this cell
all_sub = dict()    # submission title -> {'Title', 'Time', 'Vader', 'users'}
data_dict = dict()  # submission creation time -> Series of compound sentiment scores
pos_str = list()    # per submission: list of strongly positive comment texts
neg_str = list()    # per submission: list of strongly negative comment texts

# From the list of submissions, read each submission with no limit on comments.
for sub in submissions:
    sub.comments.replace_more(limit=None)

    if sub.num_comments <= 0:
        continue  # submissions without any comment are excluded

    date = dt.datetime.utcfromtimestamp(sub.created)  # creation time of the submission

    # Keep the helper in its own variable. Assigning it to `sub.id` would
    # overwrite the submission's own `id` attribute.
    analysis = Submission(sub)

    user = list(analysis.comments())  # usernames that commented on this submission
    analysis.dataframe()
    vader = analysis.VaderSentiment()

    # Sentiment scores of this submission, keyed by its creation time.
    data_dict[date] = vader

    # Collect the texts of the strongly positive / negative comments.
    analysis.list_str()
    pos_str.append(analysis.pos_str)
    neg_str.append(analysis.neg_str)

    all_sub[sub.title] = {
        'Title': sub.title,             # submission title
        'Time': date,                   # time of creation of the submission
        'Vader': analysis.describe(),   # summary statistics of the comment table
        'users': user,                  # usernames of the commenters
    }



In [None]:
# Count, for every username, in how many of the submissions it commented.
user = dict()
for info in all_sub.values():
    for name in info['users']:
        if name == 'None':
            continue  # skip authors recorded as 'None' (deleted or missing usernames)
        user[name] = user.get(name, 0) + 1
#print (len (user))

# Usernames grouped by the exact number of submissions they appear in (2 to 6).
user2 = [name for name, hits in user.items() if hits == 2]
user3 = [name for name, hits in user.items() if hits == 3]
user4 = [name for name, hits in user.items() if hits == 4]
user5 = [name for name, hits in user.items() if hits == 5]
user6 = [name for name, hits in user.items() if hits == 6]
# Every remaining username (any count other than 2-6) is tallied here.
one = sum(1 for hits in user.values() if hits not in (2, 3, 4, 5, 6))

print(len(user6), len(user5), len(user4), len(user3), len(user2))
print(one + len(user6) + len(user5) + len(user4) + len(user3) + len(user2))

# https://stackoverflow.com/questions/39841733/matplotlib-histogram-how-to-display-the-count-over-the-bar
# https://python-course.eu/numerical-programming/histograms-with-matplotlib.php

freq_lst = list(user.values())

# Histogram of the per-user submission counts, with the bar heights printed on top.
sns.set(rc={'figure.figsize': (20, 10), 'font.size': 20})
n, bins, patches = plt.hist(
    x=freq_lst,
    edgecolor='black',
    bins=[1, 2, 3, 4, 5, 6, 7],
    align='left',
)
plt.bar_label(patches, fontsize=15)

plt.xticks(bins[:6], fontsize='small')

plt.grid(axis='x')
plt.title('Frequency of Users commenting in multiple Reddit submissions', fontsize='large')
plt.ylabel('Number of Users', fontsize='medium')
plt.xlabel('Number of Submissions', fontsize='medium')
plt.yscale('log')

#plt.savefig ("users_subsv2.png")
plt.show()

# Number of users per submission count, printed as a plain dict.
counter = dict(Counter(freq_lst))
print(counter)

In [None]:
# data_dict maps each submission's creation date and time to its sentiment scores.
# Put the entries in chronological order before building the table.

order = sorted(data_dict)  # creation times, oldest first
#print (order)

sort_data = {created: data_dict[created] for created in order}

# One column of sentiment scores per submission, columns in chronological order.
sort_df = pd.DataFrame(sort_data)
#sort_df

In [None]:
# Table with the creation time and title of every submission (rows ordered by title).
ddd = [[info['Time'], title] for title, info in sorted(all_sub.items())]

dd = pd.DataFrame(ddd)
display(dd)

In [None]:
# Summary statistics of the sentiment scores, one column per submission.
des = sort_df.describe()
display(des)
#des.to_csv("sentiment_mean.csv")

# Mean sentiment of every submission, in chronological order.
avg = [des[created]['mean'] for created in order]

In [None]:
# Box plot of the sentiment scores of each submission in chronological order,
# overlaid with the individual scores, the per-submission means and a linear trend line.

sns.set_style ("ticks")
# NOTE(review): sns.set() applies its own default style, so it may override the
# "ticks" style chosen on the previous line -- confirm the intended look.
sns.set(rc={'figure.figsize': (35,20), 'xtick.bottom': True})
ax = sns.stripplot (order = order, jitter = True, size = 5, alpha = 0.5, linewidth =1, data = sort_df)
ax = sns.boxplot(order=order, showfliers=True, linewidth=0.8, showmeans=True, data=sort_df)
ax = sns.pointplot(order=order, data=sort_df, ci=None, color='black')

plt.xlabel("Date & Time", fontsize = 30)
plt.ylabel("Sentiment Score", fontsize = 30)
labels = ax.axes.get_xticklabels()
ax = ax.axes.set_xticklabels(labels, rotation=45)

# Trend line: straight-line (degree 1) fit through the per-submission means.
# The x positions are derived from the number of means instead of a hard-coded
# 73, so the fit keeps working when the number of submissions changes.
xs = range(len(avg))
z = np.polyfit(xs,avg,1)
p = np.poly1d(z)
ax = plt.plot(xs,p(xs),color = 'Red', lw=3)

plt.title("Sentiment Analysis over time", fontsize = 50, fontweight='bold', pad=30)

#plt.savefig('sent_box.png')

In [None]:
#plt.plot(xs,p(xs),"r--")

# Stand-alone trend line through the per-submission means, with the submissions
# numbered from 1. The range follows len(avg) instead of a hard-coded 74, so
# the fit keeps working when the number of submissions changes.
xs = range(1, len(avg) + 1)
z = np.polyfit(xs,avg,1)
p = np.poly1d(z)
ax = plt.plot(xs,p(xs),color = 'Red', lw=3)



In [None]:
# print (type(pos_str), pos_str)

In [None]:
# using the list of positive and negative sentiment string, remove stop words and create word cloud

import nltk
import re
from wordcloud import WordCloud

stopwords = nltk.corpus.stopwords.words("english")
exc = ['tl', 'dr']  # presumably the two halves of "tl;dr" left after splitting on non-letters
stopwords.extend(exc)


def _kept_words(comment_groups, excluded):
    """Return the lower-cased, non-excluded words of all comments as one string.

    Args:
        comment_groups: iterable of lists of comment texts.
        excluded: collection of lower-case words to drop.

    Returns:
        str: every kept word preceded by a single space, in order of
        appearance; the empty string when no word is kept.
    """
    excluded = set(excluded)  # constant-time membership test per word
    kept = []
    for comments in comment_groups:
        for text in comments:
            # Words are maximal runs of ASCII letters.
            for word in re.findall('[a-zA-Z]+', text):
                word = word.lower()
                if word not in excluded:
                    kept.append(" " + word)
    # Join once at the end instead of growing a string inside the loops.
    return "".join(kept)


pos_words = _kept_words(pos_str, stopwords)

# print (pos_words)

neg_words = _kept_words(neg_str, stopwords)

# print (neg_words)


def plot_wordcloud(series,output_filename, bg, cm, title, c):
    """Draw a word cloud for a text and save the figure as a PNG file.

    Args:
        series: text handed to ``WordCloud.generate``.
        output_filename: path of the saved figure, without the '.png' suffix.
        bg: background colour of the word cloud.
        cm: colormap used by the word cloud.
        title: figure title.
        c: colour of the title text.
    """
    from wordcloud import WordCloud

    generator = WordCloud(
        width=800,
        height=400,
        background_color=bg,
        colormap=cm,
    )
    cloud_image = generator.generate(series)

    plt.figure(figsize=(20, 10))
    plt.axis("off")
    plt.title(title, fontsize=50, color=c, fontweight='bold', pad=30)
    plt.imshow(cloud_image, interpolation='bilinear')
    #wordcloud.to_file(output_filename + '.png')
    target_path = output_filename + '.png'
    plt.savefig(target_path)


In [None]:
# Word cloud of the strongly positive comments, saved as 'positive.png'.
plot_wordcloud(
    series=pos_words,
    output_filename='positive',
    bg='white',
    cm='magma',
    title='Positive Sentiment Word Cloud',
    c='r',
)

In [None]:
# Word cloud of the strongly negative comments, saved as 'negative.png'.
plot_wordcloud(
    series=neg_words,
    output_filename='negative',
    bg='white',
    cm='winter',
    title='Negative Sentiment Word Cloud',
    c='b',
)