In [32]:
# This notebook was used to conduct preliminary exploration of data
# As such it is not a part of the main workflow, but used just for additional insights

# Import the required libraries
import pandas as pd
import time
from datetime import timedelta, datetime
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import CountVectorizer
import os

new_working_directory = "/Users/fatimaq/Documents/Qualitative_reddit_analysis/Output"
os.chdir(new_working_directory)



# Create SQL engine; choose the correct path where reddit.db database file is located
engine = create_engine(
    f"sqlite:////Users/fatimaq/Documents/Qualitative_reddit_analysis/Output/testredditpost.sqlite")


In [33]:
# Select all the data from Table A in the database
posts = pd.read_sql("""select * from my_table order by createdAt""",engine)
posts.head()

Unnamed: 0,body,id,communityName,username,createdAt
0,I'm risking my pride by posting for advice to ...,t3_9rq4m,r/needadvice,gijyun,2009-10-07 15:25:04
1,"It's great that you were exercising, but i...",t1_c0e4m6u,r/needadvice,steelproboscis,2009-10-07 17:03:09
2,It is physically impossible to gain weight...,t1_c0efbdb,r/needadvice,aurisor,2009-10-14 15:36:41
3,Thanks - I'm pasting a reply I put in a di...,t1_c0efio8,r/needadvice,gijyun,2009-10-14 17:35:04
4,Why in the world would you goto a doctor f...,t1_c0ehv4e,r/needadvice,[deleted],2009-10-15 22:43:02


In [34]:
# read the total length of posts
len(posts)

39551

In [35]:
posts.tail()

Unnamed: 0,body,id,communityName,username,createdAt
39546,umm pretty sure that is illegal to throw a...,t1_jsyevaa,r/OzempicForWeightLoss,Nofeardiver,2023-07-22 03:26:12
39547,You still could take it since you aren't a...,t1_jsyfbt3,r/WegovyWeightLoss,alwayscurious4life,2023-07-22 03:30:33
39548,Does anybody worry about peptides from Chi...,t1_jsyg80g,r/TirzepatideSource,Plus_Twist9222,2023-07-22 03:39:10
39549,I got your response and then I don’t know ...,t1_jsyj7zv,r/Semaglutide,EatBlueberries,2023-07-22 04:08:24
39550,TDEE is way off of you are over 300lbs… wh...,t1_jsyknig,r/WegovyWeightLoss,girlandy1,2023-07-22 04:22:48


In [50]:
# Convert the Unix epoch time to human readable dates and set index
# Convert 'createdAt' column to datetime objects
posts['createdAt'] = pd.to_datetime(posts['createdAt'])

# Convert datetime objects to Unix epoch timestamps
posts['createdAt'] = posts['createdAt'].apply(lambda x: int(x.timestamp()))

posts['date'] = posts['createdAt'] .map(lambda x: datetime.utcfromtimestamp(x).strftime('%Y-%m-%d'))
posts.set_index('date', inplace=True)
posts.head()

Unnamed: 0_level_0,body,id,communityName,username,createdAt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-07-22,TDEE is way off of you are over 300lbs… wh...,t1_jsyknig,r/WegovyWeightLoss,girlandy1,1689999768
2023-07-22,I got your response and then I don’t know ...,t1_jsyj7zv,r/Semaglutide,EatBlueberries,1689998904
2023-07-22,Does anybody worry about peptides from Chi...,t1_jsyg80g,r/TirzepatideSource,Plus_Twist9222,1689997150
2023-07-22,You still could take it since you aren't a...,t1_jsyfbt3,r/WegovyWeightLoss,alwayscurious4life,1689996633
2023-07-22,umm pretty sure that is illegal to throw a...,t1_jsyevaa,r/OzempicForWeightLoss,Nofeardiver,1689996372


In [51]:
# Sort from latest to oldest
posts.sort_index(inplace=True,ascending=False)
posts.head()

Unnamed: 0_level_0,body,id,communityName,username,createdAt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-07-22,TDEE is way off of you are over 300lbs… wh...,t1_jsyknig,r/WegovyWeightLoss,girlandy1,1689999768
2023-07-22,I got your response and then I don’t know ...,t1_jsyj7zv,r/Semaglutide,EatBlueberries,1689998904
2023-07-22,Does anybody worry about peptides from Chi...,t1_jsyg80g,r/TirzepatideSource,Plus_Twist9222,1689997150
2023-07-22,You still could take it since you aren't a...,t1_jsyfbt3,r/WegovyWeightLoss,alwayscurious4life,1689996633
2023-07-22,umm pretty sure that is illegal to throw a...,t1_jsyevaa,r/OzempicForWeightLoss,Nofeardiver,1689996372


In [52]:
# Check the number of unique users and reddit posts per subreddit/community 
users = posts[['communityName','username','body']].groupby(['communityName'],as_index=False).nunique()
users.sort_values(['body'],inplace=True, ascending=False)
users.reset_index(inplace=True, drop=True)
users

Unnamed: 0,communityName,username,body
0,r/Mounjaro,1933,5271
1,r/Semaglutide,1878,3829
2,r/Tirzepatide,866,2754
3,r/liraglutide,931,2234
4,r/Ozempic,972,1959
...,...,...,...
326,r/CompundedSemaglutide,1,1
327,r/clinicalresearch,1,1
328,r/brisbane,1,1
329,r/biotech_stocks,1,1


In [53]:
# Save the data to local disk
users.to_excel('UsersCount.xlsx')


In [54]:
# Check the daily distribution of number of posts published by all users in all the selected subreddits over time
distribution = posts[['body']].groupby(posts.index).nunique()
distribution.head()

Unnamed: 0_level_0,body
date,Unnamed: 1_level_1
2009-10-07,2
2009-10-14,2
2009-10-15,1
2009-10-16,1
2009-10-28,1


In [55]:
distribution.tail()

Unnamed: 0_level_0,body
date,Unnamed: 1_level_1
2023-07-18,340
2023-07-19,188
2023-07-20,306
2023-07-21,292
2023-07-22,28


In [41]:
# Save the time-series on local disk for further processing
distribution.to_csv('postsTS.csv')

In [42]:
# Initiate the CounteVectorizer and count the number of Keywords; mainly unigrams, bigrams and trigrams
# Rerun the cell by changing the ngram_range parameter for bigrams and trigrams once it is done running for unigrams

start = time.monotonic()

cv = CountVectorizer(ngram_range=(1,1),stop_words='english')  #Also use; ngram_range=(2,2) & (3,3) for bigrams & trigrams

cvFit = cv.fit_transform([str(posts['body'].values.tolist())])

wordList = cv.get_feature_names_out()
countList = cvFit.toarray().sum(axis=0)

dictionary = dict(zip(wordList,countList))
dictionary = dict(sorted(dictionary.items(), key = lambda x: x[1], reverse=True))

cvDf = pd.DataFrame(list(dictionary.items()), columns=['keywords','count'])

end = time.monotonic()

print(f"Total time taken: {timedelta(seconds=(end-start))}")

cvDf.head()

Total time taken: 0:00:00.770964


Unnamed: 0,keywords,count
0,weight,14503
1,just,12079
2,like,9603
3,people,9328
4,don,8014


In [43]:
cvDf.tail()

Unnamed: 0,keywords,count
41782,životni,1
41783,životnih,1
41784,سعيد,1
41785,不带盐,1
41786,醋酸替度鲁肽,1


In [44]:
# Finally save the "keywordsBigrams100.csv" and "keywordsTrigrams100.csv" files by repeating the same previous step
# Keywords with greater than or equal to 100 mentions only
cvDf[cvDf['count'] >= 100].to_csv("keywordsUnigrams100.csv")

In [45]:
# Count the number of unique Reddit users(ie authors)
posts['username'].nunique()

14595