In [8]:
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import json
from dotenv import load_dotenv

import pandas as pd
from sqlalchemy import create_engine

#### Check if data files have been extracted

In theory, this notebook should be run after `EDA.ipynb` where we perform our intial data exploratory.
In that notebook, we unzip the compressed data file into two `.ndjson` for submission and comment data.

In [51]:
comments_data_path = os.path.join("..", "data", "extracted_data", "comments_data.ndjson")
submissions_data_path = os.path.join("..", "data", "extracted_data", "submissions_data.ndjson")

assert os.path.isfile(comments_data_path) is True
assert os.path.isfile(submissions_data_path) is True

In [23]:
def upload_to_db(db_engine, data_file: str, table_name: str, selected_columns: list[str]):
    """
    Upload 
    """

    assert os.path.isfile(data_file) is True
    assert data_file.endswith('.ndjson') is True

    with open(data_file, 'r') as f:
        done = False
        iter = 1
        while not done:
            jsons = []
            # 1000 lines at a time
            for i in range(1000):
                line = f.readline().strip()

                # end of line is reached
                if not line:
                    done = True
                    break

                json_line = json.loads(line)
                jsons.append(json_line)
            
            if done:
                break

            df = pd.DataFrame(jsons)
            # only select relevant columns
            for col in selected_columns:
                # NOTE: some json entries may not contain the selected column,  
                # we will that as NA
                if col not in df.columns:
                    df[col] = pd.NA

            df = df[selected_columns]
            df.to_sql(table_name, db_engine, if_exists='append', index=False)
            print(f'iteration #{iter}: data written to {table_name} successfully.')
            iter += 1

#### Create database engine

In [19]:
dotenv_path = '../.env'
load_dotenv(dotenv_path)

def get_db_url(db_name):
    db_str = 'postgresql+psycopg2://{}:{}@{}:{}/{}'
    return db_str.format(
        os.getenv('DB_USERNAME'),
        os.getenv('DB_PASSWORD'),
        os.getenv('DB_HOST'),
        os.getenv('DB_PORT'),
        db_name
    )

db_url = get_db_url('reddit')
print(db_url)
db_engine = create_engine(db_url)

# TODO: ensure db_engine is successfully created

postgresql+psycopg2://user:youshallnotpass@bachtran.dev:5432/reddit


In [55]:
!psql postgresql://user:youshallnotpass@bachtran.dev:5432 \d+

psql: error: connection to server at "bachtran.dev" (38.75.137.169), port 5432 failed: FATAL:  database "user" does not exist


In [53]:
data_file = comments_data_path
selected_columns = relevant_comment_columns
with open(data_file, 'r') as f:
        done = False
        iter = 1
        jsons = []
        # 1000 lines at a time
        for i in range(1000):
            line = f.readline().strip()

            # end of line is reached
            if not line:
                done = True
                break

            json_line = json.loads(line)
            jsons.append(json_line)
        

        df = pd.DataFrame(jsons)
        # only select relevant columns
        for col in selected_columns:
            # NOTE: some json entries may not contain the selected column,  
            # we will that as NA
            if col not in df.columns:
                df[col] = pd.NA

        df = df[selected_columns]
df

Unnamed: 0,author,author_created_utc,body,created_utc,id,locked,link_id,parent_id,permalink,retrieved_on,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,archived,downs,updated_on,ups
0,amayain,,I agree that it is a problem. Sometimes you c...,1309557780,c233p63,,t3_iejzu,t3_iejzu,,1427305683,3,AcademicPsychology,t5_2sluh,,,True,0,,3
1,amayain,,I thought I might get the ball rolling on this...,1309558050,c233q0r,,t3_iekel,t3_iekel,,1427305694,2,AcademicPsychology,t5_2sluh,,,True,0,,2
2,nicson123,,"Yes, the issue of providing redditors w/ copie...",1309559835,c233vof,,t3_iejzu,t3_iejzu,,1427305769,3,AcademicPsychology,t5_2sluh,,,True,0,,3
3,drooze,,I'm involved with a study using smart phones t...,1309560915,c233yu8,,t3_ielfu,t3_ielfu,,1427305822,2,AcademicPsychology,t5_2sluh,,,True,0,,2
4,drooze,,If you find an article through Psychinfo or on...,1309561086,c233zdz,,t3_iejzu,t3_iejzu,,1427305820,2,AcademicPsychology,t5_2sluh,,,True,0,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,ilikebluepens,,thirded,1327650689,c3l54o3,,t3_om4q1,t1_c3ieab3,,1428248476,0,AcademicPsychology,t5_2sluh,,,True,0,,0
996,elizamcmanus,,You should wear a suit for your interviews. If...,1327711284,c3ldibz,,t3_oxy1l,t3_oxy1l,,1428252488,1,AcademicPsychology,t5_2sluh,,,True,0,,1
997,plasticasparagus,,You are able to use expressive arts therapy if...,1327737037,c3lgt9a,,t3_li84o,t1_c3j9tow,,1428254044,1,AcademicPsychology,t5_2sluh,,,True,0,,1
998,kanagawa,,I haven't done any phone interviews in academi...,1327790102,c3lm5c0,,t3_p0wzu,t3_p0wzu,,1428256630,6,AcademicPsychology,t5_2sluh,,,True,0,,6


#### Upload data to database

Comments:

In [24]:
# define relevant comment columns
# this is the result of the EDA we did on the Reddit dataset
relevant_comment_columns = [
    'author',
    'author_created_utc',
    'body',
    'created_utc',
    'id',
    'locked',
    'link_id',
    'parent_id',
    'permalink',
    'retrieved_on',
    'score',
    'subreddit',
    'subreddit_id',
    'subreddit_name_prefixed',
    'subreddit_type',
    'archived',
    'downs',
    'updated_on',
    'ups'
]

upload_to_db(db_engine, comments_data_path, 'comments', relevant_comment_columns)

iteration #1: data written to comments successfully.
iteration #2: data written to comments successfully.
iteration #3: data written to comments successfully.
iteration #4: data written to comments successfully.
iteration #5: data written to comments successfully.
iteration #6: data written to comments successfully.
iteration #7: data written to comments successfully.
iteration #8: data written to comments successfully.
iteration #9: data written to comments successfully.
iteration #10: data written to comments successfully.
iteration #11: data written to comments successfully.
iteration #12: data written to comments successfully.
iteration #13: data written to comments successfully.
iteration #14: data written to comments successfully.
iteration #15: data written to comments successfully.
iteration #16: data written to comments successfully.
iteration #17: data written to comments successfully.
iteration #18: data written to comments successfully.
iteration #19: data written to commen

Submissions:

In [None]:
# define relevant submission columns
# this is the result of the EDA we did on the Reddit dataset
relevant_submission_columns = [
    "id", "downs", "ups", "archived", "author", 
    "author_created_utc", "subreddit", "subreddit_id", 
    "subreddit_subscribers", "subreddit_type", "title", 
    "url", "num_comments", "permalink",
    "is_self", "selftext", "created_utc", "spoiler", "locked"
]

upload_to_db(db_engine, submissions_data_path, 'submissions', relevant_submission_columns)

In [2]:
!pip install ipython-sql



In [2]:
%load_ext sql
%sql postgresql://user:youshallnotpass@bachtran.dev:5432/reddit

In [3]:
%config SqlMagic.style = '_DEPRECATED_DEFAULT'

In [11]:
%%sql
SELECT * FROM comments LIMIT 5;

 * postgresql://user:***@bachtran.dev:5432/reddit
5 rows affected.


id,archived,author,author_created_utc,author_fullname,body,controversiality,created_utc,downs,edited,locked,name,num_reports,parent_id,permalink,retrieved_on,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,total_awards_received,updated_on,ups,link_id
hct0pjt,False,[deleted],,,[removed],,1631617422,,,False,,,t3_pnwd4g,/r/AcademicPsychology/comments/pnwd4g/what_complements_a_psychology_degree_more_a/hct0pjt/,1650179167,0,AcademicPsychology,t5_2sluh,r/AcademicPsychology,public,,,,t3_pnwd4g
hcvu9qc,False,[deleted],,,[removed],,1631661903,,,False,,,t1_hcvic5d,/r/AcademicPsychology/comments/poc1ur/apa_accredited_programs/hcvu9qc/,1650170905,2,AcademicPsychology,t5_2sluh,r/AcademicPsychology,public,,,,t3_poc1ur
hcvubnh,False,[deleted],,,[removed],,1631661928,,,False,,,t1_hcvtnch,/r/AcademicPsychology/comments/poc1ur/apa_accredited_programs/hcvubnh/,1650170901,6,AcademicPsychology,t5_2sluh,r/AcademicPsychology,public,,,,t3_poc1ur
hcw04jl,False,[deleted],,,[removed],,1631664644,,,False,,,t1_hcvi1bi,/r/AcademicPsychology/comments/poc1ur/apa_accredited_programs/hcw04jl/,1650170432,2,AcademicPsychology,t5_2sluh,r/AcademicPsychology,public,,,,t3_poc1ur
hcwbepj,False,[deleted],,,[removed],,1631669800,,,False,,,t1_hcvubnh,/r/AcademicPsychology/comments/poc1ur/apa_accredited_programs/hcwbepj/,1650169530,1,AcademicPsychology,t5_2sluh,r/AcademicPsychology,public,,,,t3_poc1ur


In [28]:
relevant_comment_columns = [
    'author',
    'author_created_utc',
    'body',
    'created_utc',
    'id',
    'locked',
    'link_id',
    'parent_id',
    'permalink',
    'retrieved_on',
    'score',
    'subreddit',
    'subreddit_id',
    'subreddit_name_prefixed',
    'subreddit_type',
    'archived',
    'downs',
    'updated_on',
    'ups'
]
relevant_submission_columns = [
    "id", "downs", "ups", "archived", "author", 
    "author_created_utc", "subreddit", "subreddit_id", 
    "subreddit_subscribers", "subreddit_type", "title", 
    "url", "num_comments", "permalink",
    "is_self", "selftext", "created_utc", "spoiler", "locked"
]

In [29]:
def print_repeated_selects(column_names):
    for column_name in column_names:
        print(f"(ARRAY_AGG({column_name}))[1] AS {column_name}, ")
    
print_repeated_selects(relevant_submission_columns)

(ARRAY_AGG(id))[1] AS id, 
(ARRAY_AGG(downs))[1] AS downs, 
(ARRAY_AGG(ups))[1] AS ups, 
(ARRAY_AGG(archived))[1] AS archived, 
(ARRAY_AGG(author))[1] AS author, 
(ARRAY_AGG(author_created_utc))[1] AS author_created_utc, 
(ARRAY_AGG(subreddit))[1] AS subreddit, 
(ARRAY_AGG(subreddit_id))[1] AS subreddit_id, 
(ARRAY_AGG(subreddit_subscribers))[1] AS subreddit_subscribers, 
(ARRAY_AGG(subreddit_type))[1] AS subreddit_type, 
(ARRAY_AGG(title))[1] AS title, 
(ARRAY_AGG(url))[1] AS url, 
(ARRAY_AGG(num_comments))[1] AS num_comments, 
(ARRAY_AGG(permalink))[1] AS permalink, 
(ARRAY_AGG(is_self))[1] AS is_self, 
(ARRAY_AGG(selftext))[1] AS selftext, 
(ARRAY_AGG(created_utc))[1] AS created_utc, 
(ARRAY_AGG(spoiler))[1] AS spoiler, 
(ARRAY_AGG(locked))[1] AS locked, 


# Removes Duplicate comments

In [24]:
%%sql
DROP VIEW comments_without_duplicates;
CREATE MATERIALIZED VIEW comments_without_duplicates AS (
    SELECT id,
    (ARRAY_AGG(author))[1] AS author, 
    (ARRAY_AGG(author_created_utc))[1] AS author_created_utc, 
    (ARRAY_AGG(body))[1] AS body, 
    (ARRAY_AGG(created_utc))[1] AS created_utc, 
    (ARRAY_AGG(locked))[1] AS locked, 
    (ARRAY_AGG(link_id))[1] AS link_id, 
    (ARRAY_AGG(parent_id))[1] AS parent_id, 
    (ARRAY_AGG(permalink))[1] AS permalink, 
    (ARRAY_AGG(retrieved_on))[1] AS retrieved_on, 
    (ARRAY_AGG(score))[1] AS score, 
    (ARRAY_AGG(subreddit))[1] AS subreddit, 
    (ARRAY_AGG(subreddit_id))[1] AS subreddit_id, 
    (ARRAY_AGG(subreddit_name_prefixed))[1] AS subreddit_name_prefixed, 
    (ARRAY_AGG(subreddit_type))[1] AS subreddit_type, 
    (ARRAY_AGG(archived))[1] AS archived, 
    (ARRAY_AGG(downs))[1] AS downs, 
    (ARRAY_AGG(updated_on))[1] AS updated_on, 
    (ARRAY_AGG(ups))[1] AS ups
    FROM comments
    GROUP BY id
)

 * postgresql://user:***@bachtran.dev:5432/reddit
Done.
999938 rows affected.


[]

In [26]:
%%sql
CREATE UNIQUE INDEX comment_id ON comments_without_duplicates (id);

 * postgresql://user:***@bachtran.dev:5432/reddit
Done.


[]

In [30]:
%%sql
CREATE MATERIALIZED VIEW submissions_without_duplicates AS (
    SELECT id,
    (ARRAY_AGG(downs))[1] AS downs, 
    (ARRAY_AGG(ups))[1] AS ups, 
    (ARRAY_AGG(archived))[1] AS archived, 
    (ARRAY_AGG(author))[1] AS author, 
    (ARRAY_AGG(author_created_utc))[1] AS author_created_utc, 
    (ARRAY_AGG(subreddit))[1] AS subreddit, 
    (ARRAY_AGG(subreddit_id))[1] AS subreddit_id, 
    (ARRAY_AGG(subreddit_subscribers))[1] AS subreddit_subscribers, 
    (ARRAY_AGG(subreddit_type))[1] AS subreddit_type, 
    (ARRAY_AGG(title))[1] AS title, 
    (ARRAY_AGG(url))[1] AS url, 
    (ARRAY_AGG(num_comments))[1] AS num_comments, 
    (ARRAY_AGG(permalink))[1] AS permalink, 
    (ARRAY_AGG(is_self))[1] AS is_self, 
    (ARRAY_AGG(selftext))[1] AS selftext, 
    (ARRAY_AGG(created_utc))[1] AS created_utc, 
    (ARRAY_AGG(spoiler))[1] AS spoiler, 
    (ARRAY_AGG(locked))[1] AS locked
    FROM submissions
    GROUP BY id
)

 * postgresql://user:***@bachtran.dev:5432/reddit
839000 rows affected.


[]

In [31]:
%%sql
CREATE UNIQUE INDEX submission_id ON submissions_without_duplicates (id);

 * postgresql://user:***@bachtran.dev:5432/reddit
Done.


[]

In [46]:
%%sql
SELECT link_id FROM comments_without_duplicates LIMIT 5;

 * postgresql://user:***@bachtran.dev:5432/reddit
5 rows affected.


link_id
t3_11xoc1i
t3_11xo5ov
t3_11x5eva
t3_11xo5ov
t3_11xo5ov


In [5]:
%%sql
    SELECT p.parent_id, COUNT(*) AS num_replies
    FROM comments_without_duplicates AS p
    GROUP BY p.parent_id
    ORDER BY num_replies DESC
    LIMIT 5
    ;

 * postgresql://user:***@bachtran.dev:5432/reddit
5 rows affected.


parent_id,num_replies
t3_16ojhbf,782
t3_17ku7x2,740
t3_18hq9qz,739
t3_177eiqv,585
t3_16q4nwt,460


In [9]:
%%sql
DROP MATERIALIZED VIEW comments_with_num_replies;
CREATE MATERIALIZED VIEW comments_with_num_replies AS (
    WITH parent_child_comments AS (
    SELECT p.parent_id AS pid, COUNT(*) AS num_replies
    FROM comments_without_duplicates AS p
    GROUP BY p.parent_id
)
SELECT *
FROM comments_without_duplicates AS c
INNER JOIN parent_child_comments AS pc
ON c.id = pc.pid
);

 * postgresql://user:***@bachtran.dev:5432/reddit
Done.
0 rows affected.


[]

In [4]:
%%sql
SELECT *
FROM comments_without_duplicates AS c
WHERE c.parent_id = 't3_iejzu'
;

 * postgresql://user:***@bachtran.dev:5432/reddit
11 rows affected.


id,author,author_created_utc,body,created_utc,locked,link_id,parent_id,permalink,retrieved_on,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,archived,downs,updated_on,ups
c233p63,amayain,,I agree that it is a problem. Sometimes you can find the articles on the researchers' personal webpages (although publishers have a big problem with this). I'm not sure if there is a good (legal) way to solve this problem.,1309557780,,t3_iejzu,t3_iejzu,,1427305683,3,AcademicPsychology,t5_2sluh,,,True,0,,3
c233vof,nicson123,,"Yes, the issue of providing redditors w/ copies of the sources is going to be tough.,.. but as amayain said, sometimes researchers put their stuff up on their website. I'll be starting a general psych master's program this fall, and I look forward to discussion with everyone!",1309559835,,t3_iejzu,t3_iejzu,,1427305769,3,AcademicPsychology,t5_2sluh,,,True,0,,3
c233zdz,drooze,,"If you find an article through Psychinfo or one of the other non-public databases, try doing a search to see if it's been published in pubmed or the APA's open databases as well. Not perfect, but it might help.",1309561086,,t3_iejzu,t3_iejzu,,1427305820,2,AcademicPsychology,t5_2sluh,,,True,0,,2
c2348x2,Princess_By_Day,,Google Scholar is pretty solid for those without access to JSTOR and the like.,1309564339,,t3_iejzu,t3_iejzu,,1427305945,5,AcademicPsychology,t5_2sluh,,,True,0,,5
c23497a,avrgeawkdpenguin,,We can post citations....,1309564422,,t3_iejzu,t3_iejzu,,1427305948,4,AcademicPsychology,t5_2sluh,,,True,0,,4
c234fjc,paraprax,,"This is a problem for academia in general. We want to help the general public, but our work is not made accessible to them. So, it just ends up being a lot of scholars writing for other scholars. (OK, it's not quite that bad, but it can sure feel like it sometimes).",1309566745,,t3_iejzu,t3_iejzu,,1427306038,5,AcademicPsychology,t5_2sluh,,,True,0,,5
c234nfm,evt,,"If I want to talk to someone about an article, I will generally download the article from behind the paywall, then reupload it to google docs and share.",1309569661,,t3_iejzu,t3_iejzu,,1427306132,7,AcademicPsychology,t5_2sluh,,,True,0,,7
c234ssm,siamesekitten,,"&gt;I think it's important to differentiate between articles in Psychology Today and peer reviewed journal articles. THIS a thousand times!! So happy about this subreddit. I also have access to a database of journals through school. If those with access like ourselves want to discuss a specific article, we could retrieve it from the database and maybe set something up in google documents? Is that legal? Before doing that though, we could cite the article and see if other are able to retrieve it themselves. Through school, etc.",1309571668,,t3_iejzu,t3_iejzu,,1427306202,3,AcademicPsychology,t5_2sluh,,,True,0,,3
c236pok,Dr_fish,,Why do single journal articles cost so much to view? I've always found it kind of strange...,1309602074,,t3_iejzu,t3_iejzu,,1427307104,2,AcademicPsychology,t5_2sluh,,,True,0,,2
c237u4b,[deleted],,"Well, there's Mendeley (the ""social network"" for scientists). In there we could create a Reddit closed group and have all the papers inside for redditors to download. For now I'll create Mendeley group and the ones who have journal acces can add all the papers that have been submitted so far, we can see if it has any success, and if it does then we'll figure out what problems arise. I'll post here the group name in a minute",1309624631,,t3_iejzu,t3_iejzu,,1428199506,1,AcademicPsychology,t5_2sluh,,,True,0,,1
