In [1]:
import ast
import datetime
import os
import re
from collections import Counter

import boto3
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import psycopg2
import pytz
import seaborn as sns
from nltk.corpus import stopwords
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download('stopwords', quiet=True)

In [2]:
def get_db_reddit(table, utc_begin, utc_end):
    """
    Retrieves posts from our AWS PostgreSQL database within a specified UTC time range.

    The database password is read from the MADS_DB_PASSWORD environment variable
    so that it is not stored in the notebook.

    Parameters:
    table (str): Name of the database table to query ('submissions' or 'comments').
    utc_begin (int): The beginning of the UTC time range (inclusive).
    utc_end (int): The end of the UTC time range (inclusive).

    Returns:
    DataFrame: A pandas DataFrame containing the retrieved posts. When no row
    matches, the frame is empty but still carries the table's column names.

    Raises:
    ValueError: If `table` is not one of the supported tables.
    KeyError: If the MADS_DB_PASSWORD environment variable is not set.
    psycopg2.DatabaseError: If an error occurs in database connection or execution.
    """
    # Column layout per supported table. It also acts as a whitelist: the table
    # name is interpolated into the SQL text (identifiers cannot be bound as
    # query parameters), so it must be validated BEFORE the query is built.
    table_columns = {
        'submissions': ["id", "subreddit_id", "subreddit", "author", "created_utc",
                        "permalink", "title", "selftext", "num_comments", "score"],
        'comments': ["id", "subreddit_id", "submission_id", "subreddit", "author",
                     "created_utc", "selftext", "score"],
    }
    if table not in table_columns:
        raise ValueError(f"Table not found: {table!r}")

    connection_params = {
        "user": 'postgres',
        "password": os.environ["MADS_DB_PASSWORD"],
        "host": 'mads-capstone.cmohac77hep9.eu-north-1.rds.amazonaws.com',
        "port": 5432,
        "database": "mads"
    }

    select_query = f"SELECT * FROM {table} WHERE created_utc >= %s AND created_utc <= %s;"

    try:
        db_connection = psycopg2.connect(**connection_params)
        try:
            # `with connection` only commits/rolls back the transaction; the
            # cursor and the connection itself have to be closed explicitly.
            with db_connection, db_connection.cursor() as cursor:
                cursor.execute(select_query, (utc_begin, utc_end))
                rows = cursor.fetchall()
        finally:
            db_connection.close()
    except psycopg2.DatabaseError as e:
        print(f"Database error: {e}")
        raise

    # Passing the names to the constructor (instead of assigning df.columns
    # afterwards) keeps an empty result set from raising a length mismatch.
    return pd.DataFrame(rows, columns=table_columns[table])

In [3]:
def parse_record(row: bytes) -> dict:
    """
    Parses one raw line of the submissions dump into a dictionary.

    The line is decoded as UTF-8 (undecodable bytes are replaced), rewritten
    into a Python literal (null -> "", true/false -> True/False, a few HTML
    entities unescaped or dropped) and evaluated as a literal. The "selftext"
    and "title" values are then flattened: commas become semicolons, newlines
    become spaces, and carriage returns, tabs and backslashes are removed.

    Parameters:
    row (bytes): One undecoded line of the dump.

    Returns:
    dict: The parsed record, or an empty dict when the line cannot be parsed
    or lacks "selftext"/"title" (such lines are skipped on purpose so that bulk
    processing can continue).
    """
    try:
        # NOTE(review): these substitutions run on the whole line, so they also
        # rewrite the same tokens when they occur inside text values.
        row_data = (
            row.decode(encoding="utf-8", errors="replace")
            .replace("null", '""')
            .replace("&lt;", "<")
            .replace("&gt;", ">")
            .replace("&amp;#39;", "")
            .replace("&amp;", "")
            .replace("false", "False")
            .replace("true", "True")
        )
        # literal_eval only accepts literals, so a crafted line in the dump
        # cannot execute code the way eval() would.
        row_dict = ast.literal_eval(row_data)
        for field in ("selftext", "title"):
            text = row_dict[field]
            row_dict[field] = (
                text.replace(",", ";").replace("\n", " ").replace("\r", "").replace("\t", "").replace("\\", "")
                if text else ""
            )
        return row_dict
    except Exception:
        # Best effort by design: any malformed record is reported as {}.
        return {}

def save_dataframe(records, columns=None, path="submissions.csv"):
    """
    Writes the given records to a headerless, index-free UTF-8 CSV file.

    Parameters:
    records (list[dict]): Parsed records, one dictionary per row.
    columns (list[str] | None): Columns to export, in output order. Defaults to
        the notebook-level `selected_columns`, which must then already be defined.
    path (str): Destination file. Defaults to "submissions.csv".

    Raises:
    KeyError: If a requested column is missing from the records.
    """
    if columns is None:
        # Existing callers rely on the notebook-level column list.
        columns = selected_columns
    df = pd.DataFrame(records)
    df = df[list(columns)]
    df.to_csv(path, index=False, header=False, encoding="utf-8", errors="replace")

In [4]:
# Daily calendar spanning the study period, from start_date to end_date.
start_date, end_date = '2018-08-01', '2023-11-30'
date_range = pd.date_range(start_date, end_date, freq='D')

In [5]:
# Derive the UTC epoch bounds of the query window from two days near the end
# of the calendar.
day1 = date_range[-3]
day2 = date_range[-2]

# Lower bound: 23:59:59 UTC on day1.
begin_stamp = day1.replace(hour=23, minute=59, second=59, tzinfo=datetime.timezone.utc)
utc_begin = int(begin_stamp.timestamp())

# Upper bound: 23:59:59 UTC on the day after day2.
end_stamp = (day2 + pd.Timedelta(days=1)).replace(
    hour=23, minute=59, second=59, tzinfo=datetime.timezone.utc
)
utc_end = int(end_stamp.timestamp())

utc_begin, utc_end, day1, day2

(1701215999,
 1701388799,
 Timestamp('2023-11-28 00:00:00'),
 Timestamp('2023-11-29 00:00:00'))

In [6]:
# Ad-hoc check: fetch the submissions stored for [utc_begin, utc_end] and count them.
connection_params = {
    "user": 'postgres',
    # The secret comes from the environment so it is not stored in the notebook.
    "password": os.environ["MADS_DB_PASSWORD"],
    "host": 'mads-capstone.cmohac77hep9.eu-north-1.rds.amazonaws.com',
    "port": 5432,
    "database": "mads"
}

select_query = "SELECT * FROM submissions WHERE created_utc >= %s AND created_utc <= %s;"

db_connection = psycopg2.connect(**connection_params)
try:
    # `with connection` only commits/rolls back the transaction; the cursor and
    # the connection itself have to be closed explicitly.
    with db_connection, db_connection.cursor() as cursor:
        cursor.execute(select_query, (utc_begin, utc_end))
        rows = cursor.fetchall()
finally:
    db_connection.close()

len(rows)

0

In [8]:
#get_db_reddit('submissions', utc_begin, utc_end)

In [7]:
# S3 bucket and object key of the submissions JSON file.
bucket, data_key = 'mads-capstone-2023', 'wallstreetbets_submissions.json'

# Columns kept when records are exported, in output order.
selected_columns = [
    "id",
    "subreddit_id",
    "subreddit",
    "author",
    "created_utc",
    "permalink",
    "title",
    "selftext",
    "num_comments",
    "score",
]

In [9]:
# Stream the submissions file from S3 line by line and parse only its tail.
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket, Key=data_key)

records = []
failed_seqs = []

total_records = 2218243
# Lines before this index are skipped without being parsed.
first_seq = 2218200

for i, row in enumerate(obj["Body"].iter_lines()):
    if i < first_seq:
        continue

    parsed_record = parse_record(row)
    if not parsed_record:
        # parse_record returns {} for lines it cannot parse; remember the line
        # index instead of failing on the missing "created_utc" key below.
        failed_seqs.append(i)
        continue

    records.append(parsed_record)
    print(datetime.datetime.fromtimestamp(parsed_record["created_utc"], tz=datetime.timezone.utc))

2022-12-31 09:00:20+00:00
2022-12-31 09:05:32+00:00
2022-12-31 09:29:19+00:00
2022-12-31 09:35:43+00:00
2022-12-31 09:48:53+00:00
2022-12-31 09:53:38+00:00
2022-12-31 09:55:58+00:00
2022-12-31 09:56:03+00:00
2022-12-31 09:56:30+00:00
2022-12-31 09:58:59+00:00
2022-12-31 10:08:37+00:00
2022-12-31 10:12:01+00:00
2022-12-31 10:13:10+00:00
2022-12-31 10:13:18+00:00
2022-12-31 10:13:51+00:00
2022-12-31 10:14:39+00:00
2022-12-31 10:19:50+00:00
2022-12-31 10:24:22+00:00
2022-12-31 10:28:52+00:00
2022-12-31 10:29:47+00:00
2022-12-31 10:33:06+00:00
2022-12-31 10:44:12+00:00
2022-12-31 10:52:58+00:00
2022-12-31 10:55:55+00:00
2022-12-31 11:01:01+00:00
2022-12-31 11:03:25+00:00
2022-12-31 11:08:42+00:00
2022-12-31 11:19:09+00:00
2022-12-31 11:20:39+00:00
2022-12-31 11:21:04+00:00
2022-12-31 11:23:50+00:00
2022-12-31 11:31:35+00:00
2022-12-31 11:48:58+00:00
2022-12-31 11:59:29+00:00
2022-12-31 12:02:51+00:00
2022-12-31 12:10:13+00:00
2022-12-31 12:18:22+00:00
2022-12-31 12:29:12+00:00
2022-12-31 1