# Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
# import matplotlib.pyplot as plt
# import seaborn as sns
# import altair as alt

# Functions  

I will define some functions to avoid repetation in the code.

In [2]:
def raw_date_to_datetime(raw_date):
    datetime_dict = {"Date": None, "MonthName": None, "Time": None}  # New row to replace
    datetime_dict["Date"] = local_datetime.strftime("%d-%m-%Y")  # Get the date in the format "DD.MM.YYYY"
    datetime_dict["Time"] = local_datetime.strftime("%H:%M%z")  # Get the time in format "HH:MM+HHMM"
    datetime_dict["MonthName"] = local_datetime.strftime("%B")  # Get the full month name
    datetime_dict["DayName"] = local_datetime.strftime("%A")  # Get the full day name

    return datetime_dict


def is_subscribed_to(subreddit_name, subscribed_subs_df):
    return (subscribed_subs_df == subreddit_name).any().any()

# Data Collecting

Get the data from the .csv files, clear and format a bit to get useful pandas DataFrames.  

## Different Data
* Subscribed Subreddits
* IP Logs
* Post Votes
* Comment Votes
* Posts
* Comments

In [3]:
# Data path
data_path = "./data/"

## Subscribed Subreddits

This data holds the subreddits that I am _currently_ subscribed to. It does not hold a history or any temporal data.  

Reddit subreddits doesn't include a way to distinguish them from each other by some kind of a tag or topic system. However, to better analyse them a tag system might be useful. From a list of basic tags subs can be tagged by hand. If the read data has a sub missing tags then it will automatically asks for tags from a pre-defined list of tags. After updating the tags data will be written onto the original .csv file.  

This data file also includes followed users (not subs). They are marked with a 'u_' prefix in the data. They will be filtered, but saved anyway since they might be useful in the future.

In [4]:
# Read the file, sort by name of the subreddits and reset the index after sorting
fname = "subscribed_subreddits.csv"
subreddits_df = pd.read_csv(data_path + fname).sort_values(by="subreddit").reset_index().drop(columns="index")

In [5]:
# Predefine the minimal amount of basic and useful tags
predefined_tags = tuple(sorted((
    'meme',
    'movie/show',
    'fandom',
    'game',
    'hobby',
    'information',
    'programming',
    'other',
    'user',
)))

retake_tags = False
try:
    # Check if the 'Tags' column exists at all
    sub_tags = subreddits_df["Tags"]
except KeyError:
    # Add an empty 'Tags' column
    subreddits_df["Tags"] = None
    sub_tags = subreddits_df["Tags"]
    retake_tags = True

# Check if there are any missing entry without tags
if sub_tags.isnull().sum() > 0:
    retake_tags = True

# Ask for tags
if retake_tags:
    save = False
    for idx in range(len(sub_tags)):
        sub = subreddits_df.loc[idx, "subreddit"]
        tags = subreddits_df.loc[idx, "Tags"]
        if type(tags) != str:
            tags = list()
            while True:
                tag = input(f"Enter a tag for the subreddit '{sub}', amongst {predefined_tags}\nEnter 'q' to finish.\n")
                if tag == "save":
                    subreddits_df.loc[idx, "Tags"] = ", ".join(sorted(tags))
                    save = True
                    break
                if tag == 'q':
                    if len(tags) != 0:
                        subreddits_df.loc[idx, "Tags"] = ", ".join(sorted(tags))
                        print("-----")
                        break
                    else:
                        print("You didn't enter any valid tags yet!")
                        continue
                if tag in predefined_tags and tag not in tags:
                    tags.append(tag)
                else:
                    print("You didn't enter a valid tag.")
        if save:
            break

subreddits_df.to_csv(data_path + fname, index=False)

In [6]:
followed_users_df = subreddits_df[subreddits_df["subreddit"].str.contains("u_")]
subreddits_df = subreddits_df[~subreddits_df["subreddit"].str.contains("u_")]

subreddits_df

Unnamed: 0,subreddit,Tags
0,AskScienceFiction,information
1,CodeBullet,other
2,DMAcademy,"game, hobby, information"
3,DaystromInstitute,"fandom, information, movie/show"
4,Deepspaceninememes,"fandom, meme, movie/show"
5,ElectroBOOM,"information, other"
6,ExposurePorn,"hobby, other"
7,FATErpg,"game, hobby, information"
8,GeekyaparLamers,other
9,GreekMythology,information


## IP Logs

IP logs data holds information about my logins to Reddit. It holds the date, time and the IP that I used. This data might be used on showing my active times even though it doesn't hold information on how long I have stayed active.  

The date data is in the form of "yyyy-mm-dd hh:mm:ss UTC". I will split the date and time, convert time into GMT+3, and name the months.

In [7]:
# Read the file, drop the first row that holds the registiration IP only, drop the IP colum and reset the indexing
fname = "ip_logs.csv"
login_datetime_df = pd.read_csv(data_path + fname).rename(columns={"date": "RawDate"}).drop(index=0, columns="ip").reset_index().drop(columns="index")

In [8]:
try:
    raw_date_col = login_datetime_df["RawDate"]  # Raw Date column

    # Add new columns
    login_datetime_df[["Date", "Time", "MonthName", "DayName"]] = None
    
    for idx in range(len(raw_date_col)):
        raw_date = raw_date_col.iloc[idx].replace(" UTC", "")  # Get the time in UTC time
    
        # Convert datetime to local time zone
        local_datetime = datetime.fromisoformat(raw_date).astimezone()
        datetime_dict = raw_date_to_datetime(local_datetime)

        # Add the ald raw date data to new dict
        datetime_dict["RawDate"] = raw_date + " UTC"
        
        login_datetime_df.iloc[idx] = datetime_dict  # Insert the new row

    login_datetime_df = login_datetime_df.drop(columns="RawDate")
except KeyError:
    pass

login_datetime_df["Date"] = pd.to_datetime(login_datetime_df["Date"], format="%d-%m-%Y")
login_datetime_df

Unnamed: 0,Date,Time,MonthName,DayName
0,2023-06-29,10:16+0300,June,Thursday
1,2023-06-29,14:25+0300,June,Thursday
2,2023-06-30,01:16+0300,June,Friday
3,2023-06-30,04:15+0300,June,Friday
4,2023-06-30,05:46+0300,June,Friday
...,...,...,...,...
357,2023-10-06,12:19+0300,October,Friday
358,2023-10-06,13:40+0300,October,Friday
359,2023-10-06,15:16+0300,October,Friday
360,2023-10-07,08:04+0300,October,Saturday


## Post Votes

This data includes the posts that I have upvoted or downvoted. It includes and ID, the post link and the type of the vote; however, no temporal data.  

I will get the subreddit name from the URL, my vote and I will compare the sub to the subscribed subs data and get wheter or not I am subscribed to that subreddit currently.

In [9]:
# Read the file, rename the vote direction and id columns.
fname = "post_votes.csv"
post_votes_df = pd.read_csv(data_path + fname).rename(columns={"direction": "Vote"}).drop(columns="id")

In [10]:
# Add the new columns
post_votes_df[["SubredditName", "IsSubscribed"]] = None

for idx in range(len(post_votes_df["permalink"])):
    post_vote_dict = {"SubredditName": None, "IsSubscribed": None}
        
    # Get the sub name from the link
    permalink = post_votes_df.loc[idx, "permalink"]
    start_idx = permalink.find("r/") + 2
    stop_idx = permalink.find("/", start_idx)
    sub_name = permalink[start_idx:stop_idx]
    post_vote_dict["SubredditName"] = sub_name

    # Check if the sub is subscribed
    post_vote_dict["IsSubscribed"] = is_subscribed_to(sub_name, subreddits_df)

    # Add the existing data to the new row dictionary
    post_vote_dict["permalink"] = permalink
    post_vote_dict["Vote"] = post_votes_df.loc[idx, "Vote"]

    # Add the new row data
    post_votes_df.iloc[idx] = post_vote_dict

# Drop the permalink column
post_votes_df = post_votes_df.drop(columns="permalink")
# Specify the Dtypes for later use
post_votes_df["IsSubscribed"] = post_votes_df["IsSubscribed"].astype(dtype="bool")
post_votes_df

Unnamed: 0,Vote,SubredditName,IsSubscribed
0,up,unexpectedMontyPython,True
1,up,ProgrammerHumor,True
2,up,TheLastAirbender,True
3,up,risa,True
4,up,unexpectedMontyPython,True
...,...,...,...
1217,up,gaming,True
1218,none,ProgrammerHumor,True
1219,up,seinfeld,True
1220,up,ProgrammerHumor,True


## Comment Votes  

Comment votes is almost identical to the post votes data except that this includes the information about comments that I have voted instead of posts.  

I will perform the same cleaning as the post votes data: Remove the ID, get the subreddit name from the URL, my vote and I will compare the sub to the subscribed subs data and get wheter or not I am subscribed to that subreddit currently.y.

In [11]:
# Read the file, rename the vote direction column and drop the id column.
fname = "comment_votes.csv"
comment_votes_df = pd.read_csv(data_path + fname).rename(columns={"direction": "Vote"}).drop(columns="id")

In [12]:
# Add the new columns
comment_votes_df[["SubredditName", "IsSubscribed"]] = None

for idx in range(len(comment_votes_df["permalink"])):
    comment_vote_dict = {"SubredditName": None, "IsSubscribed": None}
        
    # Get the sub name from the link
    permalink = comment_votes_df.loc[idx, "permalink"]
    start_idx = permalink.find("r/") + 2
    stop_idx = permalink.find("/", start_idx)
    sub_name = permalink[start_idx:stop_idx]
    comment_vote_dict["SubredditName"] = sub_name

    # Check if the sub is subscribed
    comment_vote_dict["IsSubscribed"] = is_subscribed_to(sub_name, subreddits_df)

    # Add the existing data to the new row dictionary
    comment_vote_dict["permalink"] = permalink
    comment_vote_dict["Vote"] = comment_votes_df.loc[idx, "Vote"]

    # Add the new row data
    comment_votes_df.iloc[idx] = comment_vote_dict

# Drop the permalink column
comment_votes_df = comment_votes_df.drop(columns="permalink")
# Specify the Dtypes for later use
comment_votes_df["IsSubscribed"] = comment_votes_df["IsSubscribed"].astype(dtype="bool")
comment_votes_df

Unnamed: 0,Vote,SubredditName,IsSubscribed
0,up,GenP,False
1,up,flashcarts,False
2,up,ProgrammerHumor,True
3,up,startrek,True
4,up,startrek,True
...,...,...,...
255,up,CodeBullet,True
256,up,montypython,True
257,none,seinfeld,True
258,up,TheLastAirbender,True


## Posts  

The posts data is about the posts that I have created. It includes an ID, a permalink to the post, posting date, the IP that I have used, subreddit name that the post has been posted, and gildings and url data.  

I will drop the permalink, IP, gildings, and url. Seperate the date to date and time columns, and check if I am subscribed to the sub I have posted. I will keep the IDs to compare with the comments data later on.

In [13]:
# Read the file, rename the id, date, and subreddit and drop the permalink, ip, gildings, and url.
fname = "post_headers.csv"
posts_df = pd.read_csv(data_path + fname).rename(columns={"id": "ID", "date": "RawDate", "subreddit": "Subreddit"}).drop(columns=["permalink", "ip", "gildings", "url"])

In [14]:
try:
    raw_date_col = posts_df["RawDate"]  # Raw Date column

    # Add new columns
    posts_df[["IsSubscribed", "Date", "Time", "MonthName", "DayName"]] = None
    
    for idx in range(len(raw_date_col)):
        raw_date = raw_date_col.iloc[idx].replace(" UTC", "")  # Get the time in UTC time
    
        # Convert datetime to local time zone
        local_datetime = datetime.fromisoformat(raw_date).astimezone()
        datetime_dict = raw_date_to_datetime(local_datetime)

        posts_dict = dict()
        # Check if subscribed
        posts_dict["IsSubscribed"] = is_subscribed_to(posts_df.loc[idx, "Subreddit"], subreddits_df)
        
        # Add the ald raw date data to new dict
        posts_dict["RawDate"] = raw_date + " UTC"
        posts_dict["ID"] = posts_df.loc[idx, "ID"]
        posts_dict["Subreddit"] = posts_df.loc[idx, "Subreddit"]

        
        posts_df.iloc[idx] = (posts_dict | datetime_dict)  # Insert the new row

    posts_df = posts_df.drop(columns="RawDate")
except KeyError:
    pass

posts_df["Date"] = pd.to_datetime(posts_df["Date"], format="%d-%m-%Y")
posts_df

Unnamed: 0,ID,Subreddit,IsSubscribed,Date,Time,MonthName,DayName
0,v7jv2a,consolerepair,True,2022-06-08,07:38+0300,June,Wednesday
1,m3jkjt,NintendoDSi,False,2021-03-12,15:13+0300,March,Friday
2,10x881w,startrek,True,2023-02-08,19:26+0300,February,Wednesday
3,126w0at,webdev,False,2023-03-30,18:58+0300,March,Thursday
4,r84lhi,flashcarts,False,2021-12-03,17:40+0300,December,Friday
5,126w1tz,webdev,False,2023-03-30,19:00+0300,March,Thursday
6,15rhuzw,montypython,True,2023-08-15,04:15+0300,August,Tuesday
7,16z0xzh,TheLastAirbender,True,2023-10-03,19:25+0300,October,Tuesday
8,10vxs5t,startrek,True,2023-02-07,09:33+0300,February,Tuesday
9,zx3wgi,consolerepair,True,2022-12-28,07:54+0300,December,Wednesday


## Comments  

Similar to the data about the posts, comments data also includes an ID, a permalink to the comment, comment date, the IP that I have used, subreddit name that the post that been commented has been posted, and gildings. It does not include a url data like posts and it holds two extra information: a link to the parent object and _if the parent is posted by me_ an ID of the parent.  

I will drop the permalink, IP, gildings, and link to the parent. Seperate the date to date and time columns, check if I am subscribed to the sub I have posted, and I will check if I was the author of the parent.