<a href="https://colab.research.google.com/github/cjunwon/2022-fa-stats21/blob/main/DataRes_Research_W2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Extraction Code Reference from: https://colab.research.google.com/drive/15883QxK-f3Extq4dHRdqz1RdbgObzS_l?usp=sharing

In [2]:
!pip install colab-env --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colab-env
  Downloading colab-env-0.2.0.tar.gz (4.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dotenv<1.0,>=0.10.0
  Downloading python_dotenv-0.21.1-py3-none-any.whl (19 kB)
Building wheels for collected packages: colab-env
  Building wheel for colab-env (setup.py) ... [?25l[?25hdone
  Created wheel for colab-env: filename=colab_env-0.2.0-py3-none-any.whl size=3837 sha256=4077310ee3fd2711b0a708c7d96cf2c9c4cfea6cb98fc06c2bcc22701987e6b7
  Stored in directory: /root/.cache/pip/wheels/1c/65/0c/5552431f2622d6e0283e3dba61c6837103a9cbdbd89b7b0cba
Successfully built colab-env
Installing collected packages: python-dotenv, colab-env
Successfully installed colab-env-0.2.0 python-dotenv-0.21.1


In [7]:
!pip install praw

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting praw
  Downloading praw-7.6.1-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.8/188.8 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting websocket-client>=0.54.0
  Downloading websocket_client-1.5.0-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.9/55.9 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: websocket-client, update-checker, prawcore, praw
Successfully installed praw-7.6.1 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.5.0


In [3]:
import colab_env
import os

Mounted at /content/gdrive


In [4]:
colab_env.__version__

'0.2.0'

In [5]:
reddit_client_id = os.getenv('reddit_client_id')
reddit_client_secret = os.getenv('reddit_client_secret')
reddit_user_agent = os.getenv('reddit_user_agent')

In [6]:
colab_env.RELOAD()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:
import praw
import networkx as nx
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from google.colab import files

In [10]:
#===========================================================================
# helper functions
#===========================================================================

#---------------------------------------------------------------------------
# get all top post from a specific subreddit as a dataframe
def getHotPostsAsDF(r, subredditName, postLimit=10):
  df = pd.DataFrame()   
  subreddit = r.subreddit(subredditName) 
  for submission in subreddit.hot(limit=postLimit):
    df = df.append({'subreddit': subredditName,
                  'post_id': submission.id,
                  'created': datetime.utcfromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                  'raw_title': submission.title,
                  'author': submission.author,
                  'karma': round(submission.score),
                  'awards': round(submission.total_awards_received),
                  'permalink': submission.permalink},
                 ignore_index = True)
  df['created'] = pd.to_datetime(df['created'], format='%Y-%m-%d %H:%M:%S')     
  return(df)


#---------------------------------------------------------------------------
# get all comments from a specific post as a dataframe
def getAllCommentsAsDF(r, submissionId):
  df = pd.DataFrame()   
  res = getAllComments(r, submissionId)
  for item in res:
      if type(item) == praw.models.reddit.comment.Comment:
          parent_id_clean = item.parent_id.replace("t1_","").replace("t3_","")
          if parent_id_clean==submissionId:
            parent_id_clean = ''
          df = df.append({'post_id':submissionId,
                          'comment_id': item.id,
                          'comment_parent_id': parent_id_clean,
                          'author': item.author,
                          'raw_comment': item.body,
                          'karma': round(item.score),
                          'awards': round(item.total_awards_received)},  
                  ignore_index = True)
  return(df)   

#---------------------------------------------------------------------------
# get all comments on this submission as praw objects in a list
# requires a submissionId, the id of the reddit post
def getAllComments(r, submissionId, verbose=True):
  submission = r.submission(submissionId)
  comments = submission.comments
  commentsList = []
  for comment in comments:
    getSubComments(comment, commentsList, verbose=verbose)
  return commentsList

#---------------------------------------------------------------------------
# recursive function to get nexted comments that replied to other comments
def getSubComments(comment, allComments, verbose=True):
  allComments.append(comment)
  if not hasattr(comment, "replies"):
    replies = comment.comments()
    if verbose: print("fetching (" + str(len(allComments)) + " comments fetched total)")
  else:
    replies = comment.replies
  for child in replies:
    getSubComments(child, allComments, verbose=verbose) 

In [11]:
# create a reddit instance using the API key
reddit = praw.Reddit(client_id=reddit_client_id,
                     client_secret=reddit_client_secret,
                     user_agent=reddit_user_agent)

In [12]:
#===========================================================================
# get the latest 500 hot posts from /r/UCLA
latest_posts = getHotPostsAsDF(reddit, "UCLA", 500)

# save as local CSV 
latest_posts.to_csv('reddit_posts_UCLA.csv') 
files.download('reddit_posts_UCLA.csv')

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
latest_posts

Unnamed: 0,subreddit,post_id,created,raw_title,author,karma,awards,permalink
0,UCLA,10p6dfk,2023-01-30 16:00:10,"Winter Quarter Club Recruitment, Student Activ...",AutoModerator,5.0,0.0,/r/ucla/comments/10p6dfk/winter_quarter_club_r...
1,UCLA,10rv8x4,2023-02-02 17:19:39,Why?,lildudie,106.0,0.0,/r/ucla/comments/10rv8x4/why/
2,UCLA,10rsyzl,2023-02-02 15:48:21,Powell Cat is way bigger up close lol,Alec119,95.0,0.0,/r/ucla/comments/10rsyzl/powell_cat_is_way_big...
3,UCLA,10s3oo3,2023-02-02 22:56:45,The “R” button in the elevator can do wonders,UCLAROOFS,12.0,0.0,/r/ucla/comments/10s3oo3/the_r_button_in_the_e...
4,UCLA,10s2h6s,2023-02-02 22:09:12,What does the pi symbol mean?,Redditlogicking,9.0,0.0,/r/ucla/comments/10s2h6s/what_does_the_pi_symb...
...,...,...,...,...,...,...,...,...
495,UCLA,10lgen6,2023-01-26 01:56:45,Available Swipes,flannelenjoyer04,2.0,0.0,/r/ucla/comments/10lgen6/available_swipes/
496,UCLA,10lcy56,2023-01-25 23:18:56,how to get to ms 2nd floor,bp86373,3.0,0.0,/r/ucla/comments/10lcy56/how_to_get_to_ms_2nd_...
497,UCLA,10l6w2p,2023-01-25 19:13:56,Brown Tabby Cat Midvale x Strathmore,Parking_Cranberry935,6.0,0.0,/r/ucla/comments/10l6w2p/brown_tabby_cat_midva...
498,UCLA,10lc1ej,2023-01-25 22:41:02,Cafe 451 employment,Fickle-Reporter-7201,3.0,0.0,/r/ucla/comments/10lc1ej/cafe_451_employment/


In [None]:
# Get the top 1000 submissions from r/UCLA
subreddit = reddit.subreddit("UCLA")
posts = subreddit.top(limit=1000)

for submission in subreddit.hot(limit=10):
    print("Title: ", submission.title)
    print("Score: ", submission.score)
    print("URL: ", submission.url)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Title:  Winter Quarter Club Recruitment, Student Activities, and other happenings Megathread - Week of January 23, 2023
Score:  6
URL:  https://www.reddit.com/r/ucla/comments/10jfv9t/winter_quarter_club_recruitment_student/
Title:  Powell Cat, meet Sproul Cat
Score:  214
URL:  https://www.reddit.com/gallery/10ohn2o
Title:  not rain again argh
Score:  31
URL:  https://www.reddit.com/r/ucla/comments/10ordy3/not_rain_again_argh/
Title:  Least inconsiderate bird user:
Score:  85
URL:  https://i.redd.it/gbzfy0q233fa1.jpg
Title:  UCLA housing get your shit together
Score:  6
URL:  https://www.reddit.com/r/ucla/comments/10ow5hu/ucla_housing_get_your_shit_together/
Title:  Why do people feel the need to talk as loud as they can in the study lounge?
Score:  9
URL:  https://www.reddit.com/r/ucla/comments/10orvlm/why_do_people_feel_the_need_to_talk_as_loud_as/
Title:  To whoever was just playing the Covel piano...
Score:  13
URL:  https://www.reddit.com/r/ucla/comments/10opbrv/to_whoever_was_just

In [None]:
# Create a networkx graph
graph = nx.Graph()

# Add the authors to the graph as nodes
for post in posts:
    author = post.author
    if author is not None:
        # graph.add_node(author)
        print(post, author)

# # Add the relationships between the authors to the graph as edges
# for post in posts:
#     author = post.author
#     commentors = [comment.author for comment in post.comments if comment.author is not None]
#     for commentor in commentors:
#         graph.add_edge(author, commentor)

In [None]:
for post in posts:
    author = post.author
    commentors = [comment.author for comment in post.comments]
    for commentor in commentors:
        if commentor is not None:
            graph.add_edge(author, commentor)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [None]:
class GNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x, adj):
        x = F.relu(self.fc1(x))
        x = torch.mm(adj, x)
        x = self.fc2(x)
        return x
    
# Initialize the model and set the input dimension
input_dim = len(graph.nodes())
hidden_dim = 128
output_dim = 1
model = GNN(input_dim, hidden_dim, output_dim)

# Convert the networkx graph to an adjacency matrix and convert it to a tensor
adj = np.array(nx.adjacency_matrix(graph).todense())
adj = torch.tensor(adj, dtype=torch.float32)

# Train the model using an appropriate loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train for a number of epochs
for epoch in range(100):
    # Forward pass
    outputs = model(adj, adj)
    loss = criterion(outputs, labels)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item()}')

AttributeError: ignored