# Set up

In [1]:
import praw
import re
import os
import requests
import pandas as pd

In [2]:
# Scraping limits (tune here, used by the scraping loop below)
# newest submissions fetched from the subreddit
MAX_POSTS = 10
# top-level comments kept per submission
MAX_TOP_LEVEL = 50
# replies kept per top-level comment
MAX_REPLIES = 25

In [3]:
# Create the Reddit API client.
# The placeholder string must be replaced with your own login/config info
# before this cell can run.
reddit = praw.Reddit("[FILL WITH INFO]")

# Handle for the subreddit whose posts and comments get scraped.
subreddit = reddit.subreddit("apple")

In [4]:
# Start from an empty frame with the two columns the scraping loop fills:
# the Reddit id of a post/comment and its raw text.
df = pd.DataFrame(
    columns=[
        "id",
        "Original Text",
    ]
)
df

Unnamed: 0,id,Original Text


In [5]:
# Re-display the frame: still empty here, nothing has been scraped yet.
df

Unnamed: 0,id,Original Text


# Scraping the data

In [6]:
# Scrape the newest MAX_POSTS submissions plus a capped number of their comments.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end. `DataFrame.append` was removed in pandas 2.0 and, where it still exists,
# copies the whole frame on every call. Building the frame in one go also
# makes this cell idempotent: re-running it no longer duplicates rows.
records = []

for submission in subreddit.new(limit=MAX_POSTS):
    print("---------------------------------------------------------")
    print(submission.title)

    # the post itself: title and body joined by a newline
    records.append({'id': submission.id, 'Original Text': submission.title + "\n" + submission.selftext})

    # resolve all "load more comments" placeholders so only real comments remain
    submission.comments.replace_more(limit=None)
    top_level = list(submission.comments)

    # keep at most MAX_TOP_LEVEL top-level comments (slicing handles "no comments" too)
    for curr_comment in top_level[:MAX_TOP_LEVEL]:
        records.append({'id': curr_comment.id, 'Original Text': curr_comment.body})

        # replies.list() returns the replies below this comment as a flat list;
        # keep at most MAX_REPLIES of them
        for curr_reply in curr_comment.replies.list()[:MAX_REPLIES]:
            records.append({'id': curr_reply.id, 'Original Text': curr_reply.body})

# explicit columns keep the expected schema even when nothing was scraped
df = pd.DataFrame(records, columns=["id", "Original Text"])
            

---------------------------------------------------------
Developers or add advertisers, please respect our sound settings in games for the adds shown in games or apps
---------------------------------------------------------
Question: who is the artist behind the new apple ads with the stretched bodies and outrageous perspective?
---------------------------------------------------------
Paving the Road to Vulkan on Asahi Linux [Native Linux on Apple Silicon]
---------------------------------------------------------
Microsoft Wants to Launch Xbox Games Store on iPhone
---------------------------------------------------------
iPhone 15 Pro Leak Reveals Unified Volume Button and Mute Button
---------------------------------------------------------
Apple to open flagship India store in Mumbai in April; second store to come up in Delhi
---------------------------------------------------------
Galaxy S23 Ultra vs iPhone 14 Pro Max Battery Test
-----------------------------------------------

In [7]:
# Inspect the scraped posts and comments (one row per post, comment or reply).
df

Unnamed: 0,id,Original Text
0,11wtrmb,"Developers or add advertisers, please respect ..."
1,jd0o26l,"Agree, and it’s why my system volume is often ..."
2,jd0v1x4,This is often not up to the developer. It’s up...
3,11wloaz,Question: who is the artist behind the new app...
4,jd0kn9b,"This is all I could find about their new ad, w..."
...,...,...
806,jcxnalt,its just you
807,jd0kps3,Guess I'm done with this show.
808,jcyh93g,If the Biden's actually cared about mental hea...
809,jcyz755,The. Morning Show


# Cleaning the data

In [26]:
import nltk
# NLTK resources needed by clean(): the English stopword list, the tokenizer
# models used by word_tokenize, and WordNet for the lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# fetching both keeps word_tokenize working on old and new installs.
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ksnbx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ksnbx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ksnbx\AppData\Roaming\nltk_data...


True

In [27]:
def clean(text):
    """Turn raw text into a list of lowercase, lemmatized word tokens.

    Pipeline: tokenize with ``nltk.word_tokenize``, lowercase every token,
    drop English stopwords, drop every token that is not purely alphabetic
    (punctuation, numbers, and mixed tokens containing digits or symbols),
    then lemmatize what is left with the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Text to clean. Any non-string value (e.g. ``None`` or a NaN cell)
        yields an empty list instead of raising inside the tokenizer.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order. May be empty.
    """
    # missing / non-text cells cannot be tokenized
    if not isinstance(text, str):
        return []

    wn = nltk.WordNetLemmatizer()
    # a set gives O(1) membership tests; the raw list is scanned once per token
    stopword = set(nltk.corpus.stopwords.words('english'))

    # break into tokens
    tokens = nltk.word_tokenize(text)

    # lowercase the text
    lower = [word.lower() for word in tokens]

    # remove stopwords
    no_stopwords = [word for word in lower if word not in stopword]

    # keep only purely alphabetic tokens (this drops punctuation AND numbers)
    alpha_only = [word for word in no_stopwords if word.isalpha()]

    # lemmatize the tokens
    return [wn.lemmatize(word) for word in alpha_only]

Thank you [Ona_Gilbert](https://www.kaggle.com/code/onadegibert/sentiment-analysis-with-tfidf-and-random-forest) for pointing us to the `nltk` library

In [28]:
# Run every post/comment through clean() and store the resulting token lists.
df['Cleaned Text'] = df['Original Text'].map(clean)
df

Unnamed: 0,id,Original Text,Cleaned Text
0,11wtrmb,"Developers or add advertisers, please respect ...","[developer, add, advertiser, please, respect, ..."
1,jd0o26l,"Agree, and it’s why my system volume is often ...","[agree, system, volume, often, since, mute, to..."
2,jd0v1x4,This is often not up to the developer. It’s up...,"[often, developer, ad, network, developer, usu..."
3,11wloaz,Question: who is the artist behind the new app...,"[question, artist, behind, new, apple, ad, str..."
4,jd0kn9b,"This is all I could find about their new ad, w...","[could, find, new, ad, think, talking, http]"
...,...,...,...
806,jcxnalt,its just you,[]
807,jd0kps3,Guess I'm done with this show.,"[guess, done, show]"
808,jcyh93g,If the Biden's actually cared about mental hea...,"[biden, actually, cared, mental, health, reign..."
809,jcyz755,The. Morning Show,"[morning, show]"


In [33]:
# Glue each token list back into a single space-separated string.
df['Untokenized Clean'] = df['Cleaned Text'].map(" ".join)
df

Unnamed: 0,id,Original Text,Cleaned Text,Untokenized Clean
0,11wtrmb,"Developers or add advertisers, please respect ...","[developer, add, advertiser, please, respect, ...",developer add advertiser please respect sound ...
1,jd0o26l,"Agree, and it’s why my system volume is often ...","[agree, system, volume, often, since, mute, to...",agree system volume often since mute toggle lo...
2,jd0v1x4,This is often not up to the developer. It’s up...,"[often, developer, ad, network, developer, usu...",often developer ad network developer usually c...
3,11wloaz,Question: who is the artist behind the new app...,"[question, artist, behind, new, apple, ad, str...",question artist behind new apple ad stretched ...
4,jd0kn9b,"This is all I could find about their new ad, w...","[could, find, new, ad, think, talking, http]",could find new ad think talking http
...,...,...,...,...
806,jcxnalt,its just you,[],
807,jd0kps3,Guess I'm done with this show.,"[guess, done, show]",guess done show
808,jcyh93g,If the Biden's actually cared about mental hea...,"[biden, actually, cared, mental, health, reign...",biden actually cared mental health reign billi...
809,jcyz755,The. Morning Show,"[morning, show]",morning show


# Sentiment analysis using VADER
More info on VADER can be [found here](https://github.com/cjhutto/vaderSentiment)

In [29]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [35]:
# VADER needs its lexicon before the analyzer can be built.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Score each cleaned string and keep only the 'compound' value.
# NOTE(review): VADER is documented as using punctuation, capitalisation and
# negation words; the cleaned text has those stripped, so scoring
# 'Original Text' may give more faithful results. Confirm before changing.
df['vader_polarity'] = [
    sid.polarity_scores(text)['compound']
    for text in df['Untokenized Clean']
]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ksnbx\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [36]:
# Inspect the frame with the new vader_polarity column.
df

Unnamed: 0,id,Original Text,Cleaned Text,Untokenized Clean,vader_polarity
0,11wtrmb,"Developers or add advertisers, please respect ...","[developer, add, advertiser, please, respect, ...",developer add advertiser please respect sound ...,0.9882
1,jd0o26l,"Agree, and it’s why my system volume is often ...","[agree, system, volume, often, since, mute, to...",agree system volume often since mute toggle lo...,0.3612
2,jd0v1x4,This is often not up to the developer. It’s up...,"[often, developer, ad, network, developer, usu...",often developer ad network developer usually c...,0.4939
3,11wloaz,Question: who is the artist behind the new app...,"[question, artist, behind, new, apple, ad, str...",question artist behind new apple ad stretched ...,0.4131
4,jd0kn9b,"This is all I could find about their new ad, w...","[could, find, new, ad, think, talking, http]",could find new ad think talking http,0.0000
...,...,...,...,...,...
806,jcxnalt,its just you,[],,0.0000
807,jd0kps3,Guess I'm done with this show.,"[guess, done, show]",guess done show,0.0000
808,jcyh93g,If the Biden's actually cared about mental hea...,"[biden, actually, cared, mental, health, reign...",biden actually cared mental health reign billi...,-0.1280
809,jcyz755,The. Morning Show,"[morning, show]",morning show,0.0000


# Saving the data

In [40]:
from datetime import date

In [49]:
# Save the scored frame as data/reddit_<today's date>.csv (index column omitted).
filename = "reddit_" + str(date.today()) + ".csv"

# to_csv does not create missing directories, so make sure data/ exists first
os.makedirs("data", exist_ok=True)
df.to_csv(os.path.join("data", filename), index=False)