# COLLECTION PART

### SET UP THE ENVIRONMENT

In [None]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from reddit_utils import init_reddit, safe_api_call
import datetime as dt
import time

### PREPROCESS FUNCTION
Preprocess the text so that it is uniform and free of elements that could distort the analysis (markup, URLs, numbers, punctuation and stopwords).
After all transformations have been applied, the remaining tokens are joined into a single string and returned.

In [None]:
# Download the NLTK resources that preprocess() relies on:
#   - 'stopwords' backs stopwords.words('english') below
#   - 'punkt' / 'punkt_tab' back word_tokenize()
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt tokenizer data from the
# 'punkt_tab' package instead of 'punkt'; without it word_tokenize() raises
# LookupError. Fetching both keeps the notebook working on old and new NLTK.
nltk.download('punkt_tab')

# English stopwords, stored as a set so membership tests in preprocess() are O(1)
stop_words = set(stopwords.words('english'))

# Characters kept by preprocess(): the ASCII set exposed by string.printable.
_PRINTABLE_CHARS = frozenset(string.printable)

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Literal substitutions applied, in this order, after escaped tags are removed.
_LITERAL_REPLACEMENTS = (
    (r'&amp;', 'and'),   # HTML-escaped ampersand
    (r'&gt;', ''),       # HTML-escaped '>' (e.g. quote markers)
    ('\n', ' '),
    ('\t', ' '),
    ('[deleted]', ''),   # Reddit placeholders for missing content
    ('[removed]', ''),
)

# Emoji / pictograph code-point ranges.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F700-\U0001F77F"  # alchemical symbols
    u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    u"\U0001FA00-\U0001FA6F"  # Chess Symbols
    u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    u"\U00002702-\U000027B0"  # Dingbats
    "]+",
    flags=re.UNICODE,
)


def preprocess(text):
    """Normalise a raw post/comment string for text analysis.

    Steps, in order: lowercase; keep only ``string.printable`` characters;
    strip HTML-escaped tags and entities; replace newlines/tabs and drop the
    Reddit ``[deleted]``/``[removed]`` placeholders; remove tokens containing
    digits; remove http(s) URLs; remove punctuation; collapse runs of
    whitespace; remove emojis; tokenize with ``word_tokenize`` and drop
    tokens found in the module-level ``stop_words`` set.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = text.lower()

    # Keep printable ASCII only
    cleaned = ''.join(ch for ch in cleaned if ch in _PRINTABLE_CHARS)

    # Drop HTML-escaped tags such as "&lt;b&gt;" / "&lt;/b&gt;"
    cleaned = re.sub(r'&lt;/?[a-z]+&gt;', '', cleaned)

    # Entities, whitespace characters and Reddit placeholders
    for needle, replacement in _LITERAL_REPLACEMENTS:
        cleaned = cleaned.replace(needle, replacement)

    # Any word-like token that contains at least one digit
    cleaned = re.sub(r'\w*\d+\w*', '', cleaned)

    # http:// and https:// URLs
    cleaned = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', cleaned)

    # Punctuation, then collapse repeated whitespace into one space
    cleaned = cleaned.translate(_PUNCTUATION_TABLE)
    cleaned = re.sub(r'\s{2,}', " ", cleaned)

    # NOTE(review): the printable-ASCII filter above already discards every
    # non-ASCII character, so this emoji pass appears to have nothing left to
    # match; it is kept to preserve the original pipeline.
    cleaned = _EMOJI_PATTERN.sub(r'', cleaned)

    # Tokenize and filter stopwords in one pass
    kept_tokens = (token for token in word_tokenize(cleaned) if token not in stop_words)
    return ' '.join(kept_tokens)

### COLLECT GROUND TRUTH DATA

We establish a connection via init_reddit().

Navigate through the all-time top-rated posts of a given subreddit, limiting the data to a specific time range
(20 January 2017 to 20 January 2020), the first 3 years of Trump's presidency.

Process the content of posts and comments that meet defined length criteria, to collect only
potentially meaningful content.

Implement a pause between the processing of each post to avoid exceeding the request limits of Reddit's API.
Distinguish between pro-Trump and anti-Trump subreddits by assigning corresponding labels to the data
collected from each subreddit.

Save the data in a CSV file for storage.

In [None]:
def fetch_groundtruth_data(subreddit, label, limit=1000, *,
                           start_date=dt.datetime(2017, 1, 20),
                           end_date=dt.datetime(2020, 1, 20),
                           pause_seconds=10):
    """Collect labelled posts and comments from one subreddit's top posts.

    Iterates the all-time top submissions of ``subreddit`` and keeps those
    created inside ``[start_date, end_date]`` that were not authored by
    AutoModerator. For each kept submission it records the post itself (only
    if it is a self post whose title + body have at least 6 words) and every
    comment with at least 8 words. Text is cleaned with ``preprocess``.

    Args:
        subreddit: Subreddit name (without the ``r/`` prefix).
        label: Class label stored on every collected row.
        limit: Maximum number of top submissions requested from the listing.
        start_date: Inclusive lower bound, as a naive datetime in UTC.
        end_date: Inclusive upper bound, as a naive datetime in UTC.
        pause_seconds: Seconds to sleep after each processed submission.

    Returns:
        A list of dicts with keys ``id``, ``author``, ``content``,
        ``created`` (naive UTC datetime), ``type`` (``'post'`` or
        ``'comment'``), ``label`` and ``subreddit``.
    """
    def utc_naive(timestamp):
        # ``created_utc`` is an epoch in UTC. Plain datetime.fromtimestamp()
        # would convert it to the machine's local time, making both the date
        # filter and the stored values depend on where the notebook runs.
        # Convert in UTC and drop tzinfo so it compares with the naive bounds.
        return dt.datetime.fromtimestamp(timestamp, tz=dt.timezone.utc).replace(tzinfo=None)

    # init_reddit / safe_api_call come from reddit_utils (not visible here);
    # presumably an authenticated client and a retrying call wrapper.
    reddit = init_reddit()

    data = []
    top_posts = safe_api_call(
        lambda: reddit.subreddit(subreddit).top(time_filter='all', limit=limit)
    )
    for submission in top_posts:
        created_time = utc_naive(submission.created_utc)
        if not (start_date <= created_time <= end_date):
            continue

        # Skip AutoModerator posts
        post_author = submission.author
        if post_author and post_author.name == 'AutoModerator':
            continue

        # Expand at most 10 "MoreComments" placeholders, then flatten the
        # whole comment tree (all nesting levels, not just top-level).
        submission.comments.replace_more(limit=10)
        comments = submission.comments.list()

        # Process post: only text posts with enough words to be meaningful
        word_count = len(submission.title.split()) + len(submission.selftext.split())
        if submission.is_self and word_count >= 6:
            post_content = submission.title + " " + (submission.selftext or "")
            data.append({
                'id': submission.id,
                'author': post_author.name if post_author else 'deleted',
                'content': preprocess(post_content),
                'created': created_time,
                'type': 'post',
                'label': label,
                'subreddit': subreddit,
            })

        # Process comments
        for comment in comments:
            if len(comment.body.split()) < 8:  # Filter short comments
                continue
            comment_author = comment.author
            data.append({
                'id': comment.id,
                'author': comment_author.name if comment_author else 'deleted',
                'content': preprocess(comment.body),
                'created': utc_naive(comment.created_utc),
                'type': 'comment',
                'label': label,
                'subreddit': subreddit,
            })

        # Pause between submissions to stay under the API rate limits
        time.sleep(pause_seconds)

    return data

# Subreddits used as ground truth, grouped by stance
pro_trump_subreddits = ['AskTrumpSupporters']
anti_trump_subreddits = ['MarchAgainstTrump', 'AntiTrumpAlliance']

# Fetch data from each subreddit with the label matching its stance
all_data = []
for subreddit in pro_trump_subreddits:
    all_data.extend(fetch_groundtruth_data(subreddit, label=1))  # Label 1 for Pro-Trump

for subreddit in anti_trump_subreddits:
    all_data.extend(fetch_groundtruth_data(subreddit, label=0))  # Label 0 for Anti-Trump

# Convert list of dictionaries to DataFrame.
# The columns are named explicitly (same keys and order that
# fetch_groundtruth_data emits) so that an empty result still yields a frame
# with a 'created' column and a header-only CSV, instead of a KeyError below.
groundtruth_columns = ['id', 'author', 'content', 'created', 'type', 'label', 'subreddit']
df = pd.DataFrame(all_data, columns=groundtruth_columns)
df['created'] = pd.to_datetime(df['created'])

# Save the data to a CSV file
df.to_csv('polarized_reddit_posts_and_comments.csv', index=False)

### COLLECT COMMENTS OF CHOSEN TOPICS
Establish an authorised connection.

Set the specific date range.

For each topic of interest and its associated subreddits, collect comments from the top rated posts,
check that they have been created in the desired time interval, and verify that an author exists.

Apply the preprocess function to the comment content.

Implement a pause to prevent exceeding the request limits of the Reddit API.

In [None]:
def fetch_comments_by_topic(topics, limit=1000, *,
                            start_date=dt.datetime(2017, 1, 20),
                            end_date=dt.datetime(2020, 1, 20),
                            pause_seconds=10):
    """Collect comments from the top posts of each topic's subreddits.

    For every subreddit listed under a topic, iterates its all-time top
    submissions and keeps each comment that was created inside
    ``[start_date, end_date]`` and still has an author. Comment text is
    cleaned with ``preprocess``. Submissions themselves are not date-filtered.

    Args:
        topics: Mapping of topic name -> iterable of subreddit names.
        limit: Maximum number of top submissions requested per subreddit.
        start_date: Inclusive lower bound, as a naive datetime in UTC.
        end_date: Inclusive upper bound, as a naive datetime in UTC.
        pause_seconds: Seconds to sleep after each processed submission.

    Returns:
        Dict mapping each topic to a list of dicts with keys ``id``,
        ``author``, ``link_id``, ``parent_id``, ``content`` and ``created``
        (naive UTC datetime).
    """
    def utc_naive(timestamp):
        # ``created_utc`` is an epoch in UTC. Plain datetime.fromtimestamp()
        # would yield machine-local time, so the date filter would shift with
        # the host timezone. Convert in UTC and drop tzinfo so the value
        # compares with the naive bounds.
        return dt.datetime.fromtimestamp(timestamp, tz=dt.timezone.utc).replace(tzinfo=None)

    # init_reddit / safe_api_call come from reddit_utils (not visible here);
    # presumably an authenticated client and a retrying call wrapper.
    reddit = init_reddit()
    topic_comments = {}

    for topic, subreddits in topics.items():
        comments = []
        for subreddit in subreddits:
            top_posts = safe_api_call(
                lambda: reddit.subreddit(subreddit).top(time_filter='all', limit=limit)
            )
            for submission in top_posts:
                # Expand at most 10 "MoreComments" placeholders, then walk the
                # flattened comment tree.
                submission.comments.replace_more(limit=10)
                for comment in submission.comments.list():
                    created_time = utc_naive(comment.created_utc)
                    # Keep only comments created within the requested window
                    if not (start_date <= created_time <= end_date):
                        continue
                    # Skip comments whose author is no longer available
                    author = comment.author
                    if not author:
                        continue
                    comments.append({
                        'id': comment.id,
                        'author': author.name,
                        'link_id': comment.link_id,  # submission ID that the comment belongs to
                        # ID of the parent comment (prefixed with t1_); for a
                        # top-level comment this is the submission ID (t3_).
                        'parent_id': comment.parent_id,
                        'content': preprocess(comment.body),
                        'created': created_time,
                    })

                # Pause between submissions to stay under the API rate limits
                time.sleep(pause_seconds)

        topic_comments[topic] = comments
    return topic_comments


def save_comments_to_csv(topic_comments):
    """Write one ``<topic>_comments.csv`` file per topic.

    Args:
        topic_comments: Mapping of topic name -> list of comment dicts.
            Each list becomes the rows of that topic's CSV, written to the
            current working directory without the index column.
    """
    for topic in topic_comments:
        output_path = f'{topic}_comments.csv'
        frame = pd.DataFrame(topic_comments[topic])
        frame.to_csv(output_path, index=False)


# Topic name -> subreddits whose top posts are mined for comments.
# The topic name is also used as the prefix of the output CSV file.
topics = {
    'guncontrol': [
        'Firearms',
        'guncontrol',
        'gunpolitics',
    ],
    'politics': [
        'Conservative',
        'democrats',
        'Republican',
    ],
    'minority': [
        'racism',
        'lgbt',
        'askGSM',
    ],
}

# Collect the comments for every topic, then write one CSV per topic.
topic_comments = fetch_comments_by_topic(topics)
save_comments_to_csv(topic_comments)