In [None]:
import pandas as pd

## Data processing ##

### Define the data structure of rows ###

In [1]:
# is_gold - true/false - If true, will only match if the author has reddit gold. If false, will only match if they do not have gold.
# is_submitter - true/false - (only relevant when checking comments) If true, will only match if the author 
# was also the submitter of the post being commented inside. If false, will only match if they were not.
# send_replies – When True, messages will be sent to the submission author when comments are made to the submission

# Column name -> dtype string; this mapping is passed as `dtype=` to
# pandas.read_csv when the cleaned yearly files are loaded.
data_type = {
    "subreddit": "string",
    "subreddit_id": "string",
    "subreddit_type": "string",
    "author": "string",
    "body": "string",
    "created_date": "string",
    "created_utc": "string",
    "retrieved_on": "string",
    "id": "string",
    "parent_id": "string",
    "link_id": "string",
    "score": "int",
    "total_awards_received": "int",
    "controversiality": "int",
    "gilded": "int",
    "collapsed_because_crowd_control": "int",
    "collapsed_reason": "string",
    "distinguished": "string",
    "removal_reason": "string",
    "author_created_utc": "string",
    "author_fullname": "string",
    "author_patreon_flair": "bool",
    "author_premium": "bool",
    "can_gild": "bool",
    "can_mod_post": "bool",
    "collapsed": "bool",
    "is_submitter": "bool",
    "_edited": "string",
    "locked": "bool",
    "quarantined": "bool",
    "no_follow": "bool",
    "send_replies": "bool",
    "stickied": "bool",
    "author_flair_text": "string",
}

### read all the csv files using the defined data structure, remove deleted rows ###
Some rows have a body of "[deleted]" or "[removed]", which indicates that the content was removed for some reason. We need to remove them as they do not contribute to our analysis.

The data folder is "./data". For testing, please first download the raw data (xxxx.csv, one file per year) into this folder. The resulting files are named xxxx_revised.csv and are also stored in "./data".

In [2]:
# For every year from 2008 through 2019: load the raw CSV, discard comments
# whose body is the '[deleted]' or '[removed]' placeholder, drop the
# 'Unnamed: 0' column, and save the result next to the raw file.
# NOTE: each run overwrites ./data/<year>_revised.csv; skip this cell when
# those files already exist.
for x in range(2008, 2020):
    df = pd.read_csv(f'./data/{x}.csv')
    # Rows with a missing body are kept, only the two placeholders are removed.
    placeholder_body = df.body.isin(['[deleted]', '[removed]'])
    df = df[~placeholder_body]
    df = df.drop(columns = ['Unnamed: 0'])
    df.to_csv(f'./data/{x}_revised.csv')



"\nfor x in range(2008, 2020):\n    df = pd.read_csv(f'./data/{x}.csv')\n    df = df[(df.body != '[deleted]') & (df.body != '[removed]')]\n    df.drop(columns = ['Unnamed: 0'], inplace = True)\n    df.to_csv(f'./data/{x}_revised.csv')\n"

### Identify All Posts Related to iPhone on Reddit ###

We conducted a review and found that the subreddits 'apple' and 'iphone' contain the most relevant discussions about the iPhone. However, the 'apple' subreddit also includes posts about other Apple products like Macs, iPod Touch, and iTunes. To ensure relevance, we will filter out these non-iPhone related posts. Our approach assumes that all posts within a single discussion thread focus on the same topic. Therefore, we will retain any thread in the 'apple' subreddit if at least one post within that thread mentions the iPhone.

In [3]:
def locate_iphone_post(data):
    """
    Find the discussion threads in the 'apple' subreddit that mention the iPhone.

    Parameters
    ----------
    data : pandas.DataFrame
        Comment table; the columns `subreddit`, `body` and `link_id` are read.

    Returns
    -------
    array-like
        The unique `link_id` values of rows whose `subreddit` is 'apple' and
        whose `body` contains 'iphone' (case-insensitive). Empty when nothing
        matches.
    """
    df_apple = data[data.subreddit == 'apple']
    # case=False performs the case-insensitive match directly, so the `re`
    # module is not needed here. na=False treats a missing body as "no match",
    # which keeps the mask strictly boolean and safe to index with.
    mentions_iphone = df_apple.body.str.contains('iphone', case=False, na=False)
    return df_apple.loc[mentions_iphone, 'link_id'].unique()

In [4]:
def extract_all_iphone_post(data):
    """
    Select the rows of `data` that count as iPhone-related.

    A row is kept when its `subreddit` is 'iphone', or when its `link_id`
    is one of the values returned by `locate_iphone_post(data)`.
    """
    iphone_thread_ids = locate_iphone_post(data)
    in_iphone_subreddit = data.subreddit == 'iphone'
    in_iphone_thread = data.link_id.isin(iphone_thread_ids)
    return data[in_iphone_subreddit | in_iphone_thread]

In [5]:
# final clean, only retain iPhone related posts each year

def get_iphone_data_yearly(data, filename = None):
    """
    Keep only the iPhone-related rows of one table and optionally save them.

    Parameters
    ----------
    data : pandas.DataFrame
        Table passed to `extract_all_iphone_post` for filtering.
    filename : str or path-like, optional
        When given, the filtered table is written to this path with
        `DataFrame.to_csv`. Nothing is written when it is None.

    Returns
    -------
    pandas.DataFrame
        The filtered rows without the 'Unnamed: 0' column.
    """
    # drop() returns a new frame, so the filtered slice of `data` is never
    # modified in place (the in-place form raises SettingWithCopyWarning on
    # such a slice). errors='ignore' tolerates input that does not carry the
    # leftover 'Unnamed: 0' index column.
    df = extract_all_iphone_post(data).drop(columns = ['Unnamed: 0'], errors = 'ignore')
    if filename is not None:
        df.to_csv(filename)
    return df

The resulting files are named xxxx_iphone_v2.csv and are stored in "./data".

In [None]:
# For every year from 2008 through 2019: load the cleaned yearly file with the
# column dtypes declared in `data_type`, keep the iPhone-related rows and save
# them as ./data/<year>_iphone_v2.csv.
# NOTE: each run overwrites the *_iphone_v2.csv files; skip this cell when
# they already exist.
for x in range(2008, 2020):
    file_name, save_path = f'./data/{x}_revised.csv', f'./data/{x}_iphone_v2.csv'
    data = pd.read_csv(file_name, header = 0, dtype = data_type)
    get_iphone_data_yearly(data, save_path)
