In [2]:
import praw
import sys
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt 
import prawcore

Here we are going to use python Reddit API  to extract comments from subreddits "askscience" 

In [3]:
#Getting credentials from reddit
reddit = praw.Reddit(client_id='iKArzqmTsQ0gug', \
                     client_secret='aZ1PZjHAM19GAq44SMVNoyF55go', \
                     user_agent='scraper', \
                     username='d_aragorn', \
                     password='d1t1sc32nc2')

In [4]:
# Object that contains the subreddit askscience
subreddit = reddit.subreddit('askscience')


In [5]:
# Getting  the top 1000 subreddit
top_subreddit = subreddit.top(limit=1000)

In [6]:
# Dictionary with the columns of interest
subreddits_dict = {"author":[],"title":[],"score":[],"id":[], "url":[], "comms_num": [],"created": [],"body":[]}

In [7]:
# Let us get the attributes of subreddit. 
for submission in top_subreddit:
    subreddits_dict["title"].append(submission.title)
    subreddits_dict["score"].append(submission.score)
    subreddits_dict["id"].append(submission.id)
    subreddits_dict["url"].append(submission.url)
    subreddits_dict["comms_num"].append(submission.num_comments)
    subreddits_dict["created"].append(submission.created)
    subreddits_dict["body"].append(submission.selftext)
    subreddits_dict["author"].append(submission.author)
   

In [8]:
# Create a dataframe from the dictionary
subreddits_data = pd.DataFrame(subreddits_dict)

In [9]:
# Column timestamp is of type string, let us convert it to type timestamp
def get_date(created):
    return dt.datetime.fromtimestamp(created)

_timestamp = subreddits_data["created"].apply(get_date)

subreddits_data = subreddits_data.assign(timestamp = _timestamp)

del subreddits_data['created']

In [10]:
subreddits_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 8 columns):
author       960 non-null object
title        999 non-null object
score        999 non-null int64
id           999 non-null object
url          999 non-null object
comms_num    999 non-null int64
body         999 non-null object
timestamp    999 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 62.5+ KB


I am interested in studying the relationship between the score, the number of comments 
and characteristics associated with the author and/or the title of the reddit. Two important attributes related to the reddit author are comment_karma and  link_karma.

In [14]:
# Function to get comment_karma and link_karma from authors
#Function to obtain comment_karma and link_karma from the authors
def get_comment_karma(redditor): 
    #Params:
    #redditor is a object type Redditor with name as parameter
    
    #Return:
    # comment_karma
    
    if redditor: # Check if the author is not NaN
        try: 
            # If there is an author then get comment_karma else return thr type of error encountered
            c_karma=reddit.redditor(redditor.name).comment_karma  
            return  int(c_karma) 
        except Exception as e:
            return (redditor.name,sys.exc_info()[0])
    else:
        return np.nan # Return NaN if there is not author
    
def get_link_karma(redditor):
    if redditor:
        try:
            lnk_karma=reddit.redditor(redditor.name).link_karma  
            return int(lnk_karma) 
        except Exception as e:
            return (redditor.name,sys.exc_info()[0])
    else:
        return np.nan
    


In [15]:
# Adding comments_karma and links_karma to our datasets
subreddits_data['author_comments_karma'] = subreddits_data.author.apply(get_comment_karma)
subreddits_data['author_links_karma'] = subreddits_data.author.apply(get_link_karma)

In [17]:
subreddits_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 10 columns):
author                   960 non-null object
title                    999 non-null object
score                    999 non-null int64
id                       999 non-null object
url                      999 non-null object
comms_num                999 non-null int64
body                     999 non-null object
timestamp                999 non-null datetime64[ns]
author_comments_karma    960 non-null object
author_links_karma       960 non-null object
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 78.1+ KB


In [18]:
# Saving data in a csv file
subreddits_data.to_csv ('subreddits_data.csv',sep=',',encoding='utf-8',index = None, header=True)

In [19]:
reddit_data=pd.read_csv('subreddits_data.csv')  
reddit_data.head()

Unnamed: 0,author,title,score,id,url,comms_num,body,timestamp,author_comments_karma,author_links_karma
0,MockDeath,Help us fight for net neutrality!,83385,7etmk7,https://www.reddit.com/r/askscience/comments/7...,1193,The ability to browse the internet is at risk....,2017-11-22 22:38:53,33852,10762
1,AskScienceModerator,Stephen Hawking megathread,65826,84auzr,https://www.reddit.com/r/askscience/comments/8...,1666,"We were sad to learn that noted physicist, cos...",2018-03-14 09:03:48,1662,290641
2,MockDeath,A message to our users,39287,3by2nk,https://www.reddit.com/r/askscience/comments/3...,1193,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Today in AskScie...,2015-07-03 06:43:47,33852,10762
3,This31415926535,Why is it that during winter it's not uncommon...,37670,7yx524,https://www.reddit.com/r/askscience/comments/7...,976,I live in the USA Midwest,2018-02-20 19:23:37,1364,10337
4,GPL89,If we could travel at 99.9% the speed of light...,34364,ahyapf,https://www.reddit.com/r/askscience/comments/a...,3094,,2019-01-20 17:57:37,4794,13561
