In [1]:
import praw
import pandas as pd
import os
from datetime import datetime
from credentials import gf_id, gf_secret, gf_agent, gf_username, gf_password

In [2]:
# Authenticated Reddit client built from the values imported from credentials.py.
# The keyword must be spelled `password`: the previous `pasword` spelling meant
# the password never reached the client under the name it expects.
reddit = praw.Reddit(
    client_id=gf_id,
    client_secret=gf_secret,
    user_agent=gf_agent,
    username=gf_username,
    password=gf_password,
)

In [3]:
def combine_data(posts, label):
    """Turn an iterable of posts into a list of ``[title, selftext, subreddit]`` rows.

    Parameters
    ----------
    posts : iterable
        Objects exposing ``title``, ``selftext`` and ``subreddit`` attributes.
    label : any
        Accepted so callers can tag the listing; it is not used in the rows.

    Returns
    -------
    list of list
        One three-element row per post, in iteration order.
    """
    return [[entry.title, entry.selftext, entry.subreddit] for entry in posts]

### Scraping subreddits
- Collect the title, body text, and subreddit of each post
- Combine everything into one dataframe

In [4]:
def subreddits_scraping(subreddits, folder):
    """Scrape each subreddit to its own CSV, then merge every CSV in `folder`.

    For every name in `subreddits` the controversial listing and the top
    listings for the past year, month and week are requested (limit=1000
    each), de-duplicated, and written to
    ``<folder>/<sub><day-month-hour-minute>.csv``. Afterwards all ``.csv``
    files found in `folder` (the ones just written plus anything left from
    earlier runs) are concatenated, reduced to the title / self_text /
    subreddit columns, de-duplicated, and saved as
    ``<folder>/<sub1>_<sub2>_....csv``.

    Parameters
    ----------
    subreddits : list of str
        Subreddit names to scrape; also joined with '_' to name the merged file.
    folder : str
        Existing directory that holds the per-subreddit and merged CSV files.

    Raises
    ------
    ValueError
        If `folder` contains no CSV files to merge.
    """
    columns = ['title', 'self_text', 'subreddit']
    doc = '_'.join(subreddits)

    for sub in subreddits:
        subreddit = reddit.subreddit(sub)

        # (label, listing) pairs, consumed in this order.
        listings = [
            ('con', subreddit.controversial(limit=1000)),
            ('top_year', subreddit.top(limit=1000, time_filter="year")),
            ('top_month', subreddit.top(limit=1000, time_filter="month")),
            ('top_week', subreddit.top(limit=1000, time_filter="week")),
        ]
        rows = []
        for label, posts in listings:
            rows.extend(combine_data(posts, label))

        df = pd.DataFrame(rows, columns=columns).drop_duplicates()
        time_now = datetime.now().strftime('%d-%m-%H-%M')
        df.to_csv(folder+'/'+sub+time_now+'.csv', index=False, encoding='utf-8')

    # List the folder only now, so the files written above take part in the
    # merge. Keeping just *.csv also skips '.ipynb_checkpoints' and any other
    # entry that read_csv could not handle.
    files = sorted(name for name in os.listdir(folder) if name.endswith('.csv'))
    if not files:
        raise ValueError('no CSV files to merge in ' + folder)

    subs = pd.concat([pd.read_csv(folder+'/'+file) for file in files])
    # Reassign rather than mutate in place: the column selection returns a new
    # frame, and in-place edits on it can trigger SettingWithCopyWarning.
    subs = subs[columns].drop_duplicates()
    subs.to_csv(folder+'/'+doc+'.csv', index=False, encoding='utf-8')

In [5]:
# Scrape the 'marriage' and 'dating' subreddits into ../scrapes; the merged file
# is named by joining the subreddit names with '_' (marriage_dating.csv).
subreddits_scraping(['marriage', 'dating'],'../scrapes')

In [7]:
# Load the merged scrape from ../scrapes and preview the first rows.
marriage_dating = pd.read_csv('../scrapes/marriage_dating.csv')
marriage_dating.head()

Unnamed: 0,title,self_text,subreddit
0,What does it mean when a girl says I don’t fee...,I had a date yesterday and I thought it was ok...,dating
1,why do some guys get hurt/bothered if you slee...,My ex recently found out I hooked up with some...,dating
2,Worst era to ever date in as a guy.,Is there a single (attractive) young woman (ea...,dating
3,"Why do some men hate ""Vanilla women""?","Hiya. Earlier today I got called a ""vanilla wo...",dating
4,Guys the Victim mentality WONT help you,I've been couple of years in this sub and that...,dating


In [8]:
# (rows, columns) of the merged scrape.
marriage_dating.shape

(7998, 3)

In [9]:
# Share of rows per subreddit (normalized counts).
marriage_dating['subreddit'].value_counts(normalize=True)

dating      0.574394
Marriage    0.425606
Name: subreddit, dtype: float64

In [10]:
# Number of missing values in each column.
marriage_dating.isna().sum()

title          0
self_text    340
subreddit      0
dtype: int64

In [11]:
# Drop every row that has a missing value, then confirm no nulls remain.
marriage_dating = marriage_dating.dropna()
marriage_dating.isna().sum()

title        0
self_text    0
subreddit    0
dtype: int64

In [12]:
# Re-check the share of rows per subreddit after dropping rows with missing values.
marriage_dating['subreddit'].value_counts(normalize=True)

dating      0.599373
Marriage    0.400627
Name: subreddit, dtype: float64

In [13]:
# Save the frame (rows with missing values already dropped) to the output folder.
# NOTE(review): unlike the to_csv calls inside subreddits_scraping, this one does
# not pass index=False, so the row index is written as an extra unnamed first
# column -- confirm that downstream readers expect it.
marriage_dating.to_csv('../output/marriage_dating.csv')