### Problem Statement

As data scientists working for a financial advisory company, we have been tasked with distinguishing investors who require basic savings/investment plans from higher-net-worth individuals who prefer higher-growth investment plans. 

By comparing two subreddits that are representative of financially challenged individuals and of risk-tolerant, high-net-worth individuals respectively, we aim to build a classification model that can be deployed on other forums, so that our financial advisors can reach out to individuals with a suitable financial plan. This should result in a higher conversion rate and more profit for the company. The subreddits we have chosen are r/wallstreetbets, for investors looking for higher-risk, high-growth plans, and r/povertyfinance, for investors who are better suited to a basic savings/investment plan.

## Data Scraping

In [1]:
import numpy as np
import pandas as pd
import requests
import datetime as dt
from dateutil.parser import *
from tqdm.notebook import tqdm
import random
import time
import json
import praw
from psaw import PushshiftAPI

In [6]:
# function to scrape data
def getPushshiftData(before, sub):
    """Fetch one page of submissions for a subreddit from the Pushshift API.

    Args:
        before: Value sent as the ``before`` query parameter. Callers in this
            notebook pass a Unix timestamp (seconds).
        sub: Subreddit name, sent as the ``subreddit`` query parameter.

    Returns:
        Whatever the JSON response stores under its ``'data'`` key.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a non-success HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the decoded JSON has no ``'data'`` key.
    """
    url = 'https://api.pushshift.io/reddit/search/submission/?subreddit=' \
            +str(sub)+'&before='+str(before)

    print(url)
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would freeze a long scraping loop.
    r = requests.get(url, timeout=30)
    # Surface rate limiting / server errors as an HTTP error rather than as a
    # confusing JSON decode failure further down.
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']
    

In [13]:
# function to store data and rerun getpushshiftdata function till we get 10k posts
def parse_posts(before, sub, n_posts=10000):
    """Page backwards through a subreddit until enough posts are collected.

    Repeatedly calls ``getPushshiftData``, each time passing the
    ``created_utc`` of the last post in the previous page as the new
    ``before`` value, and wraps every page in a DataFrame.

    Args:
        before: Starting ``before`` value for the first request (the notebook
            passes the current Unix time).
        sub: Subreddit name.
        n_posts: Stop once at least this many posts have been collected.
            Defaults to 10000.

    Returns:
        A list of DataFrames, one per fetched page. The total row count may
        exceed ``n_posts`` by up to one page, or fall short of it if the API
        returns an empty page first.

    Raises:
        Exception: Whatever ``getPushshiftData`` raised, if the first request
            fails or a later page still fails on its third attempt.
    """
    max_attempts = 3

    counter = 0
    list_of_dfs = []
    data = getPushshiftData(before, sub)

    # counter to ensure we get the requested number of posts
    while counter < n_posts:
        # An empty page means there is nothing older to fetch; indexing
        # data[-1] below would raise IndexError.
        if not data:
            print('No more posts returned; stopping at ' + str(counter))
            break

        current_df = pd.DataFrame(data)

        print('Added: ' + str(current_df.shape[0]))

        # fromtimestamp renders the epoch value in the machine's local timezone.
        print('Date from: ' + str(dt.datetime.fromtimestamp(data[-1]['created_utc'])))
        print('Date to: ' + str(dt.datetime.fromtimestamp(data[0]['created_utc'])))

        list_of_dfs.append(current_df)
        counter += len(data)
        print('Counter: ' + str(counter))

        # Target reached: skip the extra request whose result would be discarded.
        if counter >= n_posts:
            break

        # ensures that we keep updating the last date so we can get earlier posts
        before = data[-1]['created_utc']

        for attempt in range(1, max_attempts + 1):
            try:
                data = getPushshiftData(before, sub)
                break
            except Exception:
                # Deliberately broad: this is a best-effort retry around a
                # flaky network call. Unlike a bare ``except``, it still lets
                # KeyboardInterrupt stop the scrape.
                if attempt == max_attempts:
                    raise
                print('error ' + str(attempt))
                sleep_duration = random.randint(1,3)
                print(sleep_duration)
                time.sleep(sleep_duration)

    return list_of_dfs

In [14]:
# Scrape r/wallstreetbets backwards from the current time and persist it as CSV.

# Captured once so that later cells can start from the same reference time.
unix_now = round(time.time())

wsb_list = parse_posts(before=unix_now, sub='wallstreetbets')

# Stitch the per-request frames into a single table with a fresh 0..n-1 index.
wsb_df = pd.concat(objs=wsb_list, ignore_index=True)

display(wsb_df.head(n=5))
wsb_df.to_csv('./datasets/wallstreetbets_df.csv', index=False)

https://api.pushshift.io/reddit/search/submission/?subreddit=wallstreetbets&before=1648552593
Added: 25
Date from: 2022-03-29 17:37:05
Date to: 2022-03-29 19:05:39
Counter: 25
https://api.pushshift.io/reddit/search/submission/?subreddit=wallstreetbets&before=1648546625
Added: 25
Date from: 2022-03-29 16:04:19
Date to: 2022-03-29 17:35:59
Counter: 50
https://api.pushshift.io/reddit/search/submission/?subreddit=wallstreetbets&before=1648541059
Added: 25
Date from: 2022-03-29 14:18:12
Date to: 2022-03-29 15:37:33
Counter: 75
https://api.pushshift.io/reddit/search/submission/?subreddit=wallstreetbets&before=1648534692
Added: 25
Date from: 2022-03-29 13:29:48
Date to: 2022-03-29 14:13:55
Counter: 100
https://api.pushshift.io/reddit/search/submission/?subreddit=wallstreetbets&before=1648531788
Added: 25
Date from: 2022-03-29 12:02:10
Date to: 2022-03-29 13:24:48
Counter: 125
https://api.pushshift.io/reddit/search/submission/?subreddit=wallstreetbets&before=1648526530
Added: 25
Date from: 202

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,author_flair_template_id,gallery_data,media,media_embed,secure_media,secure_media_embed,author_cakeday,live_audio,poll_data,banned_by
0,[],False,Eurymanthus,,[],,text,t2_h8m31j98,False,False,...,,,,,,,,,,
1,[],False,roxredd,,[],,text,t2_4a5t4q0e,False,False,...,,,,,,,,,,
2,[],False,Some-rando_,,[],,text,t2_if6zz3kt,False,False,...,,,,,,,,,,
3,[],False,gsm1022,,[],,text,t2_6exlq346,False,False,...,,,,,,,,,,
4,[],False,gsm1022,,[],,text,t2_6exlq346,False,False,...,,,,,,,,,,


In [15]:
# Scrape r/povertyfinance backwards from the stored `unix_now` and persist it as CSV.

pfinance_list = parse_posts(before=unix_now, sub='povertyfinance')

# Stitch the per-request frames into a single table with a fresh 0..n-1 index.
pfinance_df = pd.concat(objs=pfinance_list, ignore_index=True)

display(pfinance_df.head(n=5))
pfinance_df.to_csv('./datasets/pfinance_df.csv', index=False)

https://api.pushshift.io/reddit/search/submission/?subreddit=povertyfinance&before=1648552593
Added: 25
Date from: 2022-03-28 22:30:05
Date to: 2022-03-29 18:26:41
Counter: 25
https://api.pushshift.io/reddit/search/submission/?subreddit=povertyfinance&before=1648477805
Added: 25
Date from: 2022-03-28 03:29:50
Date to: 2022-03-28 22:27:07
Counter: 50
https://api.pushshift.io/reddit/search/submission/?subreddit=povertyfinance&before=1648409390
Added: 25
Date from: 2022-03-27 05:58:33
Date to: 2022-03-28 03:12:57
Counter: 75
https://api.pushshift.io/reddit/search/submission/?subreddit=povertyfinance&before=1648331913
Added: 25
Date from: 2022-03-26 09:50:27
Date to: 2022-03-27 05:01:02
Counter: 100
https://api.pushshift.io/reddit/search/submission/?subreddit=povertyfinance&before=1648259427
Added: 25
Date from: 2022-03-25 21:39:13
Date to: 2022-03-26 09:45:00
Counter: 125
https://api.pushshift.io/reddit/search/submission/?subreddit=povertyfinance&before=1648215553
Added: 25
Date from: 202

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,author_cakeday,author_flair_template_id,author_flair_text_color,media_metadata,author_flair_background_color,distinguished,edited,banned_by,removal_reason,gilded
0,[],False,Which_Ebb_5978,,[],,text,t2_kco8kcei,False,False,...,,,,,,,,,,
1,[],False,Sufficient_Tooth_949,,[],,text,t2_7nevkexv,False,False,...,,,,,,,,,,
2,[],False,deskofanxiety,,[],,text,t2_594nzwq4,False,False,...,,,,,,,,,,
3,[],False,ProgramMaster8885,,[],,text,t2_k563ccqb,False,False,...,,,,,,,,,,
4,[],False,fasnysry,,[],,text,t2_iy80gmgq,False,False,...,,,,,,,,,,
