# NER for Extracting Stock Mentions on Reddit

## Data Extraction

This project is adapted (copied and edited) from the following article:

https://towardsdatascience.com/ner-for-extracting-stock-mentions-on-reddit-aa604e577be

# Build a class to extract data from the Reddit API

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import requests
import pandas as pd
from tqdm import tqdm

class Reddit:
    """Small client for the Reddit OAuth API that downloads new submissions.

    On construction it requests an OAuth bearer token with the "password"
    grant and stores the resulting request headers; ``get_new`` then pages
    through a subreddit's ``/new`` listing.

    Attributes:
        headers: Request headers (User-Agent plus the bearer authorization)
            sent with every API call.
        api: Base URL of the authenticated Reddit API.
    """

    def __init__(self, client_id, secret_token, username, password):
        """Authenticate and store the headers needed for later requests.

        Args:
            client_id: Client id of the Reddit app (HTTP basic-auth user).
            secret_token: Secret of the Reddit app (HTTP basic-auth password).
            username: Reddit account name used for the password grant.
            password: Password of that Reddit account.

        Raises:
            requests.HTTPError: If the token endpoint answers with an HTTP
                error status.
            KeyError: If the response body carries no ``access_token``.
        """
        # first create authentication object
        auth = requests.auth.HTTPBasicAuth(client_id, secret_token)
        # build login dictionary
        login = {'grant_type': 'password',
                 'username': username,
                 'password': password}
        # setup header info (incl description of API)
        headers = {'User-Agent': 'MyBot/0.0.1'}
        # send request for OAuth token
        res = requests.post('https://www.reddit.com/api/v1/access_token',
                            auth=auth, data=login, headers=headers)
        # surface HTTP failures directly instead of an opaque KeyError below
        res.raise_for_status()
        # pull auth bearer token from response
        token = res.json()['access_token']
        # add authorization to headers dictionary
        headers['Authorization'] = f'bearer {token}'
        # add headers dict to internal attributes
        self.headers = headers
        # and api
        self.api = 'https://oauth.reddit.com'

    @staticmethod
    def _thread_to_row(thread):
        """Flatten one listing child into the row dict stored per thread."""
        post = thread['data']
        return {
            # 'name' is stored as the row id; get_new also uses it as the
            # 'after' paging cursor
            'id': post['name'],
            'created_utc': int(post['created_utc']),
            'subreddit': post['subreddit'],
            'title': post['title'],
            'selftext': post['selftext'],
            'upvote_ratio': post['upvote_ratio'],
            'ups': post['ups'],
            'downs': post['downs'],
            'score': post['score'],
        }

    def get_new(self, subreddit, iters):
        """Download the newest threads of a subreddit.

        Requests ``/r/<subreddit>/new`` up to ``iters`` times with a limit of
        100 per request, passing the id of the last thread received as the
        ``after`` parameter of the next request. Stops early (printing
        'No more found') when a request returns no threads.

        Args:
            subreddit: Name of the subreddit, without the ``r/`` prefix.
            iters: Maximum number of requests to make.

        Returns:
            A ``pandas.DataFrame`` with one row per thread and the columns
            id, created_utc, subreddit, title, selftext, upvote_ratio, ups,
            downs and score. The frame is empty (no columns) when nothing
            was returned.

        Raises:
            requests.HTTPError: If a listing request answers with an HTTP
                error status.
        """
        # Collect plain dicts and build the frame once at the end:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame for every single row.
        rows = []
        # initialize parameters dictionary
        params = {'limit': 100}
        # iterate through several times to make sure we get all the data available
        for _ in tqdm(range(iters)):
            # make request
            res = requests.get(f'{self.api}/r/{subreddit}/new',
                               headers=self.headers,
                               params=params)
            res.raise_for_status()
            # parse the body once and reuse it
            children = res.json()['data']['children']
            # check that we returned something (if not we reached end)
            if not children:
                print('No more found')
                break
            # add info for each thread received
            rows.extend(self._thread_to_row(thread) for thread in children)
            # id of the last thread received becomes the next page cursor
            params['after'] = rows[-1]['id']
        return pd.DataFrame(rows)

# Create a Reddit account to use the API

Follow this link to learn how to use the Reddit API:

https://towardsdatascience.com/how-to-use-the-reddit-api-in-python-5e05ddfd1e5c

Save the username and password to "code.txt": the username on the first line and the password on the second line.

In [2]:
# Read the Reddit account credentials from "code.txt":
# line 1 holds the username, line 2 the password.
# The context manager closes the file handle once the lines are read.
with open("code.txt", "r") as f:
    lines = f.readlines()

# NOTE(review): the app credentials below are hard-coded in the notebook.
# Anyone with access to this file can use them; consider loading them from
# "code.txt" or an environment variable instead, and rotating these values.
CLIENT_ID = 'mxE3AUoy4bVKv6sQVE1UPg'
SECRET_TOKEN = 'QdF0DSYTBAEQjIHvghjOWhp41IE16Q'
# Strip the line terminator from both values; a trailing newline on the
# password would otherwise be sent as part of the login data.
USER = lines[0].rstrip('\n')
PWD = lines[1].rstrip('\n')

In [3]:
# Authenticate once; the resulting client is reused for every subreddit.
reddit = Reddit(
    client_id=CLIENT_ID,
    secret_token=SECRET_TOKEN,
    username=USER,
    password=PWD,
)

# List the stock/investing-related subreddits for NER extraction

In [4]:
# Subreddits to download, in the order they are processed.
SUB_list = [
    'investing', 'wallstreetbets', 'stocks', 'pennystocks',
    'robinhood', 'GME', 'amcstock', 'Cryptocurrency',
    'smallstreetbets', 'traders', 'Wallstreetbetsnew', 'options',
    'StockMarket', 'ethtrader',
]

# Loop through the subreddits and save the data

In [5]:
import os

# Make sure the output directory exists so to_csv does not fail on a
# fresh checkout.
os.makedirs('./data', exist_ok=True)

# Download each subreddit's newest threads and save them as a
# pipe-separated CSV, one file per subreddit.
for SUB in SUB_list:
    print('\n' + SUB)
    # at most 20 requests per subreddit
    data = reddit.get_new(SUB, 20)
    # Remove literal pipes from the text so they cannot clash with the
    # '|' separator. The pattern must be escaped: with regex=True a bare
    # '|' is an empty alternation that matches only the empty string, so
    # it would leave every pipe in place.
    data = data.replace({r'\|': ''}, regex=True)
    data.to_csv(f'./data/reddit_{SUB}_NEW.csv', sep='|', index=False)


investing


 50%|█████     | 10/20 [01:00<01:00,  6.09s/it]


No more found

wallstreetbets


 50%|█████     | 10/20 [01:38<01:38,  9.90s/it]


No more found

stocks


 50%|█████     | 10/20 [01:20<01:20,  8.00s/it]

No more found






pennystocks


 45%|████▌     | 9/20 [00:52<01:04,  5.86s/it]

No more found






robinhood


 20%|██        | 4/20 [00:22<01:29,  5.58s/it]


No more found

GME


 50%|█████     | 10/20 [00:59<00:59,  5.90s/it]


No more found

amcstock


 50%|█████     | 10/20 [01:15<01:15,  7.57s/it]


No more found

Cryptocurrency


 50%|█████     | 10/20 [01:34<01:34,  9.47s/it]


No more found

smallstreetbets


 50%|█████     | 10/20 [01:12<01:12,  7.28s/it]


No more found

traders


 40%|████      | 8/20 [00:30<00:45,  3.77s/it]


No more found

Wallstreetbetsnew


 45%|████▌     | 9/20 [01:03<01:17,  7.00s/it]


No more found

options


 45%|████▌     | 9/20 [00:59<01:13,  6.65s/it]


No more found

StockMarket


 45%|████▌     | 9/20 [02:02<02:30, 13.66s/it]

No more found

ethtrader



 45%|████▌     | 9/20 [01:02<01:15,  6.90s/it]

No more found



