In [5]:
import datetime as dt
import os
import re
import string
from collections import defaultdict

import nltk
import pandas as pd
import praw
from psaw import PushshiftAPI

pd.options.mode.chained_assignment = None

from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import SnowballStemmer

# English Snowball stemmer shared by the text-processing helpers.
snowball = SnowballStemmer(language="english")

# Fetch the NLTK resources used below (no-op when already up to date).
for _nltk_resource in ('words', 'stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Stop words are kept in a set so membership checks are constant time.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package words to
[nltk_data]     /Users/ferasdahrooge/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ferasdahrooge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ferasdahrooge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Create a list of tickers based on https://www.nasdaq.com/market-activity/stocks/screener

# keep_default_na=False stops pandas from parsing the literal symbol "NA"
# (and similar strings) as a missing value; str() would otherwise turn that
# missing value into the bogus ticker "nan".
tickers = pd.read_csv('listoftickers.csv', dtype={'Symbol': str}, keep_default_na=False)

# Symbols deliberately left out of the ticker list.
# NOTE(review): presumably excluded because they collide with forum jargon -- confirm.
excluded_symbols = {"OC", "DD", "TA"}

# Drop the excluded symbols, blank entries and every single-character ticker.
# Filtering (instead of list.remove) does not raise when a symbol is absent
# from the CSV.
tickers = [
    symbol
    for symbol in tickers.Symbol.to_list()
    if len(symbol) > 1 and symbol not in excluded_symbols
]

# Add SPY plus the few single-character tickers we still want to track,
# without duplicating a symbol that is already present.
for extra_symbol in (
    "SPY",
    "V",  # Visa
    "T",  # AT&T
    "C",  # Citigroup
):
    if extra_symbol not in tickers:
        tickers.append(extra_symbol)

# Tickers stay upper-case: matching in preprocess_text is case-sensitive.
# Filtering out tickers that are also plain English words (nltk `words`
# corpus) was considered but is not applied.

In [None]:
subreddit = 'wallstreetbets'

# Columns we want. Full List here https://melaniewalsh.github.io/Intro-Cultural-Analytics/04-Data-Collection/14-Reddit-Data.html
filters = ['id', 'title', 'selftext', 'author', 'score', 'num_comments']

# Set some maximum number of posts (per request, i.e. per day)
limit = 1000000

# Set minimum score (not applied here; the score filter runs after the
# scores have been refreshed from Reddit)
min_score = 100

# Include posts between these two dates: [start_time, end_time)
start_time = dt.datetime(2021, 2, 5)
end_time = dt.datetime(2022, 11, 15)

api = PushshiftAPI()

# Pull posts one day at a time, save as pickle.
# The loop walks a one-day window across the whole range. The previous guard
# compared end_time against a fixed date that was already in the past, so the
# loop body never ran.
one_day = dt.timedelta(days=1)
day_start = start_time
while day_start < end_time:
    day_end = min(day_start + one_day, end_time)

    api_request_generator = api.search_submissions(
            subreddit=subreddit,                  #Subreddit we want to audit
            after=int(day_start.timestamp()),     #Start of the day window
            before=int(day_end.timestamp()),      #End of the day window
            filter=filters,                       #Column names we want to retrieve
            limit=limit)

    df = pd.DataFrame([submission.d_ for submission in api_request_generator])

    # A day without submissions yields a frame with no columns; skip it
    # rather than fail on the missing 'created_utc' column. Later cells
    # already tolerate missing daily files.
    if not df.empty:
        # Convert timestamp format to datetime
        df['Time of Creation'] = df['created_utc'].map(dt.datetime.fromtimestamp)
        df = df.drop(['created_utc', 'created'], axis=1, errors='ignore')

        df.to_pickle("./pickles/old score by day/{}.pkl".format(
            str(day_start.year) + '-' + str(day_start.month) + '-' + str(day_start.day)))

    day_start = day_end
    

In [7]:
# Pushshift does not update the score
# Link to Reddit's API which we can use to get updated score based on the post ID
#
# Credentials are read from environment variables instead of being stored in
# the notebook. A missing variable raises KeyError naming the variable.
# SECURITY: the client secret and account password that were previously
# committed in this cell must be treated as leaked and rotated.
client_id = os.environ['REDDIT_CLIENT_ID']
client_secret = os.environ['REDDIT_CLIENT_SECRET']
username = os.environ['REDDIT_USERNAME']
password = os.environ['REDDIT_PASSWORD']
user_agent = 'ENSF612 - Pulling Score'

reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    password=password,
    user_agent=user_agent,
    username=username,
)

In [None]:
time = dt.datetime(2021, 2, 7)
min_score = 100

# Import raw pickle by day, replace the stale Pushshift score with the current
# Reddit score, keep posts above min_score, and save the result.
while (time <= dt.datetime(2021, 5, 1)):
    day_tag = str(time.year) + '-' + str(time.month) + '-' + str(time.day)
    # Advance first so every `continue` below moves on to the next day.
    time += dt.timedelta(days=1)

    try:
        df = pd.read_pickle("./pickles/old score by day/{}.pkl".format(day_tag))
    except FileNotFoundError:
        continue

    # Nothing to refresh (and no 'id' column to read) for an empty day.
    if df.empty:
        continue

    # Reddit "fullnames" for submissions are the post id prefixed with t3_
    df['new_id'] = "t3_" + df["id"]

    # reddit.info() does not yield items it cannot match, so the results can be
    # shorter than the request. Key the scores by post id and map them back;
    # assigning them positionally would attach scores to the wrong posts.
    current_scores = {
        item.id: item.score
        for item in reddit.info(fullnames=df['new_id'].to_list())
    }
    df['score'] = df['id'].map(current_scores)

    # Posts Reddit did not return have no score; drop them.
    df = df.dropna(subset=['score'])

    df = df[df['score'] > min_score]

    df.to_pickle("./pickles/new score by day/{}.pkl".format(day_tag + "_new"))

In [9]:
# Based on Dr. Uddin's A1 Solution, stemming has been added

# Matches every character that is not an ASCII letter
regex = re.compile('[^a-zA-Z]')

def removeNonAlpha(word):
    """Return ``word`` with every non-ASCII-letter character deleted."""
    letters_only = re.sub(regex, '', word)
    return letters_only
 
# Remove any special characters
def remove_specialchars(word):
    """Strip leading and trailing punctuation from ``word``.

    Punctuation inside the word (e.g. the dot in "BRK.B") is kept.

    Args:
        word: Token to clean; ``None`` and ``""`` are returned unchanged.

    Returns:
        The token without any ``string.punctuation`` characters at either end.
    """
    if word is None or word == "":
        return word
    # Strip the whole punctuation set in one call. Stripping one character at
    # a time while iterating a set depends on the (hash-randomised) iteration
    # order and can leave punctuation behind, e.g. "(.hello.)" -> ".hello.".
    return word.strip(string.punctuation)

# Stemming helper backed by the module-level Snowball stemmer
def stem(word):
    """Return the stem that the shared ``snowball`` stemmer produces for ``word``."""
    stemmed = snowball.stem(word)
    return stemmed

def default_zero():
    """Default factory for the ticker counters: every new key starts at 0."""
    return int()

# pass the sentences through this function to preprocess the text
def preprocess_text(text_input):
    """Clean a post's text and count the stock tickers mentioned in it.

    The text is split into sentences and then word tokens. Stop words are
    dropped, leading/trailing punctuation is stripped, tokens that are not
    tickers are lower-cased, non-letter characters are removed, and tokens
    shorter than three characters are dropped unless they were counted as a
    ticker.

    Args:
        text_input: Raw text (title and/or body). Anything that is not a
            string (e.g. None or NaN from a missing field) is treated as
            empty text.

    Returns:
        A tuple ``(processed_text, ticker_dict)``: the kept tokens joined by
        single spaces, and a defaultdict mapping each matched ticker to the
        number of times it occurred.
    """
    ticker_dict = defaultdict(default_zero)

    # Missing text shows up as None/NaN; the tokenizer only accepts strings.
    if not isinstance(text_input, str):
        return "", ticker_dict

    # Build the lookup once per call: set membership is constant time, whereas
    # scanning the ticker list for every token is linear in the list length.
    known_tickers = set(tickers)
    kept_words = []

    # Tokenize into sentences, then into words
    for sent in nltk.sent_tokenize(text_input):
        for word in nltk.word_tokenize(sent):

            # NOTE(review): stop words are dropped before ticker matching, so a
            # ticker whose lower-cased form is in stop_words (possibly "T") is
            # never counted -- confirm this is intended.
            if word.lower() in stop_words:
                continue
            word = remove_specialchars(word)

            # Ticker matching is case-sensitive (tickers are assumed to be all
            # caps); everything that is not a ticker is lower-cased.
            if word in known_tickers:
                ticker_dict[word] += 1
            else:
                word = word.lower()

            word = removeNonAlpha(word)

            # Remove words with length shorter than 3 characters if it's not a ticker
            if len(word) < 3 and word not in ticker_dict:
                continue

            kept_words.append(word)

    # Return the preprocessed text
    return " ".join(kept_words), ticker_dict

In [None]:
time = dt.datetime(2020, 11, 24)

# Drop rows that don't include ticker in title or body
while (time <= dt.datetime(2022, 11, 15)):
    day_tag = str(time.year) + '-' + str(time.month) + '-' + str(time.day)
    # Advance first so every `continue` below moves on to the next day.
    time += dt.timedelta(days=1)

    try:
        df = pd.read_pickle("./pickles/new score by day/{}_new.pkl".format(day_tag))
    except FileNotFoundError:
        continue

    if df.empty:
        continue

    # Create a new column that combines title and body
    df['fulltext'] = df['title'] + " " + df['selftext']

    # dropna returns a new frame; the result has to be assigned, otherwise
    # rows with missing text reach the tokenizer and make it fail.
    df = df.dropna(subset=['fulltext'])

    # The zip(*...) unpacking below needs at least one row.
    if df.empty:
        continue

    # Preprocess the title and body and determine if either contains a stock ticker
    df['fulltext_processed'], df['ticker'] = zip(*df['fulltext'].apply(preprocess_text))

    # Remove posts that don't contain a ticker in the title or body
    df = df[df['ticker'].map(len) > 0]

    df.to_pickle("./pickles/has ticker/{}.pkl".format(day_tag + "_ticker"))

In [10]:
time = dt.datetime(2019, 1, 1)

# For every daily "has ticker" file, replace the per-post ticker counts with
# the single most frequently mentioned ticker and save the result.
while time <= dt.datetime(2022, 11, 15):
    day_tag = "{}-{}-{}".format(time.year, time.month, time.day)
    # Step to the next day up front; the rest of the body only uses day_tag.
    time = time + dt.timedelta(days=1)

    try:
        df = pd.read_pickle("./pickles/has ticker/{}_ticker.pkl".format(day_tag))
    except FileNotFoundError:
        continue

    if not df.empty:
        # Highest-count ticker per post; on a tie max() keeps the first one
        # encountered in the dictionary.
        df['ticker'] = [
            max(counts, key=counts.get) for counts in df['ticker'].tolist()
        ]
        df.to_pickle("./pickles/Main ticker/{}.pkl".format(day_tag + "_main_ticker"))

In [12]:
time = dt.datetime(2019, 1, 1)

# Consolidate our data into one big dataframe.
# The daily frames are collected first and concatenated once; calling
# pd.concat inside the loop re-copies the accumulated data on every iteration.
# NOTE(review): the earlier cells iterate through 2022-11-15 while this range
# stops at 2022-11-14 -- confirm the last day is meant to be left out.
daily_frames = []
while (time <= dt.datetime(2022, 11, 14)):
    day_tag = str(time.year) + '-' + str(time.month) + '-' + str(time.day)
    # Advance first so the `continue` below moves on to the next day.
    time += dt.timedelta(days=1)

    try:
        daily_frames.append(
            pd.read_pickle("./pickles/Main ticker/{}_main_ticker.pkl".format(day_tag)))
    except FileNotFoundError:
        continue

# pd.concat raises on an empty list, so fall back to an empty frame when no
# daily file was found.
if daily_frames:
    full_dataset = pd.concat(daily_frames, ignore_index=True)
else:
    full_dataset = pd.DataFrame()

full_dataset.to_csv('full_dataset_main.csv')
full_dataset.to_pickle('full_dataset_main.pkl')