In [2]:
import pandas as pd

In [3]:
# We downloaded two datasets from the nasdaq ftp, nasdaqListed and otherlisted
# here we want to analyze the nasdaq downloaded dataset
df_nasdaq_list = pd.read_csv('datainputs/nasdaqlisted.txt', delimiter='|')
df_nasdaq_list.head(5)

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares
0,AACG,ATA Creativity Global - American Depositary Sh...,G,N,N,100,N,N
1,AACQ,Artius Acquisition Inc. - Class A Common Stock,S,N,N,100,N,N
2,AACQU,Artius Acquisition Inc. - Unit consisting of o...,S,N,N,100,N,N
3,AACQW,Artius Acquisition Inc. - Warrant,S,N,N,100,N,N
4,AAL,"American Airlines Group, Inc. - Common Stock",Q,N,N,100,N,N


In [4]:
# here we want to analyze the nasdaq dataset for the other stocks including NYSE
df_other_list = pd.read_csv('datainputs/otherlisted.txt', delimiter='|')
df_other_list.head(5)

Unnamed: 0,ACT Symbol,Security Name,Exchange,CQS Symbol,ETF,Round Lot Size,Test Issue,NASDAQ Symbol
0,A,"Agilent Technologies, Inc. Common Stock",N,A,N,100,N,A
1,AA,Alcoa Corporation Common Stock,N,AA,N,100,N,AA
2,AAA,Listed Funds Trust AAF First Priority CLO Bond...,P,AAA,Y,100,N,AAA
3,AAAU,Goldman Sachs Physical Gold ETF Shares,P,AAAU,Y,100,N,AAAU
4,AAC.U,"Ares Acquisition Corporation Units, each consi...",N,AAC.U,N,100,N,AAC=


In [5]:
# For this study we are planning on using just stocks Symbols and Names,
# to understand them we are going to print their statistics

In [6]:
df_nasdaq_list[['Symbol', 'Security Name']].describe()

Unnamed: 0,Symbol,Security Name
count,4191,4191
unique,4191,4186
top,SYRS,NASDAQ TEST STOCK
freq,1,4


In [7]:
df_other_list[['ACT Symbol', 'Security Name']].describe()

Unnamed: 0,ACT Symbol,Security Name
count,5706,5706
unique,5706,5686
top,PWC,NYSE Test One Common Stock
freq,1,6


In [8]:
# As we can see, most stocks are unique on each,
# which is a good initial sign that the dataset is probably good
# for now we are going to keep them like this and merge into a second dataframe
# to be used for lookups in the future.

In [9]:
#First we need to rename different columns to match on concat.
df_other_list.rename(columns={'ACT Symbol': 'Symbol'}, inplace=True)
#Now we can concat these datasets into a single stocks dataframe.
stocks = pd.concat([df_nasdaq_list[['Symbol', 'Security Name']],
                    df_other_list[['Symbol', 'Security Name']]])
stocks.to_csv('datawork/stocks.csv', sep='|')
stocks.describe()

Unnamed: 0,Symbol,Security Name
count,9897,9897
unique,9897,9871
top,PWC,NYSE Test One Common Stock
freq,1,6


In [10]:
# Now we have our initial stock dataset ready.
# lets keep this dataset to the side now for later usage.

In [11]:
# Next step lets get some r/wallStreetBets dataset, import and do some data exploration.

In [12]:
# First import into a Dataframe
df_wsb = pd.read_csv('datainputs/reddit_wsb.csv')

In [13]:
# lets see a sneak peek of the data,
# as you can see, the dataset has a lot of information,
# but for our case we will start exploring on title, body, timestamp and the unique identifier
df_wsb.head(5)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56


In [14]:
#Now lets run some basic summary, to check on these attributes
print(df_wsb[['title', 'body', 'timestamp','id']].describe()
      .loc[['count','unique','freq']])

        title   body timestamp     id
count   36668  18534     36668  36668
unique  35795  18295     27008  36668
freq       37     17        14      1


In [15]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt') #tokenizer
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

wnl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(wnl.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fernak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/fernak/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/fernak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/fernak/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/fernak/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:

wordlist = []

for title in df_wsb['title']:
    words = nltk.word_tokenize(lemmatize_sentence(title))
    for word in words:
        if not word in stop_words:
            wordlist.append(word)
print("size: " + str(len(wordlist)))

size: 353094


In [17]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def return_results(list_of_dicts, query, threshold):
    scores = []
    for index, item in enumerate(list_of_dicts):
        values = list(item.values())
        ratios = [fuzz.ratio(str(query), str(value)) for value in values] # ensure both are in string
        scores.append({ "index": index, "score": max(ratios)})

    filtered_scores = [item for item in scores if item['score'] >= threshold]
    sorted_filtered_scores = sorted(filtered_scores, key = lambda k: k['score'], reverse=True)
    filtered_list_of_dicts = [ list_of_dicts[item["index"]] for item in sorted_filtered_scores ]
    return filtered_list_of_dicts

# def wordListToFreqDict(wordlist):
#     wordfreq = [wordlist.count(p) for p in wordlist]
#     return dict(list(zip(wordlist,wordfreq)))
# #
# def sortFreqDict(freqdict):
#     aux = [(freqdict[key], key) for key in freqdict]
#     aux.sort()
#     aux.reverse()
#     return aux

# print(sortFreqDict(wordListToFreqDict(wordlist)))
stock_dict_list = stocks.to_dict()
return_results(stock_dict_list,'GameStop',10)
# stock_dict = pd.Series(stocks['Security Name'].values,index=stocks['Symbol']).to_dict()
# wd={}
# for word in wordlist:
#     # if stock_dict.get(word):
#         if word in wd:
#             wd[word] = wd[word]+1
#         else:
#             wd[word]=1
# print(sortFreqDict(wd))

AttributeError: 'str' object has no attribute 'values'

In [None]:
# for word in words:
# print(df_wsb['title'])

In [17]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Instantiate the sentiment intensity analyzer with the existing lexicon
vader = SentimentIntensityAnalyzer()

# Update the lexicon
# New words and values
new_words = {
    'yolo': 100,
    'dd': 25,
    'double down': 25,
    'moon': 25,
    'dh':20,
    'diamond hands':20,
    'hold': 20,

    # Need to understand better the following ones
    'bullish': 10,
    'BTFD': 5,
    'FD': 5,

    'paper hands': -5,
    'bagholder': -5,
    'bearish':-10,

    # usual journalist stock jargon
    'crushes': 10,
    'beats': 5,
    'misses': -5,
    'trouble': -10,
    'falls': -100,
}
vader.lexicon.update(new_words)

# scores = df_wsb['body'].astype(str).apply(vader.polarity_scores)
scores = df_wsb['title'].apply(vader.polarity_scores)
scores_df = pd.DataFrame.from_records(scores)
scored_news =pd.concat([df_wsb['timestamp'],df_wsb['title'],df_wsb['body'], scores_df], axis=1)
scored_news

# scores = df_wsb['body'].apply(vader.polarity_scores)
# scores_df = pd.DataFrame.from_records(scores)
# scored_news = pd.concat([df_wsb['body'], scores_df], axis=1)
# scored_news

In [None]:
# from gensim.models import Word2Vec
#
# model = Word2Vec()