In [56]:
import pandas as pd


In [57]:
# We downloaded two datasets from the nasdaq ftp, nasdaqListed and otherlisted
# here we want to analyze the nasdaq downloaded dataset
df_nasdaq_list = pd.read_csv('nasdaqlisted.txt', delimiter='|')
df_nasdaq_list.head(10)

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares
0,AACG,ATA Creativity Global - American Depositary Sh...,G,N,N,100.0,N,N
1,AACQ,Artius Acquisition Inc. - Class A Common Stock,S,N,N,100.0,N,N
2,AACQU,Artius Acquisition Inc. - Unit consisting of o...,S,N,N,100.0,N,N
3,AACQW,Artius Acquisition Inc. - Warrant,S,N,N,100.0,N,N
4,AAL,"American Airlines Group, Inc. - Common Stock",Q,N,N,100.0,N,N
5,AAME,Atlantic American Corporation - Common Stock,G,N,N,100.0,N,N
6,AAOI,"Applied Optoelectronics, Inc. - Common Stock",G,N,N,100.0,N,N
7,AAON,"AAON, Inc. - Common Stock",Q,N,N,100.0,N,N
8,AAPL,Apple Inc. - Common Stock,Q,N,N,100.0,N,N
9,AAWW,Atlas Air Worldwide Holdings - Common Stock,Q,N,N,100.0,N,N


In [58]:
# here we want to analyze the nasdaq dataset for the other stocks including NYSE
df_other_list = pd.read_csv('otherlisted.txt', delimiter='|')
df_other_list.head(10)

Unnamed: 0,ACT Symbol,Security Name,Exchange,CQS Symbol,ETF,Round Lot Size,Test Issue,NASDAQ Symbol
0,A,"Agilent Technologies, Inc. Common Stock",N,A,N,100.0,N,A
1,AA,Alcoa Corporation Common Stock,N,AA,N,100.0,N,AA
2,AAA,Listed Funds Trust AAF First Priority CLO Bond...,P,AAA,Y,100.0,N,AAA
3,AAAU,Goldman Sachs Physical Gold ETF Shares,P,AAAU,Y,100.0,N,AAAU
4,AAC.U,"Ares Acquisition Corporation Units, each consi...",N,AAC.U,N,100.0,N,AAC=
5,AADR,AdvisorShares Dorsey Wright ADR ETF,P,AADR,Y,100.0,N,AADR
6,AAIC,Arlington Asset Investment Corp Class A (new),N,AAIC,N,100.0,N,AAIC
7,AAIC$B,Arlington Asset Investment Corp 7.00%,N,AAICpB,N,100.0,N,AAIC-B
8,AAIC$C,Arlington Asset Investment Corp 8.250% Seies C...,N,AAICpC,N,100.0,N,AAIC-C
9,AAMC,Altisource Asset Management Corp Com,A,AAMC,N,100.0,N,AAMC


In [59]:
# For this study we are planning on using just stocks Symbols and Names,
# to understand them we are going to print their statistics

In [60]:
df_nasdaq_list[['Symbol', 'Security Name']].describe()

Unnamed: 0,Symbol,Security Name
count,4192,4191
unique,4192,4186
top,FRME,NASDAQ TEST STOCK
freq,1,4


In [61]:
df_other_list[['ACT Symbol', 'Security Name']].describe()

Unnamed: 0,ACT Symbol,Security Name
count,5707,5706
unique,5707,5686
top,BFT.U,NYSE Test One Common Stock
freq,1,6


In [62]:
# As we can see, most stocks are unique on each,
# which is a good initial sign that the dataset is probably good
# for now we are going to keep them like this and merge into a second dataframe
# to be used for lookups in the future.

In [63]:
#First we need to rename different columns to match on concat.
df_other_list.rename(columns={'ACT Symbol': 'Symbol'}, inplace=True)
#Now we can concat these datasets into a single stocks dataframe.
stocks = pd.concat([df_nasdaq_list[['Symbol', 'Security Name']],
                    df_other_list[['Symbol', 'Security Name']]])
stocks.describe()

Unnamed: 0,Symbol,Security Name
count,9899,9897
unique,9898,9871
top,File Creation Time: 0222202118:02,NYSE Test One Common Stock
freq,2,6


In [64]:
# Now we have our initial stock dataset ready.
# lets keep this dataset to the side now for later usage.

In [65]:
# Next step lets get some r/wallStreetBets dataset, import and do some data exploration.

In [66]:
# First import into a Dataframe
df_wsb = pd.read_csv('reddit_wsb.csv')

In [67]:
# lets see a sneak peek of the data,
# as you can see, the dataset has a lot of information,
# but for our case we will start exploring on title, body, timestamp and the unique identifier
df_wsb.head(10)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56
5,WE BREAKING THROUGH,405,l6uf7d,https://i.redd.it/2wef8tc062e61.png,84,1611862000.0,,2021-01-28 21:26:30
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1611862000.0,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27
7,THIS IS THE MOMENT,405,l6ub9l,https://www.reddit.com/r/wallstreetbets/commen...,178,1611862000.0,Life isn't fair. My mother always told me that...,2021-01-28 21:19:31
8,Currently Holding AMC and NOK - Is it retarded...,200,l6ub4i,https://i.redd.it/6k2z7ouo42e61.png,161,1611862000.0,,2021-01-28 21:19:16
9,I have nothing to say but BRUH I am speechless...,291,l6uas9,https://i.redd.it/bfzzw2yo42e61.jpg,27,1611862000.0,,2021-01-28 21:18:37


In [68]:
#Now lets run some basic summary, to check on these attributes
print(df_wsb[['title', 'body', 'timestamp','id']].describe()
      .loc[['count','unique','freq']])

        title   body timestamp     id
count   36668  18534     36668  36668
unique  35795  18295     27008  36668
freq       37     17        14      1
