### The `dnalab` Python package makes it easy to access data.

In [3]:
import dnalab

Here you can declare any file, folder, or code file that your notebook depends on, following the commented example below.

In [4]:
# required_file = "file.csv"
# required_folder = "folder"
# required_code = "code.py"

From the Morningstar Extension on the left, try dragging a table of data, a Direct list, or Direct Search Criteria into the notebook.

In [5]:
import glob
import os
from datetime import datetime

import pandas as pd
import spacy

In [6]:
# Locations of the raw Reddit exports, resolved against the directory the
# kernel is currently running in.
PROJECT_PATH = os.getcwd()
POSTS_PATH = 'data/reddit_posts'
COMMENTS_PATH = 'data/reddit_comments'
POSTS_DATA_PATH, COMMENTS_DATA_PATH = (
    os.path.join(PROJECT_PATH, relative_dir)
    for relative_dir in (POSTS_PATH, COMMENTS_PATH)
)
print(POSTS_DATA_PATH, COMMENTS_DATA_PATH)

/home/rochak.chandra@morningstar.com/Rochak/wsb-hackathon/data/reddit_posts /home/rochak.chandra@morningstar.com/Rochak/wsb-hackathon/data/reddit_comments


### Getting Dataframes

In [7]:
def get_df(input_path, output_path, month=3):
    """Merge every CSV in a folder into one cleaned, date-indexed CSV file.

    Steps:
      1. Read all ``*.csv`` files directly inside ``input_path`` and stack them.
      2. Drop rows whose ``text`` is missing, ``"[removed]"`` or ``"[deleted]"``.
      3. Build a ``date`` index from the ``timestamp`` column (epoch seconds,
         converted with ``datetime.fromtimestamp``, i.e. the machine's local
         time zone).
      4. Drop the first remaining column, sort by date and keep only the rows
         whose date falls in ``month``.
      5. Write the result to ``output_path``.

    Args:
        input_path: Folder containing the raw CSV files. Each file must have
            ``text`` and ``timestamp`` columns.
        output_path: Destination path of the combined CSV.
        month: Calendar month (1-12) to keep. Defaults to 3, the value that
            was previously hard-coded.

    Returns:
        None. The result is written to ``output_path``.

    Raises:
        ValueError: If ``input_path`` contains no ``*.csv`` files.
    """
    # NOTE: this changes a process-wide pandas display option; kept because
    # later cells may rely on all columns being shown.
    pd.set_option('display.max_columns', None)

    # Sorted so the concatenation order does not depend on the file system.
    csv_paths = sorted(glob.glob(os.path.join(input_path, "*.csv")))
    if not csv_paths:
        raise ValueError(f"No CSV files found in {input_path!r}")

    combined = pd.concat(
        (pd.read_csv(path, index_col=None) for path in csv_paths),
        axis=0,
        ignore_index=True,
    )

    text = combined['text']
    keep = text.notna() & ~text.isin(["[removed]", "[deleted]"])
    # Explicit copy: a column is added below, and writing into a filtered
    # view is what triggers pandas' SettingWithCopy warning.
    kept = combined.loc[keep].copy()

    # pd.to_datetime guarantees a datetime dtype even when no rows survive
    # the filter, so the `.index.month` access below cannot fail.
    kept['date'] = pd.to_datetime(kept['timestamp'].apply(datetime.fromtimestamp))
    indexed = kept.set_index('date')

    # The first column is dropped by position.
    # NOTE(review): presumably the unnamed index column written by an earlier
    # export - confirm against the raw files.
    result = indexed.drop(indexed.columns[0], axis=1).sort_index()
    result = result[result.index.month == month]
    result.to_csv(output_path)
    

    
def get_input(data_path, column):
    """Return the values of one column of a date-indexed CSV as a list.

    Args:
        data_path: Path of a CSV file that has a ``date`` column; that column
            becomes the (parsed) index.
        column: Name of the column to extract.

    Returns:
        The column's values as a plain Python list, in file order.
    """
    frame = pd.read_csv(data_path, parse_dates=True, index_col='date')
    selected = frame[column]
    return selected.tolist()

In [8]:
# Lower-cased entity texts that `process` ignores even when they are tagged ORG.
BLACKLIST = ['bot', 'nyse']

### Extract Symbols

In [None]:
def process(raw_comment, nlp, blacklist=None):
    """Extract the unique organisation names mentioned in a piece of text.

    Args:
        raw_comment: Text to analyse. Anything that is not a ``str`` (for
            example a NaN from a missing cell) yields an empty list.
        nlp: Callable that takes the text and returns an object whose ``ents``
            attribute is an iterable of entities, each exposing ``text`` and
            ``label_`` (e.g. a loaded spaCy pipeline).
        blacklist: Optional iterable of entity texts to ignore, compared
            case-insensitively. Defaults to the module-level ``BLACKLIST``.

    Returns:
        A list of distinct entity texts labelled ``"ORG"``, in order of first
        appearance.
    """
    if blacklist is None:
        blacklist = BLACKLIST
    if not isinstance(raw_comment, str):
        return []

    ignored = {term.lower() for term in blacklist}
    doc = nlp(raw_comment)
    # Collect the entity *text*. The previous `res += ent` extended the list
    # by iterating the entity, so it stored its individual tokens instead of
    # the entity string and de-duplication never matched repeated names.
    names = [
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and ent.text.lower() not in ignored
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(names))

In [12]:
# `parse_dates=True` only tries to parse the index, which here is the default
# integer index, so the `date` column stayed a plain string. Name the column
# explicitly to get real datetimes.
posts_df = pd.read_csv(os.path.join(PROJECT_PATH, 'data/posts_df.csv'), parse_dates=['date'])

In [13]:
# `parse_dates=True` only tries to parse the index, which here is the default
# integer index, so the `date` column stayed a plain string. Name the column
# explicitly to get real datetimes.
comments_df = pd.read_csv(os.path.join(PROJECT_PATH, 'data/comments_df.csv'), parse_dates=['date'])

In [14]:
posts_df.head(10)

Unnamed: 0,date,post_id,text,title,upvote_ratio,timestamp,author
0,2021-03-02 09:25:06,lw4h9d,"So loving this GME still going strong, that I ...",GME and the righteous options?,1.0,1614698706,Kick-Effective
1,2021-03-02 09:25:57,lw4i0b,Is this being pushed here. Been shareholder f...,Rocket Companies,1.0,1614698757,Master-Unit6692
2,2021-03-02 09:26:04,lw4i3o,"Gentlemen,\n\nNow that the yield bubble is pop...",Cathie Wood said PLTR is mandatory,1.0,1614698764,boomerbubblebuster
3,2021-03-02 09:26:29,lw4ihe,CVSI is on the move up! Buy low sell high! Hav...,CVSI,1.0,1614698789,BigShr1mp
4,2021-03-02 09:26:36,lw4ilh,There is probably some rule against this threa...,NOT the GME thread but share your thoughts here,1.0,1614698796,SwedishKillerChef
5,2021-03-02 09:26:47,lw4is2,Fk these mods,Gee Emm Eey 3/2/21,1.0,1614698807,Mosaikmuster
6,2021-03-02 09:27:06,lw4j12,Because apparently so,"RKT MEGATHREAD March 2, 2020",1.0,1614698826,iamzyb
7,2021-03-02 09:27:42,lw4jj8,🚀🚀🚀 BankOfSabadell 🚀🚀🚀🚀\n\n*Processing img 0b...,BankOfSanadell -- Spain,1.0,1614698862,NoFlySS
8,2021-03-02 09:27:57,lw4jqn,Mods surely know that no GME thread = spam in ...,Something fishy is going on?,1.0,1614698877,Crowbar_Freeman
9,2021-03-02 09:28:07,lw4juw,CBD Life Sciences Inc. is up today. It's a C...,CBDL Up,1.0,1614698887,ThisGuyAlright


In [15]:
comments_df.head(10)

Unnamed: 0,date,comment_id,text,score,awards,timestamp,author,\t\t
0,2021-03-02 09:27:05,gpf8jcq,Heavy resistance at 124! Needs to move through...,1,0,1614698825,jsams14,
1,2021-03-02 09:27:25,gpf8ky5,Honestly mods why didn't you make an actual GM...,2,0,1614698845,SwedishKillerChef,
2,2021-03-02 09:28:03,gpf8nvm,Squeeze,2,0,1614698883,elliskj1979,
3,2021-03-02 09:28:07,gpf8o7q,And there she goes (:,2,0,1614698887,SARMY1K,
4,2021-03-02 09:29:00,gpf8sfw,Boom you see how quick she ran after blasting ...,1,0,1614698940,jsams14,
5,2021-03-02 09:29:38,gpf8vfy,No shit!\n\nhttps://fintel.io/ss/us/rkt,1,0,1614698978,robertleeblairjr,
6,2021-03-02 09:29:40,gpf8vly,Let's just talk GME here,3,0,1614698980,HouseDowningVicodin,
7,2021-03-02 09:29:45,gpf8vyp,Fr,1,0,1614698985,420yolotrillswag,
8,2021-03-02 09:29:46,gpf8w2i,Holding RKT and GME. Suck on that Mods,1,0,1614698986,gswblu3-1lead,
9,2021-03-02 09:29:55,gpf8wqh,RKT TO THE MOON!!! 🚀🚀🌚🌚,1,0,1614698995,Young0Ice,


In [16]:
# Small working samples of both frames. `.copy()` makes them independent of
# the source frames: a `tokens` column is assigned to `comments_df_sample`
# further down, and assigning into a slice triggers SettingWithCopy.
# `.loc[1:10]` is label-based and inclusive, so it selects index labels 1-10.
# NOTE(review): label 0 is skipped - confirm that this is intended.
comments_df_sample = comments_df.loc[1:10, ['comment_id', 'text', 'timestamp']].copy()
posts_df_sample = posts_df.loc[1:10, ['post_id', 'text', 'timestamp']].copy()

In [1]:
!pip3 install -U spacy

Collecting spacy
  Downloading spacy-3.0.5-cp38-cp38-manylinux2014_x86_64.whl (12.9 MB)
[K     |████████████████████████████████| 12.9 MB 14.8 MB/s eta 0:00:01
[?25hCollecting pathy>=0.3.5
  Downloading pathy-0.4.0-py3-none-any.whl (36 kB)
Collecting thinc<8.1.0,>=8.0.2
  Downloading thinc-8.0.2-cp38-cp38-manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 108.3 MB/s eta 0:00:01
[?25hCollecting catalogue<2.1.0,>=2.0.1
  Downloading catalogue-2.0.1-py3-none-any.whl (9.6 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp38-cp38-manylinux2014_x86_64.whl (35 kB)
Collecting wasabi<1.1.0,>=0.8.1
  Downloading wasabi-0.8.2-py3-none-any.whl (23 kB)
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting pydantic<1.8.0,>=1.7.1
  Downloading pydantic-1.7.3-cp38-cp38-manylinux2014_x86_64.whl (12.2 MB)
[K     |████████████████████████████████| 12.2 MB 85.7 MB/s eta 0:00:01
Collecting murmurhash<1.1.0,>=0.28.0
  D

In [2]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 18.1 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
!python3 -m spacy download en_core_web_trf

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
comments_df_sample['tokens'] = comments_df_sample['text'].apply(lambda x: process(x, nlp))

In [None]:
comments_df_sample.head(10)