In [None]:
# import relevant libraries
import gc
import pandas as pd
from utilities import *
from config import *
import os
from collections import Counter

# Load the spaCy "en_core_web_trf" pipeline once at module level; ner_title_extract_orgs() below reuses it.
# NOTE(review): `spacy` is not imported explicitly in this cell — presumably it arrives
# through one of the star imports above; confirm.
nlp = spacy.load("en_core_web_trf")
print("Spacy model loaded")

def ner_title_extract_orgs(title):
    """Return the lower-cased text of every ORG entity found in ``title``.

    The title is run through the module-level ``nlp`` pipeline; only entities
    whose label is ``'ORG'`` are kept, in the order they appear in ``doc.ents``.

    Args:
        title: Text of a post title to analyse.

    Returns:
        list[str]: Lower-cased entity texts (empty when no ORG entity is found).
    """
    org_names = []
    for entity in nlp(title).ents:
        if entity.label_ == 'ORG':
            org_names.append(entity.text.lower())
    return org_names

# Data root: the "data" directory under the current working directory.
base_dir = f"{os.getcwd()}/data"

# Keep only the entries of the data root that are directories.
all_items = os.listdir(base_dir)
folders_only = []
for item in all_items:
    if os.path.isdir(os.path.join(base_dir, item)):
        folders_only.append(item)

# Read the current date once so every path built later refers to the same day.
today_parts = time.strftime('%Y/%m/%d').split('/')
year_string = today_parts[0]
month_string = today_parts[1]
day_string = today_parts[2]

# Credentials come from the project helper; the key is also passed to query_openai_api later.
variables = GetEnvironmentVariables()
openai.api_key = variables["OPENAI_API_KEY"]
print("OpenAI API key loaded")

# Sentiment classifier shared by every get_sentiment_score() call.
sentiment_model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
distilled_student_sentiment_classifier = pipeline(model=sentiment_model_name, top_k=None)
print("Distilled student sentiment classifier loaded")

# Reference table of known symbols; its 'Symbol' column is used when matching titles.
print("reading all_stock_symbols.csv")
all_symbols = pd.read_csv('data/all_stock_symbols.csv')

def _is_missing(value):
    """Return True when ``value`` is None or a float NaN (an empty pandas cell)."""
    # The isinstance guard keeps pd.isna from being applied element-wise to lists/arrays.
    return value is None or (isinstance(value, float) and pd.isna(value))


def _aggregate_symbol_sentiment(symb_sent, max_symbols_per_post=3):
    """Sum sentiment scores per symbol, plus a 'total' entry over all counted posts.

    Args:
        symb_sent: DataFrame with the columns 'final_symbols' (a sized iterable
            of symbols per post) and 'pos_neu_neg' (a sequence indexed 0, 1, 2
            whose items are accumulated into pos_tag, neu_tag and neg_tag).
        max_symbols_per_post: Posts listing more symbols than this only
            contribute to 'total', not to any individual symbol.

    Returns:
        dict: ``{key: {'counter', 'pos_tag', 'neu_tag', 'neg_tag'}}`` where key
        is 'total' or a symbol. 'total' is always present.
    """
    def _new_bucket():
        return {'counter': 0, 'pos_tag': 0, 'neu_tag': 0, 'neg_tag': 0}

    counter = {'total': _new_bucket()}

    for _, row in symb_sent.iterrows():
        scores = row['pos_neu_neg']
        # Skip None or NaN values: a post without scores cannot be indexed below.
        if _is_missing(scores):
            continue

        # A post without symbols still counts toward 'total'.
        symbols = row['final_symbols']
        if _is_missing(symbols):
            symbols = []

        buckets = [counter['total']]
        if len(symbols) <= max_symbols_per_post:
            buckets.extend(counter.setdefault(symbol, _new_bucket()) for symbol in symbols)

        for bucket in buckets:
            bucket['counter'] += 1
            bucket['pos_tag'] += scores[0]
            bucket['neu_tag'] += scores[1]
            bucket['neg_tag'] += scores[2]

    return counter


# Build one sentiment.csv per (folder, listing) for today's partition.
for folder in folders_only:
    # Named `listing` (not `str`) so the built-in `str` is not rebound at module level.
    for listing in ['hot', 'top']:
        day_dir = f"{base_dir}/{folder}/{listing}/year={year_string}/month={month_string}/day={day_string}"
        s_file = f"{day_dir}/posts.csv"
        c_file = f"{day_dir}/comments.csv"
        u_file = f"{day_dir}/sentiment.csv"

        # Already computed for today: do not redo the model and API calls.
        if os.path.exists(u_file):
            print(f"sentiment file already exists: {u_file}")
            continue

        # Without both inputs there is nothing to score; skip instead of aborting the whole run.
        missing_inputs = [path for path in (s_file, c_file) if not os.path.exists(path)]
        if missing_inputs:
            print(f"input file(s) missing, skipping: {missing_inputs}")
            continue

        print("stocks file = ", s_file)
        print("comments file = ", c_file)
        print("sentiment file = ", u_file)
        print("--------------------------------creating sentiment file for: ", folder, " --------------------------------")

        stocks_posts = pd.read_csv(s_file)
        stocks_comments = pd.read_csv(c_file)

        print("enriching dataframe with AI and NLP")
        stocks_posts = stocks_posts.drop(columns=['url', 'subreddit', 'author'])
        stocks_posts['matching_symbols'] = stocks_posts['title'].apply(lambda x: find_stock_symbols_in_title(x, all_symbols['Symbol']))
        stocks_posts['matching_orgs_spacy'] = stocks_posts['title'].apply(ner_title_extract_orgs)
        stocks_posts['matching_symbols_ai'] = stocks_posts['title'].apply(lambda x: query_openai_api(x, variables["OPENAI_API_KEY"]))
        stocks_posts['matching_orgs_nltk'] = stocks_posts['title'].apply(nltk_extract_symbols)
        stocks_posts['final_symbols'] = stocks_posts.apply(lambda row: extract_symbols_from_df_row(row, all_symbols), axis=1)
        # Project helper; called again after scoring, as in the original flow.
        verify_posts_and_comments_unique(stocks_posts, stocks_comments)

        print("Getting sentiment score")
        stocks_posts['pos_neu_neg'] = stocks_posts.apply(lambda row: get_sentiment_score(row, stocks_comments, distilled_student_sentiment_classifier), axis=1)
        verify_posts_and_comments_unique(stocks_posts, stocks_comments)

        symb_sent = stocks_posts[['final_symbols', 'pos_neu_neg']]
        counter = _aggregate_symbol_sentiment(symb_sent)

        # One row per key ('total' and each symbol), most frequently mentioned first.
        df_t = pd.DataFrame(counter).transpose().sort_values(by='counter', ascending=False)

        print("df_t.to_csv(u_file, index=True)")
        df_t.to_csv(u_file, index=True)

        # Release the per-folder frames before loading the next listing.
        print("deleting dataframes")
        del stocks_posts, stocks_comments, symb_sent, counter, df_t
        gc.collect()


In [None]:
# Collect today's sentiment tables (one per folder and listing) that exist on disk.
dfs = []

for folder in folders_only:
    # Named `listing` (not `str`) so the built-in `str` is not rebound at module level.
    for listing in ['hot', 'top']:
        u_file = f"{base_dir}/{folder}/{listing}/year={year_string}/month={month_string}/day={day_string}/sentiment.csv"
        print(u_file)

        # Listings without a sentiment file for today are simply left out.
        if os.path.exists(u_file):
            dfs.append(pd.read_csv(u_file))
            
            

In [3]:
# Merge today's per-listing sentiment tables into one table keyed by symbol.
if dfs:
    # The first column of each frame holds the row keys; name it "symbol".
    # Reassigning (rather than renaming in place) keeps re-runs of this cell predictable.
    dfs = [df.rename(columns={df.columns[0]: "symbol"}) for df in dfs]

    # Rows sharing a symbol across files are summed column by column.
    combined_df = pd.concat(dfs).groupby('symbol', as_index=False).sum()
    combined_df = combined_df.sort_values(by='pos_tag', ascending=False)
    combined_df.to_csv(f'data/combined_df_year={year_string}_month={month_string}_day={day_string}.csv', index=False)
else:
    # pd.concat raises ValueError on an empty list, so report and skip the write instead.
    print("no sentiment files found for today; nothing to combine")
    combined_df = pd.DataFrame(columns=['symbol', 'counter', 'pos_tag', 'neu_tag', 'neg_tag'])