In [1]:
import requests
import pandas as pd
import math
import data_reader
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import numpy as np

tqdm.pandas()
stemmer = PorterStemmer()
lemmatizer=WordNetLemmatizer()

  from pandas import Panel


In [2]:
FILE_DIR = "./custom_lexicons/sentidd/"

## Read in Directional Words

In [3]:
DIRECTIONAL_WORDS_FILEPATH = FILE_DIR+"directional_words.csv"
directional_words_df = pd.read_csv(DIRECTIONAL_WORDS_FILEPATH)
directional_words_df

Unnamed: 0,token,label,stemmed
0,accelerate,up,acceler
1,advance,up,advanc
2,award,up,award
3,better,up,better
4,climb,up,climb
5,double,up,doubl
6,faster,up,faster
7,gain,up,gain
8,grow,up,grow
9,higher,up,higher


## Helper Functions

In [4]:
def assign_direction_dependency_type(text, label):
    tokens = word_tokenize(text)
    up_cnt, down_cnt = 0, 0
    for token in tokens:
        if stemmer.stem(token) in directional_words_df[directional_words_df['label']=='up'].stemmed.values:
            up_cnt += 1
        if stemmer.stem(token) in directional_words_df[directional_words_df['label']=='down'].stemmed.values:
            down_cnt += 1
    score = up_cnt - down_cnt
    if (score > 0 and label == 'positive') or (score < 0 and label == 'negative'): return 'proportional'
    if (score > 0 and label == 'negative') or (score < 0 and label == 'positive'): return 'inversely_proportional'
    
def get_preprocessed_nouns(text):
    words = word_tokenize(text)
    nouns = [token for token, tag in pos_tag(words) if tag in ['NN', 'NNS', 'NNP', 'NNPS']]
    nouns = [lemmatizer.lemmatize(token) for token in nouns if len(token)>1]
    return np.array(nouns)

def select_frequent_tokens(list_of_tokens, min_count):
    vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False, min_df=min_count)
    vectorizer.fit_transform(list_of_tokens)
    selected_tokens = vectorizer.vocabulary_.keys()
    selected_tokens = [item for item in selected_tokens if stemmer.stem(item) not in directional_words_df.stemmed.values]
    return selected_tokens

def count_sentences_containing(list_of_tokens, word):
    count = 0
    for tokens in list_of_tokens:
        if word in tokens:
            count += 1
    return count

def count_sentences_not_containing(list_of_tokens, word):
    count = 0
    for tokens in list_of_tokens:
        if word not in tokens:
            count += 1
    return count

def pmi(df, word, t):
    n = len(df)
    a = count_sentences_containing(df[df.direction_dependency==t].nouns, word)
    b = count_sentences_containing(df[df.direction_dependency!=t].nouns, word)
    c = count_sentences_not_containing(df[df.direction_dependency==t].nouns, word)
    return (n*a)/((a+b)*(a+c))

def pmi_combined(df, word):
    pmi_prop = pmi(df, word, 'proportional')
    pmi_invprop = pmi(df, word, 'inversely_proportional')
    if pmi_prop > pmi_invprop: return abs(pmi_prop)
    elif pmi_prop < pmi_invprop: return -abs(pmi_invprop)
    else: return 0
    
def extract_token_score(tokens, scores, t):
    if t == 'proportional':
        if np.max(scores) <= 0: return (None, None)
        return (tokens[np.argmax(scores)], np.max(scores))
    elif t == 'inversely_proportional':
        if np.max(scores) >= 0: return (None, None)
        return (tokens[np.argmin(scores)], np.min(scores))
    
def construct_senti_dd(train_df, dd_filepath, senti_dd_filepath):
    df = train_df
    print("Number of headlines = {}".format(len(df)))
        
    df['direction_dependency'] = df.progress_apply(lambda x: assign_direction_dependency_type(x['headline'], x['label']), axis=1)
    df.dropna(subset=['direction_dependency'], inplace=True)

    df['nouns'] = df['headline'].progress_apply(lambda x: get_preprocessed_nouns(x))
    selected_tokens = select_frequent_tokens(df.nouns.values, 6)
    print('\n\nSelected tokens ({})\n{}'.format(len(selected_tokens), selected_tokens))
    
    df['nouns'] = df['nouns'].progress_apply(lambda nouns: [token for token in nouns if token in selected_tokens])
    df = df[df['nouns'].map(len) != 0]
    print('\n\nNumber of proportional type headlines = {}\t inversely_proportional type headlines = {}'.format(len(df[df.direction_dependency=='proportional']), len(df[df.direction_dependency=='inversely_proportional'])))
    
    # Estimation of the Strength of Association Between a Word and a Direction-DependencyType
    lexicon_df = pd.DataFrame({'token': selected_tokens})
    lexicon_df['pmi'] = lexicon_df['token'].progress_apply(lambda x: pmi_combined(df, x))
    
    # Extraction of Direction-dependent Words
    df['scores'] = df['nouns'].progress_apply(lambda x: np.array([lexicon_df[lexicon_df.token==token].iloc[0].pmi for token in x]))

    df['token_score'] = df.progress_apply(lambda x: extract_token_score(x['nouns'], x['scores'], x['direction_dependency']), axis=1)
    df['entity'] = df['token_score'].progress_apply(lambda x: x[0])
    df['entity_score'] = df['token_score'].progress_apply(lambda x: x[1])
    df.drop(columns=['token_score'], inplace=True)

    dd = pd.DataFrame.from_records(
        list(zip(df.direction_dependency.values, df.entity.values, df.entity_score.values)),
                                   columns=['direction_dependency', 'entity', 'score'])
    dd.dropna(subset=['score'], inplace=True)
    dd.drop_duplicates(subset=['direction_dependency', 'entity'], inplace=True)

    proportional_words = dd[dd.direction_dependency=='proportional'].entity.values
    inversely_proportional_words = dd[dd.direction_dependency=='inversely_proportional'].entity.values
    print('\n\nProportional type entities ({})\n{}'.format(len(proportional_words), ', '.join(proportional_words)))
    print('\nInversely proportional type entities ({})\n{}'.format(len(inversely_proportional_words), 
                                                                   ', '.join(inversely_proportional_words)))
    
    dd.to_csv(dd_filepath, index=False)
    print('Created', dd_filepath)
    
    # Senti-DD Construction based on the List of Directional Words and the Direction-dependent Words
    up_words = directional_words_df[directional_words_df.label=='up'].stemmed.values
    down_words = directional_words_df[directional_words_df.label=='down'].stemmed.values

    records = []
    records.extend([('positive', entity, direction) for entity in proportional_words for direction in up_words])
    records.extend([('positive', entity, direction) for entity in inversely_proportional_words for direction in down_words])
    records.extend([('negative', entity, direction) for entity in proportional_words for direction in down_words])
    records.extend([('negative', entity, direction) for entity in inversely_proportional_words for direction in up_words])
    senti_dd = pd.DataFrame.from_records(records, columns=['sentiment', 'entity', 'directional_word'])
    print('\n\nNumber of positive pairs: {}\t negative pairs: {}'.format(len(senti_dd[senti_dd.sentiment=='positive']), len(senti_dd[senti_dd.sentiment=='negative'])))
    
    senti_dd.to_csv(senti_dd_filepath, index=False)
    print('Created', senti_dd_filepath)

## 1. Sentidd using data4

In [5]:
data4 = data_reader.read_data4("lexicon", True)
data4

Unnamed: 0,Sentiment,User_id,Message,Date,Time,Symbol
0,Bearish,442859953,crypto is even worthless if there is no world,2022-03-10,08:39:18Z,SPY
1,Bearish,442859939,why is it crashing so fast anyone know,2022-03-10,08:39:09Z,SPY
2,Bearish,442859938,bulls got trapped again how can it be l m f a o,2022-03-10,08:39:08Z,SPY
3,Bearish,442859889,by friday cmon make me rich daddy,2022-03-10,08:38:22Z,SPY
4,Bearish,442859884,algo last night predicts,2022-03-10,08:38:19Z,SPY
...,...,...,...,...,...,...
680928,Bullish,235142330,bullish rsi,2020-08-10,18:46:14Z,SOL.X
680929,Bullish,234896323,time to back up the truck and load up,2020-08-09,22:37:22Z,SOL.X
680930,Bullish,231400354,this things going to pump harder than arnold s...,2020-07-28,01:25:14Z,SOL.X
680931,Bullish,231399519,daddy wants gains,2020-07-28,01:19:47Z,SOL.X


In [6]:
data4 = data4[['Message', 'Sentiment']]
data4.columns = ['headline', 'label']
data4['label'] = data4['label'].apply(lambda x: "negative" if x=="Bearish" else "positive")
data4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,headline,label
0,crypto is even worthless if there is no world,negative
1,why is it crashing so fast anyone know,negative
2,bulls got trapped again how can it be l m f a o,negative
3,by friday cmon make me rich daddy,negative
4,algo last night predicts,negative
...,...,...
680928,bullish rsi,positive
680929,time to back up the truck and load up,positive
680930,this things going to pump harder than arnold s...,positive
680931,daddy wants gains,positive


In [7]:
dd_filepath_data4 = (FILE_DIR+'direction_dependent_entities_data4.csv')
senti_dd_filepath_data4 = (FILE_DIR+'sentidd_data4.csv')
construct_senti_dd(data4, dd_filepath_data4, senti_dd_filepath_data4)

  0%|                                                                            | 19/680933 [00:00<2:05:17, 90.57it/s]

Number of headlines = 680933


100%|████████████████████████████████████████████████████████████████████████| 680933/680933 [2:29:44<00:00, 75.79it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|█████████████████████████████████████████████████████████████████████████| 119740/119740 [04:50<00:00, 412.02it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  3%|█▉                     



Selected tokens (5777)


100%|███████████████████████████████████████████████████████████████████████| 119740/119740 [00:11<00:00, 10088.83it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  0%|                                                                                 | 2/5777 [00:00<07:56, 12.12it/s]



Number of proportional type headlines = 76638	 inversely_proportional type headlines = 34731


100%|██████████████████████████████████████████████████████████████████████████████| 5777/5777 [16:01<00:00,  6.01it/s]
100%|█████████████████████████████████████████████████████████████████████████| 111369/111369 [12:59<00:00, 142.89it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|███████████████████████████████████████████████████████████████████████| 111369/111369 [00:06<00:00, 17626.63it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████████████████████████████████████████████████████████████████| 111369/111369 [00:00<00:00, 989779.28



Proportional type entities (2451)
bitcoin, future, gg, baby, liftoff, spx, low, sector, funding, short, morning, train, exchange, miss, push, nikkei, tomorrow, ceo, podcast, perfect, age, eco, town, algo, till, technical, gap, track, qa, cap, time, beginning, limit, moon, signal, chart, glimpse, congrats, ripper, tmr, watch, robinhood, ticker, back, alongside, tommorow, station, momentum, swing, amazon, theyll, release, leg, monster, confident, shitter, analyst, super, regardless, get, thankful, tonight, pattern, cc, share, ema, farm, covering, reversal, closing, resolution, let, high, consolidation, hold, revenue, pullback, follow, contract, splash, stop, rare, profit, play, keep, metaverse, confidence, mastercard, exhaustion, dip, jan, bandwagon, bro, tuesday, happens, hour, digit, target, extension, launch, daddy, th, elliott, reuters, dax, feeling, tute, nonsense, oi, volume, run, basement, bottom, move, earthquake, bb, partnership, faa, delivery, date, opportunity, app, itll, re

Created ./custom_lexicons/sentidd/sentidd_data4.csv


## 2. Sentidd using data1

In [8]:
data1 = data_reader.read_data1("dataframe", True)
data1

Unnamed: 0,text_cleaned,Label
0,downgrades stocks investing tradeideas,0
1,looking sexy this morning break on volume and ...,1
2,stock hasnt moved much since first few weeks a...,1
3,whole foods may feel price competition but wil...,1
4,apples iphone se could be doing better than ex...,1
...,...,...
2018,hold through all the media bs and you will be ...,1
2019,i love blood like a vampirekeep bleeding for me,0
2020,yea dude airplanes are still flying,1
2021,ten year yield weekly year yield still very be...,1


In [9]:
data1.columns = ['headline', 'label']
data1['label'] = data1['label'].apply(lambda x: "negative" if x==0 else "positive")
data1

Unnamed: 0,headline,label
0,downgrades stocks investing tradeideas,negative
1,looking sexy this morning break on volume and ...,positive
2,stock hasnt moved much since first few weeks a...,positive
3,whole foods may feel price competition but wil...,positive
4,apples iphone se could be doing better than ex...,positive
...,...,...
2018,hold through all the media bs and you will be ...,positive
2019,i love blood like a vampirekeep bleeding for me,negative
2020,yea dude airplanes are still flying,positive
2021,ten year yield weekly year yield still very be...,positive


In [10]:
dd_filepath_data1 = (FILE_DIR+'direction_dependent_entities_data1.csv')
senti_dd_filepath_data1 = (FILE_DIR+'sentidd_data1.csv')
construct_senti_dd(data1, dd_filepath_data1, senti_dd_filepath_data1)

  1%|▊                                                                               | 20/2023 [00:00<00:21, 92.39it/s]

Number of headlines = 2023


100%|█████████████████████████████████████████████████████████████████████████████| 2023/2023 [00:19<00:00, 102.83it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 368/368 [00:00<00:00, 770.85it/s]
100%|████████████████████████████████████████████████████████████████████████████| 368/368 [00:00<00:00, 183925.63it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 389.46it/s]
  0%|                                                                                          | 0/234 [00:00<?, ?it/s]



Selected tokens (37)
['apple', 'stock', 'week', 'google', 'look', 'sale', 'weekend', 'call', 'morning', 'afternoon', 'trade', 'intel', 'point', 'today', 'earnings', 'market', 'day', 'low', 'dont', 'longs', 'buying', 'move', 'watch', 'chart', 'share', 'oil', 'price', 'year', 'time', 'china', 'gap', 'support', 'deal', 'metal', 'starbucks', 'report', 'sign']


Number of proportional type headlines = 179	 inversely_proportional type headlines = 55


100%|██████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 1004.86it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|█████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 17992.72it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 225624.63it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value



Proportional type entities (22)
stock, look, weekend, call, afternoon, earnings, market, dont, longs, watch, chart, share, oil, year, week, time, support, trade, deal, intel, point, price

Inversely proportional type entities (8)
google, sale, day, china, gap, metal, starbucks, today
Created ./custom_lexicons/sentidd/direction_dependent_entities_data1.csv


Number of positive pairs: 528	 negative pairs: 402
Created ./custom_lexicons/sentidd/sentidd_data1.csv



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
