In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy

nlp = spacy.load("en_core_web_sm")

from transformers import BertTokenizer


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('stopwords')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ismaildibirov/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ismaildibirov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
data = pd.read_csv("../data/rss_fin.csv")
data.head(10)

Unnamed: 0,_id,channel,title,text,date
0,658018b5112383a507ac9074,WSJ.com: Markets,It's the Magnificent Seven's Market. The Other...,"Apple, Microsoft, Alphabet, Amazon, Nvidia, Te...","Sun, 17 Dec 2023 07:00:00 -0500"
1,658018b5112383a507ac9075,WSJ.com: Markets,"Buy Now, Pay Later Keeps People Spending---Wit...",Consumers are flocking to installment loans fo...,"Sun, 17 Dec 2023 07:00:00 -0500"
2,658018b5112383a507ac9076,WSJ.com: Markets,How to Make the Most of Your FSA Money Before ...,Many workers take advantage of the tax-free fl...,"Sat, 16 Dec 2023 21:00:00 -0500"
3,658018b5112383a507ac9077,WSJ.com: Markets,Why It's Taking So Long for Americans to Get P...,Hundreds of banks use Fed’s new instant-paymen...,"Sat, 16 Dec 2023 10:00:00 -0500"
4,658018b5112383a507ac9078,WSJ.com: Markets,Robinhood Woos Wealthier Clients From Bigger B...,"Known for a clientele of first-time investors,...","Sat, 16 Dec 2023 10:00:00 -0500"
5,658018b5112383a507ac9079,WSJ.com: Markets,Tesla's Self-Driving Tech Has Competition,Gradually improving driver-assistance features...,"Sat, 16 Dec 2023 10:00:00 -0500"
6,658018b5112383a507ac907a,WSJ.com: Markets,"The Score: Macy's, Hasbro, Pfizer and More Sto...",Here are some of the major companies whose sto...,"Fri, 15 Dec 2023 18:16:00 -0500"
7,658018b5112383a507ac907b,WSJ.com: Markets,Beware the Most Crowded Trade on Wall Street: ...,Each of the past three years had a similarly s...,"Fri, 15 Dec 2023 16:54:00 -0500"
8,658018b5112383a507ac907c,WSJ.com: Markets,Visa Agrees to Acquire Majority Interest in Pa...,Visa entered into an agreement to acquire a ma...,"Fri, 15 Dec 2023 16:36:00 -0500"
9,658018b6112383a507ac907d,WSJ.com: Markets,Dow Notches Another Record Close,The blue-chip index notched its third straight...,"Fri, 15 Dec 2023 16:33:00 -0500"


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   _id      615 non-null    object
 1   channel  605 non-null    object
 2   title    615 non-null    object
 3   text     604 non-null    object
 4   date     615 non-null    object
dtypes: object(5)
memory usage: 24.2+ KB


### Data Cleaning, Tokenization & Lemmatization

In [3]:
def process_text(text):
    # Apply spaCy to tokenize and lemmatize the text
    doc = nlp(text.lower())
    
    # Extract tokens that are not stop words and are not punctuations
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    
    # Join the tokens back into a string
    processed_text = " ".join(tokens)
    
    return processed_text


In [8]:
data.dropna(subset=['title', 'text'], inplace=True)
data.drop(columns = ["_id"])

Unnamed: 0,channel,title,text,date,text_vectors,title_vectors
0,WSJ.com: Markets,magnificent seven market stock live,apple microsoft alphabet amazon nvidia tesla m...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.18625039, -0.8668952, 0.046901673, 1.1034...","[[-0.9848044, -0.40753686, -0.089207575, 0.038..."
1,WSJ.com: Markets,buy pay later people spend credit agency know,consumer flock installment loan holiday gift g...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.11025052, -0.30337343, 0.53082794, -0.281...","[[-0.68066454, -1.340677, 1.1544002, 0.0812540..."
2,WSJ.com: Markets,fsa money disappear,worker advantage tax free flexible spend accou...,"Sat, 16 Dec 2023 21:00:00 -0500","[[-0.5689461, -0.6188227, -0.029612377, -0.167...","[[0.22389163, -1.1406136, 0.12869072, 0.897906..."
3,WSJ.com: Markets,long americans payment instantly,bank use feed new instant payment service univ...,"Sat, 16 Dec 2023 10:00:00 -0500","[[-0.57085145, -1.6188182, 0.13059917, 0.21096...","[[-0.033791594, -1.0400614, 0.21256757, 1.9177..."
4,WSJ.com: Markets,robinhood woo wealthy client big brokerage,know clientele time investor trading app recei...,"Sat, 16 Dec 2023 10:00:00 -0500","[[0.33517328, -2.3876314, 0.8566309, 0.6664783...","[[-0.76147354, -0.9745778, 0.6955986, 0.359618..."
...,...,...,...,...,...,...
610,Wealthy Retirement,spirit airline clear takeoff,< p > obstacle 139 upside < /p>\n < p > post <...,"Fri, 08 Dec 2023 21:30:54 +0000","[[-1.8001248, 0.2740277, 0.9226156, -1.2874559...","[[-0.5583966, -1.0671698, -0.26011604, 1.11844..."
611,Wealthy Retirement,arbor realty trust slash 12.7 dividend yield,"< p > cut dividend < /p>\n < p > post < href=""...","Wed, 06 Dec 2023 21:30:34 +0000","[[-1.7282808, 0.28006566, 0.8530735, -1.280369...","[[-0.9263734, -0.41342488, 1.2810317, 1.150439..."
612,Wealthy Retirement,real truth interest rate,< p > need crystal ball this?</p>\n < p > post...,"Tue, 05 Dec 2023 21:30:58 +0000","[[-1.7835644, 0.28623873, 0.86563694, -1.27627...","[[-0.0993972, -0.98015046, -0.39904487, 0.5777..."
613,Wealthy Retirement,federal reserve aspire superhero,< p > inflation kryptonite.</p>\n < p > post <...,"Sat, 02 Dec 2023 16:30:56 +0000","[[-1.7856946, 0.24719714, 0.92206454, -1.32821...","[[-0.2278733, -1.044591, 0.4025926, 0.09607232..."


In [9]:
# Apply the process_text function to the 'text' and 'title' columns
data['text'] = data['text'].apply(process_text)
data['title'] = data['title'].apply(process_text)

data.head()


Unnamed: 0,_id,channel,title,text,date,text_vectors,title_vectors
0,658018b5112383a507ac9074,WSJ.com: Markets,magnificent seven market stock live,apple microsoft alphabet amazon nvidia tesla m...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.18625039, -0.8668952, 0.046901673, 1.1034...","[[-0.9848044, -0.40753686, -0.089207575, 0.038..."
1,658018b5112383a507ac9075,WSJ.com: Markets,buy pay late people spend credit agency know,consumer flock installment loan holiday gift g...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.11025052, -0.30337343, 0.53082794, -0.281...","[[-0.68066454, -1.340677, 1.1544002, 0.0812540..."
2,658018b5112383a507ac9076,WSJ.com: Markets,fsa money disappear,worker advantage tax free flexible spend accou...,"Sat, 16 Dec 2023 21:00:00 -0500","[[-0.5689461, -0.6188227, -0.029612377, -0.167...","[[0.22389163, -1.1406136, 0.12869072, 0.897906..."
3,658018b5112383a507ac9077,WSJ.com: Markets,long americans payment instantly,bank use feed new instant payment service univ...,"Sat, 16 Dec 2023 10:00:00 -0500","[[-0.57085145, -1.6188182, 0.13059917, 0.21096...","[[-0.033791594, -1.0400614, 0.21256757, 1.9177..."
4,658018b5112383a507ac9078,WSJ.com: Markets,robinhood woo wealthy client big brokerage,know clientele time investor trading app recei...,"Sat, 16 Dec 2023 10:00:00 -0500","[[0.33517328, -2.3876314, 0.8566309, 0.6664783...","[[-0.76147354, -0.9745778, 0.6955986, 0.359618..."


In [10]:
def get_glove_vectors(text):
    # Apply spaCy to tokenize and get GloVe word vectors
    doc = nlp(text)
    
    # Extract the GloVe vectors for each token in the text
    vectors = [token.vector for token in doc]
    
    return vectors

In [11]:
# Apply the process_text and get_glove_vectors functions to the 'text' and 'title' columns
data['text'] = data['text'].apply(process_text)
data['title'] = data['title'].apply(process_text)
data['text_vectors'] = data['text'].apply(get_glove_vectors)
data['title_vectors'] = data['title'].apply(get_glove_vectors)

# Display the processed DataFrame with GloVe word vectors
data.head()


Unnamed: 0,_id,channel,title,text,date,text_vectors,title_vectors
0,658018b5112383a507ac9074,WSJ.com: Markets,magnificent seven market stock live,apple microsoft alphabet amazon nvidia tesla m...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.18625039, -0.8668952, 0.046901673, 1.1034...","[[-0.9848044, -0.40753686, -0.089207575, 0.038..."
1,658018b5112383a507ac9075,WSJ.com: Markets,buy pay late people spend credit agency know,consumer flock installment loan holiday gift g...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.11025052, -0.30337343, 0.53082794, -0.281...","[[-0.84627914, -1.3933793, 1.4022795, 0.190420..."
2,658018b5112383a507ac9076,WSJ.com: Markets,fsa money disappear,worker advantage tax free flexible spend accou...,"Sat, 16 Dec 2023 21:00:00 -0500","[[-0.5689461, -0.6188227, -0.029612377, -0.167...","[[0.22389163, -1.1406136, 0.12869072, 0.897906..."
3,658018b5112383a507ac9077,WSJ.com: Markets,long americans payment instantly,bank use feed new instant payment service univ...,"Sat, 16 Dec 2023 10:00:00 -0500","[[-0.57085145, -1.6188182, 0.13059917, 0.21096...","[[-0.033791594, -1.0400614, 0.21256757, 1.9177..."
4,658018b5112383a507ac9078,WSJ.com: Markets,robinhood woo wealthy client big brokerage,know clientele time investor trading app recei...,"Sat, 16 Dec 2023 10:00:00 -0500","[[0.33517328, -2.3876314, 0.8566309, 0.6664783...","[[-0.76147354, -0.9745778, 0.6955986, 0.359618..."


In [40]:
data.channel.unique()

array(['WSJ.com: Markets',
       '"when:24h allinurl:bloomberg.com" - Google News',
       'Bloomberg Markets', 'rss_articles', 'Finance', 'Money',
       'Moneywatch - CBSNews.com', 'Morning Money', 'Kiplinger RSS Feed',
       'Money.com', 'Investing Archives - 24/7 Wall St.',
       'naked capitalism', nan, 'STOREYS', 'ThinkAdvisor',
       'InvestmentNews', 'Daily Reckoning', 'Wealthy Retirement'],
      dtype=object)

### Sentiment Analisys

In [25]:
!pip install nltk
import nltk
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import sentiwordnet as swn




[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/ismaildibirov/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ismaildibirov/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [22]:
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn

# Function to get the WordNet POS tag
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('V'):
        return wn.VERB
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    else:
        return None

# Function to calculate sentiment score
def calculate_sentiment(text):
    sentiment = 0.0
    tokens_count = 0

    tagged = pos_tag(word_tokenize(text))
    for word, tag in tagged:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
            continue

        lemma = wn.morphy(word, wn_tag)
        if not lemma:
            continue

        synsets = wn.synsets(lemma, pos=wn_tag)
        if not synsets:
            continue

        # Take the first sense, the most common
        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())

        sentiment += swn_synset.pos_score() - swn_synset.neg_score()
        tokens_count += 1

    # final score is sentiment over number of tokens
    return sentiment / float(tokens_count) if tokens_count else 0


In [32]:
# Example application on a DataFrame
data['sentiment_score'] = data['text'].apply(calculate_sentiment)

def classify_sentiment(score):
    if score > 0.1:  # Threshold for positive
        return 'positive'
    elif score < -0.1:  # Threshold for negative
        return 'negative'
    else:
        return 'neutral'

data['sentiment_classified'] = data['sentiment_score'].apply(classify_sentiment)

In [35]:
data.sort_values("sentiment_score", ascending=False)

Unnamed: 0,_id,channel,title,text,date,text_vectors,title_vectors,sentiment_score,sentiment_classified
370,6580252ec7b44d3dfb1c50e9,Money.com,taxis 2024 deadline need know,april 15 important date calendar,"Fri, 15 Dec 2023 13:00:02 +0000","[[-0.21541336, -1.7993135, 0.17370173, -0.0459...","[[0.026498318, 0.25080687, -0.297739, -0.19767...",0.333333,positive
583,658025aec7b44d3dfb1c51be,InvestmentNews,apollo hire blackrock veteran wealth insurance...,firm target $ 1 trillion asset 2026,"Wed, 13 Dec 2023 11:29:32 +0000","[[-1.2290314, -1.5181934, 0.043434933, 0.35334...","[[-1.0660863, -1.7727864, 0.6831285, 1.3467833...",0.312500,positive
271,6580239f112383a507ac9183,Money,1 thing need start holiday shopping year,expert break good way establish maintain healt...,"Tue, 28 Nov 2023 05:45:01 -0500","[[-0.3288595, -1.242538, 0.17824739, 0.3876224...","[[-0.12927133, 0.0023391359, 0.08604336, 1.971...",0.263889,positive
431,6580255dc7b44d3dfb1c5126,naked capitalism,luis caputo argentina new economy minister eco...,meet new boss old,"Fri, 15 Dec 2023 11:45:52 +0000","[[-1.3317202, -0.15734085, 0.25999215, -0.1757...","[[-0.8523917, -0.69491744, 0.32422942, 1.06377...",0.250000,positive
558,658025adc7b44d3dfb1c51a5,InvestmentNews,osaic nabs lincoln financial advisor late purc...,"1,450 financial advisor involve transaction ov...","Thu, 14 Dec 2023 21:50:18 +0000","[[-0.87279296, -0.17041542, 0.24818338, 3.5717...","[[-0.7919728, -1.3345959, -0.37617624, 1.40818...",0.225000,positive
...,...,...,...,...,...,...,...,...,...
287,6580239f112383a507ac9193,Money,rude thing casino,etiquette expert share faux pas sadly common c...,"Thu, 21 Sep 2023 09:35:10 -0400","[[-0.7381811, -1.4054437, 0.72635067, 0.451048...","[[-0.73391783, -0.9697807, -0.117076814, -0.05...",-0.109375,negative
246,65802286112383a507ac916a,Finance,expect package holiday fall victim delivery scam,federal trade commission warn criminal send fa...,"Fri, 15 Dec 2023 17:21:59 GMT","[[-0.110203594, -1.5315222, 0.3821449, 0.34219...","[[-0.39115924, -1.2458448, 0.8382292, -0.10024...",-0.113636,negative
285,6580239f112383a507ac9191,Money,hotel upgrade worth money,regret splurge perk stay,"Thu, 05 Oct 2023 05:45:12 -0400","[[-0.64693874, -1.4254861, 0.53602815, 0.82520...","[[-0.5932593, -0.25979692, 0.25029492, -0.0310...",-0.125000,negative
131,65801f16112383a507ac90f7,rss_articles,etfs vs. mutual fund young investor,choose etfs mutual fund tough young investor t...,2023-12-17T20:56:30+00:00,"[[-1.550796, -0.23861393, 0.38049823, 0.929769...","[[-1.1974895, 0.18208346, 0.5071481, 0.2954434...",-0.125000,negative


In [37]:
data = data.drop(columns = "_id")

KeyError: "['_id'] not found in axis"

In [38]:
data.head()

Unnamed: 0,channel,title,text,date,text_vectors,title_vectors,sentiment_score,sentiment_classified
0,WSJ.com: Markets,magnificent seven market stock live,apple microsoft alphabet amazon nvidia tesla m...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.18625039, -0.8668952, 0.046901673, 1.1034...","[[-0.9848044, -0.40753686, -0.089207575, 0.038...",0.013889,neutral
1,WSJ.com: Markets,buy pay late people spend credit agency know,consumer flock installment loan holiday gift g...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.11025052, -0.30337343, 0.53082794, -0.281...","[[-0.84627914, -1.3933793, 1.4022795, 0.190420...",0.025,neutral
2,WSJ.com: Markets,fsa money disappear,worker advantage tax free flexible spend accou...,"Sat, 16 Dec 2023 21:00:00 -0500","[[-0.5689461, -0.6188227, -0.029612377, -0.167...","[[0.22389163, -1.1406136, 0.12869072, 0.897906...",0.1875,positive
3,WSJ.com: Markets,long americans payment instantly,bank use feed new instant payment service univ...,"Sat, 16 Dec 2023 10:00:00 -0500","[[-0.57085145, -1.6188182, 0.13059917, 0.21096...","[[-0.033791594, -1.0400614, 0.21256757, 1.9177...",0.075,neutral
4,WSJ.com: Markets,robinhood woo wealthy client big brokerage,know clientele time investor trading app recei...,"Sat, 16 Dec 2023 10:00:00 -0500","[[0.33517328, -2.3876314, 0.8566309, 0.6664783...","[[-0.76147354, -0.9745778, 0.6955986, 0.359618...",0.0,neutral
