In [2]:
import pandas as pd
import numpy as np
import nltk
import spacy

nlp = spacy.load("en_core_web_sm")

from transformers import BertTokenizer


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('stopwords')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ismaildibirov/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ismaildibirov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data = pd.read_csv("../data/rss_fin.csv")
data.head(10)

Unnamed: 0,_id,channel,title,text,date
0,658018b5112383a507ac9074,WSJ.com: Markets,It's the Magnificent Seven's Market. The Other...,"Apple, Microsoft, Alphabet, Amazon, Nvidia, Te...","Sun, 17 Dec 2023 07:00:00 -0500"
1,658018b5112383a507ac9075,WSJ.com: Markets,"Buy Now, Pay Later Keeps People Spending---Wit...",Consumers are flocking to installment loans fo...,"Sun, 17 Dec 2023 07:00:00 -0500"
2,658018b5112383a507ac9076,WSJ.com: Markets,How to Make the Most of Your FSA Money Before ...,Many workers take advantage of the tax-free fl...,"Sat, 16 Dec 2023 21:00:00 -0500"
3,658018b5112383a507ac9077,WSJ.com: Markets,Why It's Taking So Long for Americans to Get P...,Hundreds of banks use Fed’s new instant-paymen...,"Sat, 16 Dec 2023 10:00:00 -0500"
4,658018b5112383a507ac9078,WSJ.com: Markets,Robinhood Woos Wealthier Clients From Bigger B...,"Known for a clientele of first-time investors,...","Sat, 16 Dec 2023 10:00:00 -0500"
5,658018b5112383a507ac9079,WSJ.com: Markets,Tesla's Self-Driving Tech Has Competition,Gradually improving driver-assistance features...,"Sat, 16 Dec 2023 10:00:00 -0500"
6,658018b5112383a507ac907a,WSJ.com: Markets,"The Score: Macy's, Hasbro, Pfizer and More Sto...",Here are some of the major companies whose sto...,"Fri, 15 Dec 2023 18:16:00 -0500"
7,658018b5112383a507ac907b,WSJ.com: Markets,Beware the Most Crowded Trade on Wall Street: ...,Each of the past three years had a similarly s...,"Fri, 15 Dec 2023 16:54:00 -0500"
8,658018b5112383a507ac907c,WSJ.com: Markets,Visa Agrees to Acquire Majority Interest in Pa...,Visa entered into an agreement to acquire a ma...,"Fri, 15 Dec 2023 16:36:00 -0500"
9,658018b6112383a507ac907d,WSJ.com: Markets,Dow Notches Another Record Close,The blue-chip index notched its third straight...,"Fri, 15 Dec 2023 16:33:00 -0500"


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   _id      615 non-null    object
 1   channel  605 non-null    object
 2   title    615 non-null    object
 3   text     604 non-null    object
 4   date     615 non-null    object
dtypes: object(5)
memory usage: 24.2+ KB


### Data Cleaning, Tokenization & Lemmatization

In [10]:
def process_text(text):
    # Apply spaCy to tokenize and lemmatize the text
    doc = nlp(text.lower())
    
    # Extract tokens that are not stop words and are not punctuations
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    
    # Join the tokens back into a string
    processed_text = " ".join(tokens)
    
    return processed_text


In [8]:
data.dropna(subset=['title', 'text'], inplace=True)

In [11]:
# Apply the process_text function to the 'text' and 'title' columns
data['text'] = data['text'].apply(process_text)
data['title'] = data['title'].apply(process_text)

data.head()


Unnamed: 0,_id,channel,title,text,date
0,658018b5112383a507ac9074,WSJ.com: Markets,magnificent seven market stock live,apple microsoft alphabet amazon nvidia tesla m...,"Sun, 17 Dec 2023 07:00:00 -0500"
1,658018b5112383a507ac9075,WSJ.com: Markets,buy pay later people spend credit agency know,consumer flock installment loan holiday gift g...,"Sun, 17 Dec 2023 07:00:00 -0500"
2,658018b5112383a507ac9076,WSJ.com: Markets,fsa money disappear,worker advantage tax free flexible spend accou...,"Sat, 16 Dec 2023 21:00:00 -0500"
3,658018b5112383a507ac9077,WSJ.com: Markets,long americans payment instantly,bank use feed new instant payment service univ...,"Sat, 16 Dec 2023 10:00:00 -0500"
4,658018b5112383a507ac9078,WSJ.com: Markets,robinhood woo wealthy client big brokerage,know clientele time investor trading app recei...,"Sat, 16 Dec 2023 10:00:00 -0500"


In [12]:
def get_glove_vectors(text):
    # Apply spaCy to tokenize and get GloVe word vectors
    doc = nlp(text)
    
    # Extract the GloVe vectors for each token in the text
    vectors = [token.vector for token in doc]
    
    return vectors

In [14]:
# Apply the process_text and get_glove_vectors functions to the 'text' and 'title' columns
data['text'] = data['text'].apply(process_text)
data['title'] = data['title'].apply(process_text)
data['text_vectors'] = data['text'].apply(get_glove_vectors)
data['title_vectors'] = data['title'].apply(get_glove_vectors)

# Display the processed DataFrame with GloVe word vectors
data.head()


Unnamed: 0,_id,channel,title,text,date,text_vectors,title_vectors
0,658018b5112383a507ac9074,WSJ.com: Markets,magnificent seven market stock live,apple microsoft alphabet amazon nvidia tesla m...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.18625039, -0.8668952, 0.046901673, 1.1034...","[[-0.9848044, -0.40753686, -0.089207575, 0.038..."
1,658018b5112383a507ac9075,WSJ.com: Markets,buy pay late people spend credit agency know,consumer flock installment loan holiday gift g...,"Sun, 17 Dec 2023 07:00:00 -0500","[[-0.11025052, -0.30337343, 0.53082794, -0.281...","[[-0.84627914, -1.3933793, 1.4022795, 0.190420..."
2,658018b5112383a507ac9076,WSJ.com: Markets,fsa money disappear,worker advantage tax free flexible spend accou...,"Sat, 16 Dec 2023 21:00:00 -0500","[[-0.5689461, -0.6188227, -0.029612377, -0.167...","[[0.22389163, -1.1406136, 0.12869072, 0.897906..."
3,658018b5112383a507ac9077,WSJ.com: Markets,long americans payment instantly,bank use feed new instant payment service univ...,"Sat, 16 Dec 2023 10:00:00 -0500","[[-0.57085145, -1.6188182, 0.13059917, 0.21096...","[[-0.033791594, -1.0400614, 0.21256757, 1.9177..."
4,658018b5112383a507ac9078,WSJ.com: Markets,robinhood woo wealthy client big brokerage,know clientele time investor trading app recei...,"Sat, 16 Dec 2023 10:00:00 -0500","[[0.33517328, -2.3876314, 0.8566309, 0.6664783...","[[-0.76147354, -0.9745778, 0.6955986, 0.359618..."


In [11]:
def load_glove_embeddings(glove_file):
    embeddings_dict = {}
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            try:
                vector = np.asarray(values[1:], "float32")
                embeddings_dict[word] = vector
            except ValueError:
                print(f"Error converting to float: {values[1:]} for word: {word}")
                continue
    return embeddings_dict

glove_embeddings = load_glove_embeddings("../data/glove.840B.300d.txt")  # replace with your GloVe file path



TypeError: unhashable type: 'Series'

### Stop Words

In [10]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the list of stop words
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    # Remove possessive endings
    text = text.replace("'s", "")
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words and convert to lower case
    filtered_text = [word for word in tokens if word.lower() not in stop_words]
    # Join the words back into a string
    return " ".join(filtered_text)

# Assuming 'data' is your DataFrame
# Drop rows where either 'title' or 'text' is NaN
data.dropna(subset=['title', 'text'], inplace=True)

# Apply the stop words removal
data['title'] = data['title'].apply(remove_stop_words)
data['text'] = data['text'].apply(remove_stop_words)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ismaildibirov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ismaildibirov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
data.head()

Unnamed: 0,_id,channel,title,text,date,tokenized_title,tokenized_text
0,658018b5112383a507ac9074,WSJ.com: Markets,magnificent seven market . stock living .,"apple , microsoft , alphabet , amazon , nvidia...","Sun, 17 Dec 2023 07:00:00 -0500","[It, 's, the, Magnificent, Seven, 's, Market, ...","[Apple, ,, Microsoft, ,, Alphabet, ,, Amazon, ..."
1,658018b5112383a507ac9075,WSJ.com: Markets,"buy , pay later keep people spending -- -witho...",consumer flocking installment loan everything ...,"Sun, 17 Dec 2023 07:00:00 -0500","[Buy, Now, ,, Pay, Later, Keeps, People, Spend...","[Consumers, are, flocking, to, installment, lo..."
2,658018b5112383a507ac9076,WSJ.com: Markets,make fsa money disappears,many worker take advantage tax-free flexible-s...,"Sat, 16 Dec 2023 21:00:00 -0500","[How, to, Make, the, Most, of, Your, FSA, Mone...","[Many, workers, take, advantage, of, the, tax-..."
3,658018b5112383a507ac9077,WSJ.com: Markets,taking long american get payment instantly,hundred bank use fed ’ new instant-payment ser...,"Sat, 16 Dec 2023 10:00:00 -0500","[Why, It, 's, Taking, So, Long, for, Americans...","[Hundreds, of, banks, use, Fed, ’, s, new, ins..."
4,658018b5112383a507ac9078,WSJ.com: Markets,robinhood woos wealthier client bigger brokerage,"known clientele first-time investor , trading ...","Sat, 16 Dec 2023 10:00:00 -0500","[Robinhood, Woos, Wealthier, Clients, From, Bi...","[Known, for, a, clientele, of, first-time, inv..."


### Sentiment Analisys

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentCNN(nn.Module):
    def __init__(self, embedding_dim, num_filters, filter_sizes, output_dim=3, dropout=0.5):
        super().__init__()
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x = [batch size, sent len, emb dim]
        x = x.permute(0, 2, 1)
        # x = [batch size, emb dim, sent len]
        x = [F.relu(conv(x)) for conv in self.convs]
        x = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in x]
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.fc4(x)
        return F.softmax(x, dim=1)

# Example usage
model = SentimentCNN(embedding_dim=100, num_filters=100, filter_sizes=[3, 4, 5], dropout=0.5)


In [16]:
import torch

# Example text data (replace with your actual data)
input_text = data["]

# Tokenize and preprocess your input text (replace with your preprocessing code)
# You need to convert the text into numerical embeddings.

# For example, assuming you have a function preprocess_text(text) that converts text to embeddings:
# input_data = [preprocess_text(text) for text in input_text]

# Convert input_data to a tensor (assuming it's already properly formatted)
input_data = torch.Tensor(input_data)

# Pass the data through the model to get sentiment outputs
with torch.no_grad():
    model.eval()
    sentiment_outputs = model(input_data)

# sentiment_outputs is a tensor containing the sentiment predictions for each input sentence
# You can now work with sentiment_outputs as needed.


NameError: name 'input_data' is not defined