In [24]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [26]:
# Load the CSV file (the export is Latin-1 encoded, hence the explicit encoding)
RELEVANT_COLUMNS = ['positivity', 'relevance', 'headline', 'text']
df = pd.read_csv('Full-Economic-News-DFE-839861.csv', encoding='ISO-8859-1')

# Step 1: Keep only the relevant columns
df = df[RELEVANT_COLUMNS]

# Step 2: Drop rows where any of the relevant columns is NaN
df_cleaned = df.dropna(subset=RELEVANT_COLUMNS)

# Also remove rows whose text is empty or whitespace-only.
# .copy() makes df_cleaned an independent frame rather than a slice of df, so the
# column assignments performed on it in later cells are unambiguous and do not
# raise SettingWithCopyWarning.
has_text = df_cleaned['text'].str.strip().astype(bool)
df_cleaned = df_cleaned[has_text].copy()

# Step 3: Create a new column 'sentiment' based on the positivity score
def categorize_sentiment(positivity):
    """Translate a positivity score into a coarse sentiment label.

    Scores in the closed ranges 1-3, 4-6 and 7-9 map to 'negative',
    'neutral' and 'positive' respectively. Every other value returns None:
    scores outside 1-9, values that fall between two bands (e.g. 3.5), and
    NaN (all comparisons against NaN are false).
    """
    bands = (
        (1, 3, 'negative'),
        (4, 6, 'neutral'),
        (7, 9, 'positive'),
    )
    for lower, upper, label in bands:
        if lower <= positivity <= upper:
            return label
    return None  # no band matched

# Label each row by running its positivity score through categorize_sentiment
positivity_scores = df_cleaned['positivity']
df_cleaned['sentiment'] = positivity_scores.map(categorize_sentiment)

# Display the cleaned and categorized DataFrame, label next to its source score
preview_columns = ['positivity', 'sentiment', 'relevance', 'headline', 'text']
print(df_cleaned[preview_columns])




      positivity sentiment relevance  \
0            3.0  negative       yes   
4            3.0  negative       yes   
5            3.0  negative       yes   
9            4.0   neutral       yes   
12           4.0   neutral       yes   
...          ...       ...       ...   
7973         7.0  positive       yes   
7974         3.0  negative       yes   
7984         8.0  positive       yes   
7987         5.0   neutral       yes   
7995         7.0  positive       yes   

                                               headline  \
0                 Yields on CDs Fell in the Latest Week   
4     Currency Trading: Dollar Remains in Tight Rang...   
5                  Stocks Fall Again; BofA, Alcoa Slide   
9     U.S. Dollar Falls Against Most Currencies; Dec...   
12                 Defending Yourself Against Deflation   
...                                                 ...   
7973  Housing Starts Grow, Raising Inflation Fears: ...   
7974  Profits Often Evasive In Stock Mart Ralli

In [27]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK tokenizer data ('punkt_tab') used by the tokenization step below
punkt_resource = 'punkt_tab'
nltk.download(punkt_resource)

def func(string):
    """Identity helper: hand back ``string`` exactly as it was received.

    NOTE(review): no cell visible in this notebook calls this function.
    """
    unchanged = string
    return unchanged

# Split every headline into a list of tokens with NLTK's word_tokenize
headlines = df_cleaned['headline']
df_cleaned['tokenized_headline'] = headlines.map(word_tokenize)

comparison_columns = ['headline', 'tokenized_headline']
print(df_cleaned[comparison_columns])

                                               headline  \
0                 Yields on CDs Fell in the Latest Week   
4     Currency Trading: Dollar Remains in Tight Rang...   
5                  Stocks Fall Again; BofA, Alcoa Slide   
9     U.S. Dollar Falls Against Most Currencies; Dec...   
12                 Defending Yourself Against Deflation   
...                                                 ...   
7973  Housing Starts Grow, Raising Inflation Fears: ...   
7974  Profits Often Evasive In Stock Mart Rallies Gr...   
7984          Salomon Sounds a Wary Note on the Economy   
7987  The Great Terror; A massive new history of Hit...   
7995  Sawyer Sees Strong Economy For 2 Years, Truce ...   

                                     tokenized_headline  
0        [Yields, on, CDs, Fell, in, the, Latest, Week]  
4     [Currency, Trading, :, Dollar, Remains, in, Ti...  
5       [Stocks, Fall, Again, ;, BofA, ,, Alcoa, Slide]  
9     [U.S., Dollar, Falls, Against, Most, Currencie...  
1

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dhritigampa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Step 4: Train a skip-gram (sg=1) Word2Vec model on the tokenized headlines.
# The seed is pinned and training is restricted to one worker thread so the
# embeddings (and every metric derived from them) are repeatable between runs;
# with multiple workers, thread scheduling reorders updates and results drift.
# Full reproducibility across interpreter launches additionally requires a
# fixed PYTHONHASHSEED environment variable.
model = Word2Vec(
    sentences=df_cleaned['tokenized_headline'].tolist(),  # plain list of token lists
    vector_size=100,
    window=5,
    min_count=2,  # ignore tokens seen fewer than twice
    sg=1,
    seed=42,
    workers=1,
)

# Step 5: Generate embeddings for each headline by averaging word vectors
def get_headline_vector(tokens, model):
    """Average the word vectors of ``tokens`` into one headline embedding.

    Args:
        tokens: iterable of token strings for a single headline.
        model: object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``; a zero vector of length ``model.vector_size`` when no
        token is in the vocabulary (including an empty token list).
    """
    known_vectors = []
    for token in tokens:
        # Out-of-vocabulary tokens are skipped
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed each tokenized headline; ``model`` is forwarded to get_headline_vector
# as its second positional argument via ``args``.
token_lists = df_cleaned['tokenized_headline']
df_cleaned['headline_vector'] = token_lists.apply(get_headline_vector, args=(model,))

In [31]:
# Step 6: Build the feature matrix (one row per headline vector) and the label vector
X = np.vstack(df_cleaned['headline_vector'].tolist())
y = df_cleaned['sentiment']

# Step 7: Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 8: Fit a 100-tree Random Forest on the training portion
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [32]:
# Step 9: Predict sentiment labels for the held-out headlines
y_pred = rf_model.predict(X_test)

# Step 10: Score the predictions: overall accuracy first, then per-class metrics
accuracy = accuracy_score(y_test, y_pred)
accuracy_line = f"Accuracy: {accuracy:.2f}"
print(accuracy_line)

print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

Accuracy: 0.49
Classification Report:
              precision    recall  f1-score   support

    negative       0.36      0.14      0.20        72
     neutral       0.50      0.84      0.63       142
    positive       0.50      0.14      0.22        70

    accuracy                           0.49       284
   macro avg       0.45      0.37      0.35       284
weighted avg       0.47      0.49      0.42       284

