In [3]:
import pandas as pd

# Read the economic-news CSV export, decoding it as ISO-8859-1
df = pd.read_csv('Full-Economic-News-DFE-839861.csv', encoding='ISO-8859-1')

# Step 1: Narrow the frame to the four columns used from here on
df = df[['positivity', 'relevance', 'headline', 'text']]

# Step 2: Build the cleaned frame in one chain.
#   - dropna(): after Step 1 the frame holds only the four relevant columns,
#     so dropping on "any column" covers exactly those four.
#   - .loc[...]: keep rows whose text is non-empty once stripped; a
#     whitespace-only string strips to '', which is falsy under astype(bool).
df_cleaned = (
    df.dropna()
      .loc[lambda frame: frame['text'].str.strip().astype(bool)]
)

# Step 3: Create a new column 'sentiment' based on the positivity score
def categorize_sentiment(positivity):
    """Map a positivity score on the 1-9 scale to a coarse sentiment label.

    The bands are contiguous half-open intervals, so fractional scores
    (e.g. 3.5 or 6.5) receive a label instead of falling into a gap between
    integer-bounded ranges. Integer scores map exactly as before.

    Args:
        positivity: Numeric score.

    Returns:
        'negative' for 1 <= score < 4, 'neutral' for 4 <= score < 7,
        'positive' for 7 <= score <= 9, and None for anything else
        (out-of-range values, and NaN, which fails every comparison).
    """
    # NaN makes the chained comparison False, so it is rejected here too.
    if not 1 <= positivity <= 9:
        return None
    if positivity < 4:
        return 'negative'
    if positivity < 7:
        return 'neutral'
    return 'positive'

# Label every remaining row by running its positivity score through categorize_sentiment
df_cleaned['sentiment'] = df_cleaned['positivity'].map(categorize_sentiment)

# Print the labelled frame, with the sentiment column placed right after the score
print(df_cleaned.loc[:, ['positivity', 'sentiment', 'relevance', 'headline', 'text']])




      positivity sentiment relevance  \
0            3.0  negative       yes   
4            3.0  negative       yes   
5            3.0  negative       yes   
9            4.0   neutral       yes   
12           4.0   neutral       yes   
...          ...       ...       ...   
7973         7.0  positive       yes   
7974         3.0  negative       yes   
7984         8.0  positive       yes   
7987         5.0   neutral       yes   
7995         7.0  positive       yes   

                                               headline  \
0                 Yields on CDs Fell in the Latest Week   
4     Currency Trading: Dollar Remains in Tight Rang...   
5                  Stocks Fall Again; BofA, Alcoa Slide   
9     U.S. Dollar Falls Against Most Currencies; Dec...   
12                 Defending Yourself Against Deflation   
...                                                 ...   
7973  Housing Starts Grow, Raising Inflation Fears: ...   
7974  Profits Often Evasive In Stock Mart Ralli

In [4]:
import nltk
from nltk.tokenize import word_tokenize

# Download the Punkt tokenizer data that word_tokenize relies on.
# Older NLTK releases load the 'punkt' resource; newer releases (3.9 and later)
# look for 'punkt_tab' instead and raise LookupError when only 'punkt' is
# installed, so fetch both to keep this cell runnable on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize each headline in the dataframe
# NOTE(review): this tokenizes `df` (the column-filtered frame), not `df_cleaned`,
# so rows dropped during cleaning are tokenized too — confirm that is intended.
df['tokenized_headline'] = df['headline'].apply(word_tokenize)

print(df[['headline', 'tokenized_headline']])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pilug\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


                                               headline  \
0                 Yields on CDs Fell in the Latest Week   
1     The Morning Brief: White House Seeks to Limit ...   
2     Banking Bill Negotiators Set Compromise --- Pl...   
3     Manager's Journal: Sniffing Out Drug Abusers I...   
4     Currency Trading: Dollar Remains in Tight Rang...   
...                                                 ...   
7995  Sawyer Sees Strong Economy For 2 Years, Truce ...   
7996                   Oil's losses are airlines' gains   
7997  Full Senate to vote on Bernanke; PANEL ADVANCE...   
7998                          Reinventing Opportunities   
7999  Stocks Rise On News of Auto Output: Dow Climbs...   

                                     tokenized_headline  
0        [Yields, on, CDs, Fell, in, the, Latest, Week]  
1     [The, Morning, Brief, :, White, House, Seeks, ...  
2     [Banking, Bill, Negotiators, Set, Compromise, ...  
3     [Manager, 's, Journal, :, Sniffing, Out, Drug,...  
4