In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import sklearn
import os

## Task 1: Sentiment Labeling

Augment `test(in).csv` with an additional column holding the sentiment label of each employee message. Labels come from VADER (NLTK), which is well suited to informal, social-style text and works without training data.

- **Input dataset**: `test(in).csv` with columns `Subject`, `body`, `date`, `from`.
- **Text to analyze**: `body` (primary), with a fallback to `Subject` if body is missing/empty.
- **Model**: NLTK VADER `SentimentIntensityAnalyzer`.
- **Scoring**: For each message, compute `compound` score in [-1, 1].
- **Label mapping**:
  - **Positive** if compound ≥ 0.05
  - **Negative** if compound ≤ -0.05
  - **Neutral** otherwise

These default VADER settings are widely used. This makes the labeling criteria clear and reproducible without the use of external APIs or LLMs. Later on, we can swap in an LLM and keep the same label mapping.

The compound sentiment score is also kept, because later parts of the project use it.

Outputs:
- Augmented dataframe with new columns `sentiment_label` in {Positive, Negative, Neutral} and `sentiment_compound` in [-1.0, 1.0]
- Saved as `test(labeled).csv` in the project root.


In [2]:
# Read the raw message export into a dataframe and preview the first rows.
df = pd.read_csv("test(in).csv")
df.head()

Unnamed: 0,Subject,body,date,from
0,EnronOptions Update!,EnronOptions Announcement\n\n\nWe have updated...,5/10/2010,sally.beck@enron.com
1,(No Subject),"Marc,\n\nUnfortunately, today is not going to ...",7/29/2010,eric.bass@enron.com
2,Phone Screen Interview - Shannon L. Burnham,"When: Wednesday, June 06, 2001 10:00 AM-11:00 ...",7/25/2011,sally.beck@enron.com
3,RE: My new work email,we were thinking papasitos (we can meet somewh...,3/25/2010,johnny.palmer@enron.com
4,Bet,Since you never gave me the $20 for the last t...,5/21/2011,lydia.delgado@enron.com


In [3]:
# Extract the text to analyze: prefer `body`, fall back to `Subject` for rows
# whose body is missing or whitespace-only.
#
# `text_series` is assigned on every path so the scoring step below never hits
# an undefined name. The column check replaces an `is None` test that could not
# fire: `df["body"]` raises KeyError for a missing column, it never returns None.
subject_fallback = df["Subject"]
if "body" in df.columns:
    text = df["body"]
    # Flag NaN/None bodies and bodies that are empty once whitespace is stripped.
    empty_mask = text.isna() | (text.astype(str).str.strip() == "")
    # mask() swaps the flagged rows for the index-aligned Subject value.
    text_series = text.mask(empty_mask, subject_fallback)
else:
    # No body column at all: analyze the subject line for every row.
    text = subject_fallback
    text_series = subject_fallback

# Build the VADER analyzer. The constructor raises LookupError when the
# `vader_lexicon` resource is not installed (e.g. a fresh environment), so
# download it once and retry instead of failing the whole run.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon", quiet=True)
    sia = SentimentIntensityAnalyzer()

# Compute the VADER score dict for every message; rows that still have no text
# are scored as the empty string.
scores = text_series.fillna("").astype(str).apply(sia.polarity_scores)
# Keep only the `compound` entry of each score dict (mapped to labels below).
compound = scores.apply(lambda d: d["compound"])  # type: ignore[index]

# Default VADER cut-offs applied to the compound score
POS_THR = 0.05
NEG_THR = -0.05

def map_label(c: float) -> str:
    """Translate a compound score into a sentiment label.

    Returns "Positive" when ``c >= POS_THR``, "Negative" when ``c <= NEG_THR``,
    and "Neutral" for everything else (including NaN, which fails both
    comparisons).
    """
    label = "Neutral"
    if c >= POS_THR:
        label = "Positive"
    elif c <= NEG_THR:
        label = "Negative"
    return label

# Turn each compound score into its Positive/Negative/Neutral label.
labels = compound.map(map_label)

# Attach to dataframe. Any sentiment columns left over from an earlier run are
# dropped first, so the fresh ones are always appended at the end.
stale_columns = [
    name
    for name in ("sentiment_compound", "sentiment_label")
    if name in df.columns
]
if stale_columns:
    df = df.drop(columns=stale_columns)

df["sentiment_compound"] = compound
df["sentiment_label"] = labels

In [4]:
# Persist the labeled dataframe (without the index), then preview it.
df.to_csv("test(labeled).csv", index=False)

display(df.head(10))

# Number of messages per sentiment label, printed as plain text.
label_counts = df["sentiment_label"].value_counts()
print("\nClass distribution:")
print(label_counts.to_string())



Unnamed: 0,Subject,body,date,from,sentiment_compound,sentiment_label
0,EnronOptions Update!,EnronOptions Announcement\n\n\nWe have updated...,5/10/2010,sally.beck@enron.com,0.8172,Positive
1,(No Subject),"Marc,\n\nUnfortunately, today is not going to ...",7/29/2010,eric.bass@enron.com,0.4215,Positive
2,Phone Screen Interview - Shannon L. Burnham,"When: Wednesday, June 06, 2001 10:00 AM-11:00 ...",7/25/2011,sally.beck@enron.com,0.0,Neutral
3,RE: My new work email,we were thinking papasitos (we can meet somewh...,3/25/2010,johnny.palmer@enron.com,0.0,Neutral
4,Bet,Since you never gave me the $20 for the last t...,5/21/2011,lydia.delgado@enron.com,0.25,Positive
5,RE: Favor,"sure, just call me the bank that delivers.\n \...",10/23/2011,eric.bass@enron.com,0.3182,Positive
6,MG Inventory Summaries,Inventory summaries for both MGL and MGMCC as ...,4/5/2010,kayne.coulter@enron.com,0.0,Neutral
7,Forgot the Attachment,Please print attachment and make sure that e:m...,4/21/2010,patti.thompson@enron.com,0.836,Positive
8,Garvin Brown - AXIA Sr. Power Scheduler,Please advise me of your interest in Garvin's ...,2/7/2010,sally.beck@enron.com,0.802,Positive
9,More Dallas ASE Information,The start time for Tuesday morning has been ch...,2/6/2010,kayne.coulter@enron.com,-0.1511,Negative



Class distribution:
sentiment_label
Positive    1533
Neutral      506
Negative     152
