In [None]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Login using e.g. `huggingface-cli login` to access this dataset
df = pd.read_json("hf://datasets/yjernite/news-ai-labor-coverage/news_jobs_dataset_2022-12-01_2025-07-01.json")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22270 entries, 0 to 22269
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   title_s          22270 non-null  object        
 1   title_dl         22270 non-null  object        
 2   source_url       22270 non-null  object        
 3   authors          22270 non-null  object        
 4   snippet_s        22270 non-null  object        
 5   text             22270 non-null  object        
 6   date             22270 non-null  datetime64[ns]
 7   publish_date_dl  22270 non-null  object        
 8   url              22270 non-null  object        
 9   matches          22270 non-null  object        
dtypes: datetime64[ns](1), object(9)
memory usage: 1.7+ MB


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Reset the index to include it as a column and make it permanent
df = df.reset_index()

In [None]:
# Take the first 10,000 rows from df and assign to sample_df
sample_df = df.copy()

# Display info to verify the new sample_df
print(sample_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22270 entries, 0 to 22269
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   index            22270 non-null  int64         
 1   title_s          22270 non-null  object        
 2   title_dl         22270 non-null  object        
 3   source_url       22270 non-null  object        
 4   authors          22270 non-null  object        
 5   snippet_s        22270 non-null  object        
 6   text             22270 non-null  object        
 7   date             22270 non-null  datetime64[ns]
 8   publish_date_dl  22270 non-null  object        
 9   url              22270 non-null  object        
 10  matches          22270 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 1.9+ MB
None


In [None]:
print(sample_df.columns)

Index(['index', 'title_s', 'title_dl', 'source_url', 'authors', 'snippet_s',
       'text', 'date', 'publish_date_dl', 'url', 'matches'],
      dtype='object')


In [None]:
%pip install ftfy

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


In [None]:
import re
from bs4 import BeautifulSoup
import ftfy

def clean_news_article(article_text):
    """
    Clean news articles into a single paragraph for sentiment analysis:
    - Removes all metadata, special characters, and excessive punctuation
    - Keeps only clean sentences with basic punctuation (. and ?)
    - Returns clean single paragraph text
    """
    if not isinstance(article_text, str) or not article_text.strip():
        return ""

    # Initial text normalization
    text = ftfy.fix_text(article_text)
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Remove metadata patterns more aggressively
    metadata_patterns = [
        r'^\s*(by|written by|reported by|staff|correspondent|contributor)[^.!?]*[.!?]',
        r'(updated|published|posted|last updated|last modified)[^.!?]*[.!?]',
        r'\([^)]*ap[^)]*\)',  # Remove AP-style credits
        r'\[.*?\]|\(.*?\)|\<.*?\>|\{.*?\}',  # All brackets
        r'\b(photo|image|video|audio|footage|gallery|infographic):[^.!?]*',
        r'\b(share|comment|like|follow|subscribe|click|watch|read|view)[^.!?]*',
        r'@\w+|#\w+',  # Social media
        r'\b(facebook|twitter|instagram|linkedin|youtube)\b[^.!?]*',
        r'\b(read more|continue reading|related|more|also|advertisement|sponsored)[^.!?]*',
        r'\b(breaking|developing story|exclusive|analysis|opinion)[^.!?]*',
        r'^[a-z\s]*[—\-]\s*',  # Location tags
        r'[\|\:\+\"]',  # Remove all pipes, colons, plus signs and quotes
    ]

    for pattern in metadata_patterns:
        text = re.sub(pattern, ' ', text, flags=re.IGNORECASE)

    # Remove dates and times (including time period markers)
    text = re.sub(r'\b\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4}\b', ' ', text)
    text = re.sub(r'\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?\s*\d{1,2},?\s*\d{4}\b', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d{1,2}\s*[ap]\.?m\.?\b', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'\.\s*[ap]\.?m\.?\b', ' ', text, flags=re.IGNORECASE)  # Handle cases where time was partially removed

    # Remove all numbers
    text = re.sub(r'\b\d+\b', ' ', text)

    # Clean bullet points and other special characters
    text = re.sub(r'[\-\*]\s*', ' ', text)  # Remove bullet points

    # Clean and normalize text
    text = re.sub(r'([.!?])\s+', r'\1 ', text)  # Normalize sentence spacing
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()

    # Remove excessive punctuation
    text = re.sub(r'[!,"]', ' ', text)  # Remove quotes, exclamation marks
    text = re.sub(r'([,.!?])\1+', r'\1', text)  # Remove duplicate punctuation
    text = re.sub(r'\s([,.!?])(\s|$)', r'\1 ', text)  # Fix spacing before punctuation
    text = re.sub(r'([a-z])\s*-\s*([a-z])', r'\1 \2', text)  # Fix hyphen spacing

    # Extract complete sentences
    sentences = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        # Only keep sentences with at least 3 words and proper punctuation
        if (len(sentence.split()) >= 3 and
            re.search(r'[.!?]$', sentence) and
            re.search(r'[a-z]', sentence)):
            # Clean sentence further
            sentence = re.sub(r'^[^a-z]*', '', sentence)
            sentence = sentence.capitalize()
            sentences.append(sentence)

    # Combine into paragraph with proper spacing
    paragraph = ' '.join(sentences)

    # Final cleaning
    paragraph = re.sub(r'\s([.,!?])(\s|$)', r'\1 ', paragraph)
    paragraph = re.sub(r'\s+', ' ', paragraph).strip()
    paragraph = paragraph.lower()

    return paragraph

# Example usage with long messy article
messy_article = """
BREAKING: Major Earthquake Strikes California [VIDEO]

By John Smith | Updated: July 15, 2023 5:30 PM PST

LOS ANGELES (AP) — A powerful magnitude 7.1 earthquake rocked Southern California
at 3:15 p.m. Friday, causing widespread damage across several counties.

Key developments:
- Epicenter near Ridgecrest (same area as 2019 quakes)
- Felt from LA to Las Vegas
- 20+ aftershocks recorded
- Why does this keep happening? Experts weigh in [INTERVIEW]

PHOTOS: See the damage (Warning: Graphic content)
VIDEO: Security cameras capture moment quake hit

"This was terrifying!" said Maria Gonzalez, 42, of Los Angeles.
"My entire kitchen collapsed - dishes everywhere! What are we supposed to do now?"

Officials warn:
1. Check for gas leaks immediately
2. Expect prolonged power outages
3. Avoid damaged structures
4. When will help arrive? "Soon" says governor

RELATED:
Earthquake preparedness guide (2019 update)
California's seismic history (archival footage)

Share your experience: #CAquake
Follow live updates: @CalEMA

Advertisement: Home insurance deals after quakes!

More on this developing story...
"""
clean_result = clean_news_article(messy_article)
print("=== CLEANED ARTICLE ===")
print(clean_result)

=== CLEANED ARTICLE ===
friday causing widespread damage across several counties. key developments epicenter near ridgecrest felt from la to las vegas aftershocks recorded why does this keep happening? experts weigh in photos see the damage said maria gonzalez of los angeles. my entire kitchen collapsed dishes everywhere what are we supposed to do now? check for gas leaks immediately. expect prolonged power outages. avoid damaged structures. when will help arrive? soon says governor .


In [None]:
import tqdm
from tqdm.auto import tqdm as tqdm_auto
tqdm_auto.pandas() # Use the imported tqdm_auto for pandas integration

sample_df['cleaned_text'] = sample_df['text'].progress_apply(clean_news_article)

print("\nStep 1 Verification: Preprocessing")
print("cleaned text:", sample_df['cleaned_text'].iloc[0][:200])
print("-" * 80)

--------------------------------------------------------------------------------


In [None]:
def clean_snipet(s):
    if s is None:
        return ""
    s = re.sub(r"<[^>]+>", " ", str(s))  # remove HTML tags
    s = re.sub(r"\s+", " ", s).strip()
    s = s.lower()
    return s

In [None]:
sample_df.columns

Index(['index', 'title_s', 'title_dl', 'source_url', 'authors', 'snippet_s',
       'text', 'date', 'publish_date_dl', 'url', 'matches', 'cleaned_text'],
      dtype='object')

In [None]:
sample_df['snippet_s'] = sample_df['snippet_s'].progress_apply(clean_snipet)
sample_df['title_dl'] = sample_df['title_dl'].progress_apply(clean_snipet)
print("\nStep 1 Verification: Preprocessing")
print("cleaned text:", sample_df['snippet_s'].iloc[0][:200])
print("-" * 80)

  0%|          | 0/22270 [00:00<?, ?it/s]

  0%|          | 0/22270 [00:00<?, ?it/s]


Step 1 Verification: Preprocessing
cleaned text: in this article we explore how ai is likely to affect employment and the distribution of income. we argue that ai will indeed reduce drastically the need of ...
--------------------------------------------------------------------------------


In [None]:
# Convert the 'date' column to a standard date format (YYYY-MM-DD)
sample_df['date'] = sample_df['date'].dt.date
display(sample_df['date'].head(3))

Unnamed: 0,date
0,1984-06-15
1,2014-08-06
2,2014-08-06


In [None]:
# sample_df1.head().to_dict()

In [None]:
# print(sample_df.iloc[2].to_markdown())

In [None]:
sample_df = sample_df[['index', 'date', 'title_dl', 'cleaned_text']]

In [None]:
print(f"Sample shape: {sample_df.shape}")
sample_df.info()

Sample shape: (22270, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22270 entries, 0 to 22269
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         22270 non-null  int64 
 1   date          22270 non-null  object
 2   title_dl      22270 non-null  object
 3   cleaned_text  22270 non-null  object
dtypes: int64(1), object(3)
memory usage: 696.1+ KB


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
import tiktoken
import re

# Initialize tokenizer - use 'gpt-4' as proxy (similar tokenization)
try:
    tokenizer = tiktoken.encoding_for_model("gpt-4")
except KeyError:
    # Fallback to cl100k_base which works for most modern models
    tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Count tokens using tiktoken"""
    return len(tokenizer.encode(text))

def smart_truncate_article(article_text, max_tokens):
    """
    Truncates ONLY if article exceeds max_tokens, keeping most important sentences
    in ORIGINAL order. Returns unchanged text if already short enough.
    """
    # Clean text first (optional)
    article_text = re.sub(r'\s+', ' ', article_text).strip()

    # Skip processing if within limit
    if count_tokens(article_text) <= max_tokens:
        return article_text

    # Step 1: Split into sentences
    sentences = sent_tokenize(article_text)
    if len(sentences) <= 1:
        # Fallback for very long single-sentence texts
        return tokenizer.decode(tokenizer.encode(article_text)[:max_tokens])

    # Step 2: Calculate sentence importance (TF-IDF)
    try:
        vectorizer = TfidfVectorizer(stop_words="english")
        tfidf = vectorizer.fit_transform(sentences)
        sentence_scores = tfidf.sum(axis=1).A1
    except ValueError:
        # Fallback if TF-IDF fails (e.g., all stop words)
        return " ".join(sentences[:int(len(sentences)*0.5)])  # Take first half

    # Step 3: Pair scores with original indices
    scored_sentences = [(score, idx) for idx, score in enumerate(sentence_scores)]

    # Step 4: Sort by score (highest first)
    scored_sentences.sort(reverse=True, key=lambda x: x[0])

    # Step 5: Select top sentences IN ORIGINAL ORDER until token limit
    selected_indices = sorted([idx for _, idx in scored_sentences])  # Maintain original order
    truncated_text = []
    current_tokens = 0

    for idx in selected_indices:
        sentence = sentences[idx]
        sentence_tokens = count_tokens(sentence)
        if current_tokens + sentence_tokens > max_tokens:
            break
        truncated_text.append(sentence)
        current_tokens += sentence_tokens

    # Fallback if no sentences were selected
    if not truncated_text:
        return tokenizer.decode(tokenizer.encode(article_text)[:max_tokens])

    return " ".join(truncated_text)

In [None]:
# Get the cleaned text from the third row
text_to_count = sample_df['cleaned_text'].iloc[2]

# Count the tokens using the defined function
token_count = count_tokens(text_to_count)

# Display the result
print(f"Token count for the text in sample_df['cleaned_text'].iloc[2]: {token_count}")

Token count for the text in sample_df['cleaned_text'].iloc[2]: 2500


In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from tqdm import tqdm

# Assuming you have a DataFrame 'sample_df' with 'cleaned_text' column
# Apply truncation with progress bar
for idx, text in tqdm(sample_df['cleaned_text'].items(), total=len(sample_df), desc="Truncating articles"):
    # Skip processing if text is empty/NaN
    if pd.isna(text) or not str(text).strip():
        sample_df.at[idx, 'cleaned_text'] = ""
        continue

    # Apply truncation (2000 token limit)
    truncated = smart_truncate_article(str(text), max_tokens=1400)
    sample_df.at[idx, 'cleaned_text'] = truncated  # Update the original column with truncated text

# Verify results
print(f"\nTruncated text lengths: {sample_df['cleaned_text'].str.len().describe()}")
print("\nSample truncated text:")
print(f"Token count for the text in sample_df['cleaned_text'].iloc[2]: {token_count}")

Truncating articles: 100%|██████████| 22270/22270 [01:09<00:00, 318.68it/s]


Truncated text lengths: count    22270.000000
mean      4199.058105
std       2909.390360
min          0.000000
25%       1383.250000
50%       4014.000000
75%       7105.750000
max      10066.000000
Name: cleaned_text, dtype: float64

Sample truncated text:
Token count for the text in sample_df['cleaned_text'].iloc[2]: 2500





In [None]:
# Define the path to save the JSON file in Google Drive
output_path = '/content/drive/MyDrive/sentiment_analysis/sample_df.json'

# Save the DataFrame to JSON format
sample_df.to_json(output_path, orient='records', indent=2)

print(f"DataFrame successfully saved to {output_path}")

In [None]:
############################################################################

In [None]:
# ============================================
# DeepSeek API key & endpoint
# ============================================
DEEPSEEK_API_KEY = "Your api key"
# DEEPSEEK_URL = "https://api.deepseek.com/chat/completions"
print("DeepSeek API client initialized.")

DeepSeek API client initialized.


In [None]:
# =========================
# STEP 1 — Imports & Setup
# =========================
import time
import json
from openai import OpenAI

client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
# File to store partial results
OUTPUT_FILE = "/content/drive/MyDrive/sentiment_analysis/deepseek_ai_job_analysis.json"

In [None]:
# =========================
# STEP 2 — Load Dataset
# =========================
all_articles = sample_df.to_dict(orient="records")

# =========================
# STEP 3 — Resume Progress
# =========================
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        saved_data = json.load(f)
    processed_ids = {int(item["article_id"]) for item in saved_data}
else:
    saved_data = []
    processed_ids = set()

remaining_articles = [a for a in all_articles if a["index"] not in processed_ids]

print(f"Total articles: {len(all_articles)}")
print(f"Already processed: {len(processed_ids)}")
print(f"Remaining: {len(remaining_articles)}")

Total articles: 11270
Already processed: 217
Remaining: 11173


In [None]:
# =========================
# STEP 4 — Save Function
# =========================
def save_partial_results(results_list):
    """Append new batch results to JSON file."""
    global saved_data
    saved_data.extend(results_list)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(saved_data, f, indent=2, ensure_ascii=False)

In [None]:
# =========================
# STEP 5 — API Call Function
# =========================
def analyze_batch(batch_articles):
    """Send batch of articles to DeepSeek API."""
    prompt = """
You are an AI analyst extracting workforce and policy insights from articles. Follow these rules:

### Output Format (STRICT JSON):
{
  "ARTICLE_<id>": {
    "job_roles": [{
      "role": "(explicit job title or inferred role) | null if no evidence",
      "industry": "(e.g., 'tech', 'healthcare', 'education') | null if unspecified",
      "ai_impact": "augmented|at_risk|transformed_role|emerging_role|obsolete_role | null if unclear",
      "evidence": "(EXACT quote from text) | null if role is inferred without direct support",
      "sentiment_score": "(-1 to 1) | null if no sentiment detectable",
      "confidence": "(0 to 1) | null if evidence is too weak"
    }],
    "policy_recommendations": [{
      "category": "training_upskilling|ai_governance_ethics|innovation_incentives|worker_protection|economic_transition_support | null if unclear",
      "recommendations": ["(specific action from text)"] | [] if none
    }]
  }
}

### Rules:
1. **Job Roles**:
   - Roles MUST be explicitly named or strongly implied (e.g., "AI ethicists will be in demand" → role: "AI ethicist").
   - If no role/industry is mentioned, set to `null`.
   - `ai_impact` and `evidence` are REQUIRED. If missing, set to `null`.
   - `sentiment_score`/`confidence`: Only provide if calculable (e.g., "AI threatens jobs" → -0.6). Else, `null`.

2. **Policy Recommendations**:
   - Only include policies EXPLICITLY stated (e.g., "Policymakers should fund AI training").
   - If no recommendations exist, return `"policy_recommendations": []`.
   - Never infer policies without direct evidence.

3. **Strict Validation**:
   - If a field lacks evidence, set it to `null` or `[]` (for arrays).
   - Never hallucinate. Omit entire `job_roles` array if no roles exist.

### Example Outputs:
1. **Valid Role**:
   {
     "role": "AI ethicist",
     "industry": "tech",
     "ai_impact": "emerging_role",
     "evidence": "Demand for AI ethics professionals will grow by 2025.",
     "sentiment_score": 0.7,
     "confidence": 0.8
   }

2. **Incomplete Data**:
   {
     "role": "Data analyst",
     "industry": null,  // Not mentioned
     "ai_impact": "augmented",
     "evidence": "AI will enhance data analysis tasks.",
     "sentiment_score": 0.5,
     "confidence": null  // Subjective impact
   }

3. **No Policies**:
   "policy_recommendations": []
"""
    messages = [
        {"role": "system", "content": "You are an AI labor market analyst."},
        {"role": "user", "content": prompt},
        {"role": "user", "content": str(batch_articles)}
    ]

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        temperature=0.3
    )
    return response.choices[0].message.content

# =========================
# STEP 6 — Safe JSON Parsing
# =========================
def safe_json_parse(text):
    """Try to parse model output as JSON, fixing common errors."""
    try:
        # Remove any leading/trailing text outside JSON
        json_str = re.search(r"\{.*\}", text, re.DOTALL)
        if json_str:
            text = json_str.group(0)

        # Fix trailing commas
        text = re.sub(r",\s*}", "}", text)
        text = re.sub(r",\s*]", "]", text)

        return json.loads(text)
    except Exception as e:
        print(f"⚠️ JSON parse error: {e}")
        return None

# =========================
# STEP 7 — Process API Result
# =========================
def process_batch_result(batch_result, batch_articles):
    """Convert API result into structured list of dicts."""
    results_list = []
    parsed = safe_json_parse(batch_result)

    if parsed:
        if isinstance(parsed, dict):
            for article_key, analysis_data in parsed.items():
                article_id = int(article_key.split('_')[-1]) if '_' in article_key else None
                if article_id:
                    results_list.append({
                        "article_id": article_id,
                        "job_roles": analysis_data.get("job_roles", []),
                        "policy_recommendations": analysis_data.get("policy_recommendations", [])
                    })
    else:
        # If parsing fails, save None for all articles in batch
        for art in batch_articles:
            results_list.append({
                "article_id": art["index"],
                "job_roles": None,
                "policy_recommendations": None
            })

    return results_list

In [None]:
from tqdm import tqdm

In [None]:
# =========================
# STEP 8 — Batch Processing Loop
# =========================
BATCH_SIZE = 5
for i in tqdm(range(0, len(remaining_articles), BATCH_SIZE), desc="Processing Articles"):
    batch_articles = remaining_articles[i:i+BATCH_SIZE]
    try:
        api_result = analyze_batch(batch_articles)
        processed_results = process_batch_result(api_result, batch_articles)
        save_partial_results(processed_results)
        time.sleep(1)  # avoid rate limit
    except Exception as e:
        print(f"❌ Error in batch {i//BATCH_SIZE + 1}: {e}")
        save_partial_results([{
            "article_id": art["index"],
            "job_roles": None,
            "policy_recommendations": None
        } for art in batch_articles])
        time.sleep(3)  # backoff

print("✅ Processing complete. Results saved to", OUTPUT_FILE)

NameError: name 'tqdm' is not defined