# In [None]:
import os
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import re
import string
import nltk

# Download NLTK resources if not already present.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases; 'punkt' is kept so older NLTK installations keep working too.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Subject-specific vocabulary treated as extra stop words, keyed by
# subject-area code. preprocess_text() removes these terms in addition to the
# standard English stop words.
subject_common_words = dict(
    MEDI=["medicine", "clinical", "health", "patient"],
    ENGI=["engineering", "design", "system"],
    CHEM=["chemistry", "reaction", "chemical"],
    BIOC=["biochemistry", "protein", "enzyme"],
    # Add more subject-specific common words as needed
)

# Function to clean text
def clean_text(text):
    """Normalise raw text before tokenisation.

    The input is lower-cased, runs of digits are deleted, every character in
    ``string.punctuation`` is deleted (not replaced by a space, so "a-b"
    becomes "ab"), whitespace runs are collapsed to a single space and the
    result is stripped at both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    punctuation_eraser = str.maketrans('', '', string.punctuation)

    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punctuation = without_digits.translate(punctuation_eraser)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

# Function to remove stopwords and common subject-specific terms
def preprocess_text(text, subject_areas):
    """Tokenise *text* and drop stop words and subject-specific common terms.

    Args:
        text: Text to filter. Matching is case-sensitive against lower-case
            stop words, so the text should already be lower-cased (as
            clean_text() produces).
        subject_areas: Iterable of subject-area codes used as keys into
            ``subject_common_words``. Surrounding whitespace on string codes
            is ignored, unknown codes contribute nothing, and ``None`` or an
            empty iterable means "no subject-specific terms".

    Returns:
        The remaining tokens joined by single spaces.
    """
    # One set for both word sources keeps the per-token membership test O(1).
    excluded = set(stopwords.words('english'))

    for area in subject_areas or ():
        # Codes split from "MEDI; ENGI" carry stray spaces; normalise them so
        # the lookup does not silently miss.
        code = area.strip() if isinstance(area, str) else area
        excluded.update(subject_common_words.get(code, []))

    return ' '.join(
        token for token in word_tokenize(text) if token not in excluded
    )

# Function to extract keywords using TF-IDF
def extract_keywords(corpus, n_keywords=5):
    """Return the highest-scoring TF-IDF terms for each document in *corpus*.

    A single TfidfVectorizer (unigrams and bigrams, English stop words, at
    most 1000 features) is fitted on the whole corpus, then each document's
    non-zero terms are ranked by score.

    Args:
        corpus: Iterable of document strings.
        n_keywords: Maximum number of keywords per document. ``0`` yields an
            empty list for every document.

    Returns:
        A list with one entry per document. Each entry is a list of at most
        ``n_keywords`` terms, ordered from lowest to highest TF-IDF score
        among the selected terms. Documents with fewer non-zero terms than
        ``n_keywords`` return all of them.

    Raises:
        ValueError: If ``n_keywords`` is negative. Errors raised by the
            vectorizer itself propagate unchanged.
    """
    if n_keywords < 0:
        raise ValueError("n_keywords must be non-negative.")

    vectorizer = TfidfVectorizer(
        max_features=1000,  # Adjust based on your dataset size
        ngram_range=(1, 2),  # Use unigrams and bigrams
        stop_words='english'
    )
    tfidf_matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()

    keywords = []
    for row in tfidf_matrix:
        # Column indices of this document's non-zero terms, ascending by score.
        ranked = row.indices[row.data.argsort()]
        # An explicit start offset avoids the `[-0:]` pitfall, where a
        # negative-zero slice would select every term instead of none.
        start = max(len(ranked) - n_keywords, 0)
        keywords.append([feature_names[idx] for idx in ranked[start:]])
    return keywords

# Load data
input_file = "data.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

# Ensure required columns exist
if not all(col in data.columns for col in ['title', 'abstract', 'subjectArea']):
    raise ValueError("CSV must contain 'title', 'abstract', and 'subjectArea' columns.")


def _cell_text(value):
    """Return a CSV cell as text, mapping missing values (NaN/None) to ''."""
    # Without this, a missing title or abstract would be formatted as the
    # literal word "nan" and end up in the corpus and the extracted keywords.
    return '' if pd.isna(value) else str(value)


def _subject_codes(value):
    """Split a ';'-separated subjectArea cell into stripped, non-empty codes."""
    return [code.strip() for code in _cell_text(value).split(';') if code.strip()]


# Preprocess text and combine title and abstract.
# A list comprehension is used instead of DataFrame.apply(axis=1) so an empty
# input file still produces a well-formed (empty) column.
data['cleaned_text'] = [
    preprocess_text(
        clean_text(f"{_cell_text(title)} {_cell_text(abstract)}"),
        _subject_codes(areas),  # Split multiple subject areas
    )
    for title, abstract, areas in zip(
        data['title'], data['abstract'], data['subjectArea']
    )
]

# Generate keywords for each row
corpus = data['cleaned_text'].tolist()
data['extracted_keywords'] = extract_keywords(corpus)

# In [None]:
# Write the augmented table (original columns plus the ones added above) to
# disk as UTF-8, leaving out the DataFrame index.
output_file = "data_with_added_keywords.csv"
data.to_csv(
    output_file,
    encoding='utf-8',
    index=False,
)

# Report where the result was written.
print(f"Processed data saved to {output_file}.")