In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import string

# Read the input table from disk
input_file = "data.csv"  # Point this at your own CSV file
data = pd.read_csv(input_file)

# Replace missing titles/abstracts with empty strings, then merge both
# fields into one space-separated text per document.
data = data.assign(
    title=data["title"].fillna(""),
    abstract=data["abstract"].fillna(""),
)
data["combined"] = data["title"] + " " + data["abstract"]

# Define a custom text cleaning function
# Translation table mapping every ASCII punctuation character to a space.
# Built once at import time rather than on every call (clean_text runs once
# per row when used with Series.apply).
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))
_DIGITS_RE = re.compile(r'\d+')


def clean_text(text):
    """Normalize a raw text string for keyword extraction.

    Steps, in order: lowercase, drop digit runs, turn ASCII punctuation into
    spaces, collapse whitespace runs to a single space and trim both ends.

    Punctuation is replaced by a space (not deleted) so that words joined by
    punctuation stay separate tokens: "state-of-the-art" becomes
    "state of the art" rather than "stateoftheart".

    Args:
        text: The text to clean. ``None`` and float NaN are treated as
            missing and yield an empty string; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).

    >>> clean_text("Hello, World! 123")
    'hello world'
    """
    # `text != text` is only true for NaN; covers missing cells read by pandas.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = _DIGITS_RE.sub('', text)  # Remove numbers
    text = text.translate(_PUNCT_TO_SPACE)  # Punctuation -> word separator
    # split() with no argument splits on any whitespace run and ignores
    # leading/trailing whitespace, so this both collapses and strips.
    return " ".join(text.split())

# Apply text cleaning
data['cleaned_combined'] = data['combined'].apply(clean_text)

# Prepare the text data for TF-IDF
sentences = data['cleaned_combined']

# Initialize the TfidfVectorizer with additional parameters
vectorizer = TfidfVectorizer(
    max_features=1000,  # Limit the number of features for focus
    stop_words='english',  # Remove common English stopwords
    ngram_range=(1, 2),  # Include unigrams and bigrams for better context
    min_df=2,  # Filter terms that appear in fewer than 2 documents
    max_df=0.85  # Filter terms that appear in more than 85% of documents
)

# Compute the TF-IDF matrix (one row per document, one column per term)
tfidf_matrix = vectorizer.fit_transform(sentences)

# Get the feature names (keywords); position i names column i of the matrix
keywords = vectorizer.get_feature_names_out()

# Number of keywords kept per document, and number of documents whose
# keywords are echoed to stdout as a preview.
TOP_N_KEYWORDS = 10
PREVIEW_DOCS = 20

# Extract top keywords for EVERY document. Only the printing is limited to the
# first PREVIEW_DOCS documents: keyword_list must end up with exactly one entry
# per row of `data`, otherwise the column assignment below fails with a
# length-mismatch ValueError.
keyword_list = []
for idx, row in enumerate(tfidf_matrix):
    scores = row.toarray().ravel()

    # Keep only terms that actually occur in this document; a zero TF-IDF
    # score means the term is absent and must not be reported as a keyword.
    present = [(kw, score) for kw, score in zip(keywords, scores) if score > 0]
    keyword_scores = sorted(present, key=lambda x: x[1], reverse=True)[:TOP_N_KEYWORDS]

    if idx < PREVIEW_DOCS:
        print(f"\nDocument {idx + 1}:")
        print(keyword_scores)

    # Combine the top keywords into a comma-separated string
    # (empty string when the document has no in-vocabulary terms)
    top_keywords = ", ".join(kw for kw, _ in keyword_scores)
    keyword_list.append(top_keywords)

# Add the extracted keywords to the DataFrame
data["Extracted_Keywords"] = keyword_list




In [None]:
# Write the DataFrame to a new CSV for later analysis
# (index=False leaves the row index out of the file).
output_file = "data_with_keywords.csv"  # Change to your preferred output path
data.to_csv(path_or_buf=output_file, index=False)

print(f"Keyword extraction completed. Results saved to {output_file}.")