In [None]:
import os
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import re
import string
import nltk

# Custom stop words or subject-specific common words
subject_common_words = {
    "MEDI": ["medicine", "clinical", "health", "patient"],
    "ENGI": ["engineering", "design", "system"],
    "CHEM": ["chemistry", "reaction", "chemical"],
    "BIOC": ["biochemistry", "protein", "enzyme"],
    # Add more subject-specific common words as needed
}

# Function to clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    return text.strip()

# Function to remove stopwords and common subject-specific terms
def preprocess_text(text, subject_areas):
    stop_words = set(stopwords.words('english'))
    custom_stop_words = []
    
    # Collect common words for all subject areas
    for area in subject_areas:
        custom_stop_words.extend(subject_common_words.get(area, []))
    
    tokens = word_tokenize(text)
    filtered_tokens = [
        word for word in tokens
        if word not in stop_words and word not in custom_stop_words
    ]
    return ' '.join(filtered_tokens)

# Function to extract keywords using TF-IDF
def extract_keywords(corpus, n_keywords=5):
    vectorizer = TfidfVectorizer(
        max_features=1000,  # Adjust based on your dataset size
        ngram_range=(1, 2),  # Use unigrams and bigrams
        stop_words='english'
    )
    tfidf_matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        indices = row.indices
        scores = row.data
        top_indices = indices[scores.argsort()[-n_keywords:]]  # Get indices of top keywords
        top_keywords = [feature_names[idx] for idx in top_indices]
        keywords.append(top_keywords)
    return keywords

# Load data
input_file = "data.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

# Ensure required columns exist
if not all(col in data.columns for col in ['title', 'abstract', 'subjectArea']):
    raise ValueError("CSV must contain 'title', 'abstract', and 'subjectArea' columns.")

# Preprocess text and combine title and abstract
data['cleaned_text'] = data.apply(
    lambda row: preprocess_text(
        clean_text(f"{row.get('title', '')} {row.get('abstract', '')}"),
        str(row.get('subjectArea', '')).split(';')
    ) if pd.notnull(row['title']) and pd.notnull(row['abstract']) else '',
    axis=1
)


# Generate keywords for each row
corpus = data['cleaned_text'].tolist()
data['extracted_keywords'] = extract_keywords(corpus)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\VivoBook/nltk_data'
    - 'c:\\Python312\\nltk_data'
    - 'c:\\Python312\\share\\nltk_data'
    - 'c:\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\VivoBook\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [18]:
import nltk
print(nltk.data.path)


['C:\\Users\\VivoBook/nltk_data', 'c:\\Python312\\nltk_data', 'c:\\Python312\\share\\nltk_data', 'c:\\Python312\\lib\\nltk_data', 'C:\\Users\\VivoBook\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [None]:
# Save to CSV
output_file = "data_with_added_keywords.csv"
data.to_csv(output_file, index=False, encoding='utf-8')

print(f"Processed data saved to {output_file}.")

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load data from CSV
input_file = "data.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

# Combine 'title' and 'abstract' into a single string per document
data['title'] = data['title'].fillna("")  # Handle missing values
data['abstract'] = data['abstract'].fillna("")  # Handle missing values
data['combined'] = data['title'] + " " + data['abstract']

# Prepare the text data for TF-IDF
sentences = data['combined']

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Compute the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(sentences)

# Get the feature names (keywords)
keywords = vectorizer.get_feature_names_out()

# Extract and display top keywords for each document


for idx, row in enumerate(tfidf_matrix):
    print(f"\nDocument {idx + 1}:")
    if(idx == 10):
        break
    scores = row.toarray()[0]
    keyword_list = []
    keyword_scores = sorted(zip(keywords, scores), key=lambda x: x[1], reverse=True)[:10]  # Top 10 keywords
    print(keyword_scores)
   # print(keyword_scores[0][0] + ", " + keyword_scores[1][0])
    
    keyword_list.append(keyword_scores[0][0] +", " + keyword_scores[1][0])

data["Extracted_Keywords"] = keyword_list



Document 1:
[('alloys', np.float64(0.5160307141785417)), ('corrosion', np.float64(0.38858666869454345)), ('fe', np.float64(0.24193685685458194)), ('exhibit', np.float64(0.1818639526796358)), ('30zr', np.float64(0.1758473625019951)), ('3v', np.float64(0.1758473625019951)), ('xfe', np.float64(0.1758473625019951)), ('5al', np.float64(0.16872235263083385)), ('pitting', np.float64(0.1565420678050931)), ('rolled', np.float64(0.1565420678050931))]

Document 2:
[('universities', np.float64(0.36801088739380877)), ('creation', np.float64(0.3572813609781599)), ('innovation', np.float64(0.24221750329663097)), ('research', np.float64(0.20824515991655698)), ('critical', np.float64(0.1908058913577825)), ('public', np.float64(0.18423795507027274)), ('factors', np.float64(0.1824186202468366)), ('academic', np.float64(0.1611217173003767)), ('qualitative', np.float64(0.14391510494980608)), ('hei', np.float64(0.1436374599625223))]

Document 3:
[('hyperkyphosis', np.float64(0.5761952220858897)), ('owd', n

ValueError: Length of values (1) does not match length of index (16319)

In [None]:
data.head()