In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import pandas as pd
import string
import nltk

# new
# Download stopwords if not already present
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Function to preprocess text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tokenize and remove stop words and short words (length < 3)
    tokens = [word for word in text.split() if word not in stop_words and len(word) > 2]
    return " ".join(tokens)

# Load data from CSV
input_file = "data.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

# Combine 'title' and 'abstract' into a single string per document
data['title'] = data['title'].fillna("")  # Handle missing values
data['abstract'] = data['abstract'].fillna("")  # Handle missing values
data['combined'] = data['title'] + " " + data['abstract']

# Apply preprocessing to the combined text
data['processed'] = data['combined'].apply(preprocess_text)

# Prepare the text data for TF-IDF
sentences = data['processed']

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Compute the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(sentences)

# Get the feature names (keywords)
keywords = vectorizer.get_feature_names_out()

# Extract and display top two keywords for each document
top_two_keywords = []  # To store top 2 keywords for each document

for idx, row in enumerate(tfidf_matrix):
    if(idx == 20):
        break
    scores = row.toarray()[0]  # Convert sparse matrix row to dense array
    # Pair keywords with their scores and sort them in descending order
    keyword_scores = sorted(zip(keywords, scores), key=lambda x: x[1], reverse=True)
    # Get the top 2 keywords
    top_keywords = [kw[0] for kw in keyword_scores[:2]]
    # Join top keywords as a string for the current documen
    print(", ".join(top_keywords))
    top_two_keywords.append(", ".join(top_keywords))
    

# Add the top two keywords to the dataframe
#data["Top_Two_Keywords"] = top_two_keywords

#data.head()

# Save the updated dataframe to a new CSV file
'''''
output_file = "data_with_top_two_keywords.csv"
data.to_csv(output_file, index=False)

print("Top two keywords extracted and saved to", output_file)
'''''

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VivoBook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


alloys, corrosion
universities, creation
hyperkyphosis, owd
soils, farmlands
donation, organ
inductance, plasma
hrqol, tsh
ufrs, irradiated
aki, intensive
zikv, zika
pcnfs, asr
fall, falls
clif, aclf
aerial, elu
acpcpna, rna
mats, nanocellulose
moodle, thinking
felids, ovarian
contingency, contingencies
ndn, rsu


'\'\'\noutput_file = "data_with_top_two_keywords.csv"\ndata.to_csv(output_file, index=False)\n\nprint("Top two keywords extracted and saved to", output_file)\n'

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import pandas as pd
import string
import nltk
import re

# Download stopwords if not already present
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Function to preprocess text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation (except hyphens)
    text = text.translate(str.maketrans("", "", string.punctuation.replace("-", "")))
    # Replace alphanumeric terms like "Ti-30Zr-5Al-3V-xFe" with a space-separated term
    text = re.sub(r'([a-zA-Z0-9]+(?:[-][a-zA-Z0-9]+)+)', r' \1 ', text)
    # Tokenize and remove stop words, short words (length < 3), and acronyms (all uppercase)
    tokens = [word for word in text.split() if word not in stop_words and len(word) > 2 and not word.isupper()]
    return " ".join(tokens)

# Load data from CSV
input_file = "data.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

# Combine 'title' and 'abstract' into a single string per document
data['title'] = data['title'].fillna("")  # Handle missing values
data['abstract'] = data['abstract'].fillna("")  # Handle missing values
data['combined'] = data['title'] + " " + data['abstract']

# Apply preprocessing to the combined text
data['processed'] = data['combined'].apply(preprocess_text)

# Prepare the text data for TF-IDF
sentences = data['processed']

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Compute the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(sentences)

# Get the feature names (keywords)
keywords = vectorizer.get_feature_names_out()

# Extract and display top two keywords for each document
top_two_keywords = []  # To store top 2 keywords for each document

for idx, row in enumerate(tfidf_matrix):
    if(idx == 30):
        break
    scores = row.toarray()[0]  # Convert sparse matrix row to dense array
    # Pair keywords with their scores and sort them in descending order
    keyword_scores = sorted(zip(keywords, scores), key=lambda x: x[1], reverse=True)
    # Get the top 2 keywords
    top_keywords = [kw[0] for kw in keyword_scores[:2]]
    # Join top keywords as a string for the current document
    print(", ".join(top_keywords))
    top_two_keywords.append(", ".join(top_keywords))

'''''
# Add the top two keywords to the dataframe
data["Top_Two_Keywords"] = top_two_keywords

# Save the updated dataframe to a new CSV file
output_file = "data_with_top_two_keywords.csv"
data.to_csv(output_file, index=False)

print("Top two keywords extracted and saved to", output_file)
'''''

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VivoBook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


alloys, corrosion
universities, creation
hyperkyphosis, owd
soils, farmlands
donation, organ
inductance, tpf
hrqol, tsh
ufrs, irradiated
aki, intensive
zikv, zika
pcnfs, asr
fall, falls
clif, aclf
aerial, elu
acpcpna, rna
nanocellulose, mats
moodle, thinking
felids, ovarian
contingency, contingencies
ndn, rsu
zinc, anode
sepsis, bundles
avs, kriging
nanopore, dna
glutamate, extract
pmddx41, ddx41
ep4, cox
lbp, breaks
fuel, cell
spicy, taste


'\'\'\n# Add the top two keywords to the dataframe\ndata["Top_Two_Keywords"] = top_two_keywords\n\n# Save the updated dataframe to a new CSV file\noutput_file = "data_with_top_two_keywords.csv"\ndata.to_csv(output_file, index=False)\n\nprint("Top two keywords extracted and saved to", output_file)\n'