In [None]:
import pandas as pd

# Load data
data = pd.read_csv("data.csv")

# Extract year from publication date (parsed strictly as day/month/year)
data['year'] = pd.to_datetime(data['publication_date'], format='%d/%m/%Y').dt.year

# Split multiple subject areas into lists.
# The vectorised .str accessor passes missing values through as NaN instead of
# raising AttributeError, which a per-element x.split(';') does on a float NaN.
# NOTE(review): the recorded output of this notebook lists subject areas such
# as 'AGRI,BUSI,ECON', so ';' may not be the delimiter used in data.csv -- confirm.
data['subjectArea'] = data['subjectArea'].str.split(';')

# Explode subject areas and keywords into separate rows
# NOTE(review): 'keywords' is never split into lists above, so the second
# explode leaves it unchanged and each row's whole keyword string is counted
# as a single keyword -- confirm whether a split was intended before this step.
data = data.explode('subjectArea').explode('keywords')

# Group by year and subject area to count keyword occurrences
# (groupby skips NaN keys by default, so rows with a missing subject area or
# missing keywords do not contribute here)
keyword_trends = data.groupby(['year', 'subjectArea', 'keywords']).size().reset_index(name='count')

# Normalize keyword counts by the number of rows per year and subject area.
# NOTE(review): 'total_papers' is a row count of the exploded frame, not a
# count of distinct papers -- verify this is the intended denominator.
total_counts = data.groupby(['year', 'subjectArea']).size().reset_index(name='total_papers')
keyword_trends = keyword_trends.merge(total_counts, on=['year', 'subjectArea'])
keyword_trends['normalized_count'] = keyword_trends['count'] / keyword_trends['total_papers']

# Define a threshold for identifying emerging keywords
# (compared against normalized_count, i.e. a share between 0 and 1)
EMERGING_THRESHOLD = 0.95

# Find emerging keywords for the most recent year.
# .copy() makes latest_trends an independent frame, so adding a column below
# does not raise SettingWithCopyWarning.
latest_year = keyword_trends['year'].max()
latest_trends = keyword_trends[keyword_trends['year'] == latest_year].copy()
latest_trends['is_emerging'] = (latest_trends['normalized_count'] > EMERGING_THRESHOLD).astype(int)

# Extract emerging keywords for each subject area:
# one row per subject area, with its distinct flagged keywords sorted and
# joined by '; '
emerging_keywords = (
    latest_trends[latest_trends['is_emerging'] == 1]
    .groupby('subjectArea')['keywords']
    .apply(lambda x: '; '.join(sorted(set(x))))
    .reset_index()
)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest_trends['is_emerging'] = (latest_trends['normalized_count'] > EMERGING_THRESHOLD).astype(int)


In [35]:

# Give the result table its presentation headers, then persist and display it.
output_columns = ['Subject Area', 'Emerging Keywords']
output_path = "emerging_keywords_statistical.csv"

# Relabel the two columns in place (positional assignment, not a rename by name)
emerging_keywords.columns = output_columns

# Write without the row index so the CSV holds only the two labelled columns
emerging_keywords.to_csv(output_path, index=False)
print(emerging_keywords)

            Subject Area                                  Emerging Keywords
0         AGRI,BUSI,ECON  Area yield,Crop insurance,Insurance product de...
1              AGRI,CHEM  Chaya leaf,Electronic tongue,Metabolomics,Mole...
2              AGRI,EART  28S,Braconidae,COI,integrative taxonomy,new ge...
3         AGRI,EART,ARTS  Anthropocene,Bang Khun Thian,Charcoal analysis...
4         AGRI,EART,ENGI  3D slope,Limit analysis,Pseudo-static analysis...
..                   ...                                                ...
289       VETE,AGRI,IMMU  anopheline mosquito,genetic structure,identifi...
290            VETE,IMMU  EEHV,Epitopes,Glycoprotein B,Peripheral blood ...
291  VETE,MEDI,AGRI,IMMU  23S rRNA,Hemoplasma,LAMP,Mycoplasma suis,Pig, ...
292  VETE,MEDI,IMMU,BIOC  Anti-SARS-CoV-2 IgG,BNT162b2 vaccine,Child,Inf...
293            VETE,PHAR  Crocodylus porosus,estuarine crocodiles,intram...

[294 rows x 2 columns]
