In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from networkx.algorithms.bipartite.basic import color
import re
from wordcloud import WordCloud

In [None]:
# Import data of reviewed abstracts:
# One Excel workbook per bibliographic source (Google Scholar, Scopus, Web of Science).
# NOTE(review): only the first 200 Google Scholar rows are read (nrows=200) —
# presumably only those were screened; confirm before changing.
gs_abstracts = pd.read_excel('3_title_and_abstract_screening/scholar_unique.xlsx',nrows=200)
scopus_abstracts = pd.read_excel('3_title_and_abstract_screening/scopus_unique.xlsx')
wos_abstracts = pd.read_excel('3_title_and_abstract_screening/wos_unique.xlsx')

# Sanity check: (rows, columns) of each source table
print(gs_abstracts.shape, scopus_abstracts.shape, wos_abstracts.shape)

In [None]:
# Keep only the Web of Science rows whose screening reason is not 'Duplicated'
wos_abstracts = wos_abstracts.loc[
    wos_abstracts['Reason for inclusion or exclusion'].ne('Duplicated')
]
wos_abstracts.shape

In [None]:
# Common schema used for every source after harmonisation
col_names = [
    'Included (0 No, 1 Yes)',
    'Reason for inclusion or exclusion',
    'Authors',
    'Title',
    'Year',
    'Source',
    'ArticleURL',
    'DOI',
]

# Google Scholar already uses the common column names
selected_gs = gs_abstracts[col_names]

# Scopus and Web of Science label some fields differently: pick the matching
# columns in the same order as col_names, then relabel them positionally.
selected_scopus = scopus_abstracts[
    ['Included (0 No, 1 Yes)', 'Reason for inclusion or exclusion', 'Authors', 'Title', 'Year', 'Source title', 'Link', 'DOI']
].set_axis(col_names, axis='columns')
selected_wos = wos_abstracts[
    ['Included (0 No, 1 Yes)', 'Reason for inclusion or exclusion', 'Authors', 'Article Title', 'Publication Year', 'Source Title', 'DOI Link', 'DOI']
].set_axis(col_names, axis='columns')

# Stack the three sources row-wise with a fresh 0..n-1 index
selected_combined = pd.concat(
    [selected_gs, selected_scopus, selected_wos],
    axis='index',
    ignore_index=True,
)
selected_combined.shape

## Analyses

### Descriptive Statistics

* Publication Trends: Count papers per year to see trends over time.
* Author Analysis: Identify the most prolific authors in the field.
* Journal Distribution: See which journals or conferences publish the most relevant studies.

In [None]:
# Publication Trends
# Build a two-column table ('Year', 'count') with the number of papers per
# publication year, ordered chronologically.
#  - value_counts() tallies the papers per year (missing years are not counted).
#  - 2025 is removed since it is not a valid year; errors='ignore' keeps the
#    cell runnable when the data contains no 2025 record at all.
#  - reset_index(name='count') names the tally column explicitly, because the
#    plotting cell reads the 'Year' and 'count' columns.
year_trend = (
    selected_combined['Year']
    .value_counts()
    .drop(2025, errors='ignore')
    .sort_index()
    .rename_axis('Year')
    .reset_index(name='count')
)

In [None]:
# Figure size in pixels, shared by the layout below
fig_width, fig_height = 800, 600

# Bar chart of the yearly publication counts
fig = px.bar(
    year_trend,
    x='Year',
    y='count',
    title='Publications per Year',
    labels=dict(Year='Publication Year', count='Number of Publications'),
    color_discrete_sequence=['cornflowerblue'],
)

# Styling: centred title, Arial fonts, white background, horizontal grid only.
# The axis ranges are hard-coded (x: 2013.5 to 2024.5, y: 0 to 110).
fig.update_layout(
    title=dict(
        text='Publications per Year',
        x=0.5,
        xanchor='center',
        font=dict(size=24, family='Arial'),
    ),
    xaxis_title=dict(text='Year of Publication', font=dict(size=18, family='Arial')),
    yaxis_title=dict(text='Number of Publications', font=dict(size=18, family='Arial')),
    xaxis=dict(
        tickangle=270,
        tickfont=dict(size=16, family='Arial'),
        showgrid=False,
        gridcolor='lightgrey',
        range=[2013.5, 2024.5],
        dtick=1,
    ),
    yaxis=dict(
        tickfont=dict(size=16, family='Arial'),
        showgrid=True,
        gridcolor='lightgrey',
        range=[0, 110],
    ),
    coloraxis_showscale=False,
    plot_bgcolor='white',
    margin=dict(l=60, r=20, t=60, b=60),
    width=fig_width,
    height=fig_height,
)

# Save static copies (png, pdf, svg) next to the screening data, then display
fig.write_image('3_title_and_abstract_screening/publication_trends.png', format='png')
fig.write_image('3_title_and_abstract_screening/publication_trends.pdf', format='pdf')
fig.write_image('3_title_and_abstract_screening/publication_trends.svg', format='svg')
fig.show()

In [None]:
# Author Analysis
# Raw author strings, one entry per paper, with missing values removed
author_trend = selected_combined['Authors'].dropna()

In [None]:
# Author strings to analyse (the non-missing 'Authors' entries)
data = author_trend

# Wrap the Series in a single-column DataFrame
df = data.to_frame()

# Function to normalize and split author names
def normalize_authors(authors):
    """Split one author string into a list of cleaned author names.

    Semicolons and commas are both treated as author delimiters and all full
    stops are removed. Names written as "<1-2 capital initials> <Lastname>"
    (e.g. "AB Smith") are reordered to "<Lastname> <initials>" ("Smith AB");
    every other name is kept as written, only stripped of whitespace.

    Empty tokens are skipped, so a trailing or doubled delimiter, or a
    truncation marker such as "..." (which becomes empty once the dots are
    removed), does not produce a blank author.

    Parameters
    ----------
    authors : str
        Author list of a single paper.

    Returns
    -------
    list of str
        The cleaned author names in their original order; empty when the
        input contains no names.

    Notes
    -----
    Because commas are delimiters too, a name written as "Lastname, Initials"
    is split into two separate entries.
    """
    # Replace semicolons and commas with a common delimiter
    authors = re.sub(r'[;,]', ',', authors)
    # Remove full stops
    authors = authors.replace('.', '')

    standardized_authors = []
    for author in authors.split(','):
        author = author.strip()
        if not author:
            # Nothing left after cleaning: not an author
            continue
        if re.match(r'^[A-Z]{1,2} [A-Z][a-z]+$', author):  # Format: Initial Lastname
            # The pattern guarantees exactly one space, hence exactly two parts
            initials, lastname = author.split(' ')
            standardized_authors.append(f'{lastname} {initials}')
        else:
            standardized_authors.append(author)
    return standardized_authors

# Per paper: list of cleaned author names
df['Normalized Authors'] = df['Authors'].map(normalize_authors)

# Flatten the per-paper lists into one list of names (repeats are kept so
# that they can be counted afterwards)
all_authors = [
    name
    for paper_authors in df['Normalized Authors']
    for name in paper_authors
]

In [None]:
# Number of occurrences of each normalized author name
pd.Series(all_authors).value_counts()

In [None]:
# Journal Distribution
# Count the number of papers per journal
# 'Source' holds the journal / venue name as exported by each database; the
# names are tallied as-is, without any normalization.
journal_trend = selected_combined['Source']
journal_trend.value_counts()

The Author and Journal analyses are probably not very useful, since it is hard to distinguish the unique authors (lots of similar last names, and it is hard to get the unique last-name-and-initials pairs) and journals (lots of ellipsis dots...) from the data.

### Exclusion Analysis
* Top Reasons for Exclusion: See if certain recurring issues (e.g., "no time progression analysis") explain most rejections.


In [None]:
# Screening reasons of the records marked as not included (flag == 0)
exclusion_reasons = selected_combined.loc[
    selected_combined['Included (0 No, 1 Yes)'] == 0,
    'Reason for inclusion or exclusion',
]

In [None]:
# Top Reasons for Exclusion
# Tally of the exclusion reasons exactly as written (no text normalization)
exclusion_reasons.value_counts()

In [None]:
# Exclusion-reason texts to analyse
data = exclusion_reasons

# Wrap the Series in a single-column DataFrame
df = data.to_frame()

# Function to tokenize and count words
def count_words(text_series):
    """Return every word of the given texts as one flat, lowercase list.

    Despite its name the function does not count anything: it only tokenizes.
    The caller obtains the frequencies by counting the returned list.

    Each text is lowercased and split into words with the pattern
    ``\\b\\w+\\b`` (runs of letters, digits and underscores). Entries that are
    not strings, such as missing values, are skipped instead of raising an
    error.

    Parameters
    ----------
    text_series : pandas.Series (or any iterable) of str
        Texts to tokenize.

    Returns
    -------
    list of str
        All words in order of appearance, repeats included; empty when there
        is no text.
    """
    all_words = []
    for text in text_series:
        if not isinstance(text, str):
            # Missing or non-text entry: nothing to tokenize
            continue
        all_words.extend(re.findall(r'\b\w+\b', text.lower()))
    return all_words

# Apply the function to the DataFrame
# all_words: flat list of the lowercase words found in the exclusion reasons
all_words = count_words(df['Reason for inclusion or exclusion'])

# Occurrences of each word
word_count=pd.Series(all_words).value_counts()

In [None]:
# Remove prepositions and conjunctions, and also drop "unrelated" since it is not informative. Same for "mentions".
# errors='ignore' skips stop words that do not occur in the data instead of
# raising a KeyError. Afterwards only the first 20 remaining words are kept.
# Some words are relabelled because they are used in the negative case in the
# exclusion reasons: "article" -> "not an article",
# "progression" -> "no progression", and the same for "time" / "temporal".
filtered_words = (
    word_count
    .drop(
        labels=['and', 'of', 'no', 'or', 'not', 'an', 'on', 'from', 'with', 'to',
                'unrelated', 'mentions', 'mention'],
        errors='ignore',
    )
    .iloc[:20]
    .rename(index={
        'article': 'not an article',
        'progression': 'no progression',
        'time': 'no time',
        'temporal': 'no temporal',
    })
)
filtered_words

In [None]:
# Build the word cloud from the word frequencies
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(filtered_words)

# Draw the cloud on its own figure, without axes
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')

# Save static copies (png at 300 dpi, pdf, svg) next to the screening data
cloud_fig.savefig('3_title_and_abstract_screening/exclusion_reasons_wordcloud.png', format='png', bbox_inches='tight', dpi=300)
cloud_fig.savefig('3_title_and_abstract_screening/exclusion_reasons_wordcloud.pdf', format='pdf', bbox_inches='tight')
cloud_fig.savefig('3_title_and_abstract_screening/exclusion_reasons_wordcloud.svg', format='svg', bbox_inches='tight')

plt.show()