In [None]:
import json
import re

# Location of the arXiv metadata snapshot under /kaggle/input; it is read
# below as line-delimited JSON.
dataset = "/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json"

In [None]:
import pandas as pd

# Read the snapshot (one JSON record per line) and report the raw row count.
arxiv_df = pd.read_json(dataset, lines=True)
print(len(arxiv_df))

# Drop the columns that the visible analysis never references, then discard
# every row that still contains a missing value and report what is left.
unused_columns = ['journal-ref', 'doi', 'report-no', 'submitter', 'license', 'comments']
arxiv_df = arxiv_df.drop(columns=unused_columns).dropna()
print(len(arxiv_df))

In [None]:
# Show which columns remain in the frame.
print(arxiv_df.columns)

In [None]:
 # Render the DataFrame as this cell's output.
 arxiv_df

In [None]:
def count_authors(authors_list):
    """Return the number of entries in ``authors_list``.

    The argument may be any sized container; the result is simply its length.
    """
    author_total = len(authors_list)
    return author_total

# Store, for every row, the length of its parsed author list.
arxiv_df['num_authors'] = arxiv_df['authors_parsed'].map(count_authors)
arxiv_df

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Parse 'update_date' into datetimes and keep its calendar year in a 'year' column.
arxiv_df['update_date'] = pd.to_datetime(arxiv_df['update_date'])
arxiv_df['year'] = arxiv_df['update_date'].dt.year

# Mean author count for each year, as a flat frame with columns
# 'year' and 'num_authors'.
average_author_count_by_year = arxiv_df.groupby('year', as_index=False)['num_authors'].mean()

# Bar chart of the yearly means.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=average_author_count_by_year,
    x='year',
    y='num_authors',
    color='skyblue',
    zorder=0,
)
sns.despine(offset=10, trim=False)

# Labelling.
plt.xlabel('Year')
plt.ylabel('Average author count')
plt.title('Average authors count by year')
plt.xticks(rotation=45)

# Horizontal gridlines only, then lay out, save and show the figure.
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('average_authors_count_by_year.png')
plt.show()

In [None]:
def count_versions(versions_list):
    """Return the number of entries in ``versions_list``.

    The argument may be any sized container; the result is simply its length.
    """
    version_total = len(versions_list)
    return version_total

# Store, for every row, the length of its 'versions' list.
arxiv_df['num_versions'] = arxiv_df['versions'].map(count_versions)

arxiv_df

In [None]:
# Group by year and calculate the average versions count
average_versions_count_by_year = arxiv_df.groupby('year')['num_versions'].mean().reset_index()

# Plot using Seaborn
plt.figure(figsize=(10, 6))
sns.barplot(x='year', y='num_versions', data=average_versions_count_by_year, color='skyblue')
plt.title('Average versions count by year')
plt.xlabel('Year')
# Axis label spelling corrected (was 'Average Vvrsions count').
plt.ylabel('Average versions count')
plt.xticks(rotation=45)
plt.grid(axis='y')  # Add gridlines on y-axis
plt.tight_layout()
plt.savefig('average_versions_count_by_year.png')
plt.show()

In [None]:
!pip install beautifulsoup4
!pip install requests

In [None]:
import requests
from bs4 import BeautifulSoup

# Download the arXiv category taxonomy page. The timeout stops the cell from
# hanging indefinitely when the server does not answer.
response = requests.get("https://arxiv.org/category_taxonomy", timeout=30)

# Only parse the page when the request succeeded (status code 200).
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Element that wraps the taxonomy listing.
    taxonomy_div = soup.find('div', id='category_taxonomy_list')

    # Maps a sub-category code (first word of an <h4> heading) to the text of
    # the father heading it was found under.
    category_dict = {}

    if taxonomy_div is None:
        # Without the container there is nothing to walk; report it instead of
        # failing with an AttributeError on None.
        print("Could not find the 'category_taxonomy_list' element on the page.")
    else:
        # Father categories: <h2 class="accordion-head"> headings followed by
        # <h3 class="column is-one-fifth"> headings. Because the <h3> entries
        # are processed last, they overwrite an <h2> entry for the same code.
        father_categories_h2 = taxonomy_div.find_all('h2', class_='accordion-head')
        father_categories_h3 = taxonomy_div.find_all('h3', class_='column is-one-fifth')
        father_categories = father_categories_h2 + father_categories_h3

        for father_category in father_categories:
            father_category_name = father_category.text.strip()

            # The sub-categories are the <h4> tags inside the element that
            # directly follows the father heading; skip headings without one.
            sibling = father_category.find_next_sibling()
            if sibling is None:
                continue

            for sub_category in sibling.find_all('h4'):
                words = sub_category.text.split()
                if not words:
                    # Empty heading: there is no code to record.
                    continue
                # The code is the first whitespace-separated word of the heading.
                category_dict[words[0]] = father_category_name

    # Display the dictionary
    print(category_dict)
else:
    # Display an error message if the request was not successful
    print("Failed to retrieve webpage content. Status code:", response.status_code)

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the arXiv category taxonomy page
url = "https://arxiv.org/category_taxonomy"

# Fetch the page; the timeout stops the cell from hanging indefinitely when
# the server does not answer.
response = requests.get(url, timeout=30)

# Only parse the page when the request succeeded (status code 200).
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Candidate headings: every <h3> and <h4> element on the page.
    category_elements = soup.find_all(['h3', 'h4'])

    # Maps the first word of a heading to the text of its <span>, with the
    # surrounding parentheses removed.
    categories_dict = {}

    for category_element in category_elements:
        # Headings without a <span> carry no value and are skipped.
        span_element = category_element.find('span')
        if span_element is None:
            continue

        # Guard against an empty heading before taking its first word.
        words = category_element.text.split()
        if not words:
            continue

        category_key = words[0]
        category_value = span_element.text.strip()
        categories_dict[category_key] = category_value.strip(')').strip('(')

    # Print the key-value pairs
    for key, value in categories_dict.items():
        print(f"Key: {key}")
        print(f"Value: {value}")
        print()

else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")

In [None]:
# Split each paper's space-separated category string into a list of codes.
arxiv_df['sep_categories'] = arxiv_df['categories'].str.split()

# Keep only rows whose 'year' is before 2024. The explicit copy makes arxiv_df
# an independent frame, so adding columns to it in later cells does not act on
# a slice of the unfiltered data (SettingWithCopyWarning).
arxiv_df = arxiv_df[arxiv_df['year'] < 2024].copy()

# One row per (paper, category code).
df_exploded = arxiv_df.explode('sep_categories')

# Look up the display name of each code in categories_dict; unknown codes become NaN.
df_exploded['category'] = df_exploded['sep_categories'].str.strip().map(categories_dict)

# Row count before rows with missing values are removed.
print(len(df_exploded))

# Drop rows with any missing value, which removes the codes that had no entry
# in categories_dict.
df_exploded = df_exploded.dropna()
df_exploded

In [None]:
# Number of whitespace-separated category codes listed for each paper.
arxiv_df['count_topics'] = arxiv_df['categories'].str.split().str.len()
arxiv_df

In [None]:
# Mean number of categories per paper for each year, as a flat frame with
# columns 'year' and 'count_topics'.
average_topic_count_by_year = arxiv_df.groupby('year', as_index=False)['count_topics'].mean()

# Bar chart of the yearly means.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=average_topic_count_by_year,
    x='year',
    y='count_topics',
    color='skyblue',
    zorder=0,
)

# Labelling.
plt.xlabel('Year')
plt.ylabel('Average categories count')
plt.title('Average categories count by year')
plt.xticks(rotation=45)

# Horizontal gridlines only, then lay out, save and show the figure.
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('average_categories_count_by_year.png')
plt.show()

In [None]:
# Print the per-year averages as plain text.
print(average_topic_count_by_year)

In [None]:
# Look up the father category of every category code. Series.map yields NaN
# for a code that is missing from category_dict, whereas indexing the dict
# inside a lambda would abort the whole cell with a KeyError.
df_exploded['top_category'] = df_exploded['sep_categories'].map(category_dict)
df_exploded

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Count the exploded rows for every (year, top_category) pair.
publications_by_category_year = (
    df_exploded
    .groupby(['year', 'top_category'])
    .size()
    .reset_index(name='count')
)
# Convert the year to a string for the x axis.
publications_by_category_year['year'] = publications_by_category_year['year'].astype(str)

# One line per top-level category.
sns.set(style="whitegrid")
plt.figure(figsize=(12, 8))
sns.lineplot(
    data=publications_by_category_year,
    x='year',
    y='count',
    hue='top_category',
    markers=True,
    dashes=False,
    lw=2,
)

# Labelling and legend.
plt.xlabel('Year')
plt.ylabel('Number of publications')
plt.title('Number of publications by category each year')
plt.legend(title='Category', loc='upper left')
plt.xticks(rotation=45)

# Horizontal gridlines only, then lay out, save and show the figure.
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('Number_of_Publications_by_Category_by_Year.png')
plt.show()

In [None]:
# Keep only the paper id, category name and year of every exploded row.
connected_cat = df_exploded.loc[:, ['id', 'category', 'year']]

connected_cat

In [None]:
# Number of rows for every (year, category) pair, ordered by year and then by
# descending count so that head(n) per year yields that year's n largest categories.
publication_counts = connected_cat.groupby(['year', 'category']).size().reset_index(name='count')
publication_counts_sorted = publication_counts.sort_values(by=['year', 'count'], ascending=[True, False])
top_categories_each_year = publication_counts_sorted.groupby('year').head(3).reset_index()
top_10_categories_each_year = publication_counts_sorted.groupby('year').head(10).reset_index()

# Replace " - " inside category names with a single space. The result is
# assigned back to the column: an in-place replace on a column selection is
# chained assignment and is not guaranteed to modify the frame itself.
top_10_categories_each_year['category'] = top_10_categories_each_year['category'].replace(" - ", ' ', regex=True)
top_categories_each_year['category'] = top_categories_each_year['category'].replace(" - ", ' ', regex=True)

# Pivot the data to create stacked bar plot: one row per year, one column per category.
pivot_df = top_categories_each_year.pivot(index='year', columns='category', values='count')

# Plot stacked bar plot
pivot_df.plot(kind='bar', stacked=True, colormap='Set3', width=0.8, figsize=(12, 6))
plt.title('Top 3 categories each year')
plt.xlabel('Year')
plt.ylabel('Number of publications')
plt.xticks(rotation=45)
# The legend lists categories, so it is titled accordingly (the previous text
# was the placeholder 'Legend title').
plt.legend(title='Category', loc='upper left')

plt.tight_layout()
plt.savefig('Top 3 categories each year.png')
plt.show()

In [None]:
# Print the three largest categories of each year as plain text.
print(top_categories_each_year)

In [None]:
# Display the ten largest categories of each year.
top_10_categories_each_year

In [None]:
# !pip install -U kaleido
# !pip install fiftyone -y

In [None]:
!pip install plotly
!conda install -c conda-forge python-kaleido -y

In [None]:
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px

# Label used inside the treemap tiles: the category name with every space
# replaced by '<br>' (presumably so each word is rendered on its own line).
top_10_categories_each_year['v_cat'] = top_10_categories_each_year['category'].str.replace(' ', '<br>')

# Treemap with one branch per year and one leaf per category, sized by count.
# (A placeholder go.Figure() that was immediately overwritten has been removed.)
fig = px.treemap(top_10_categories_each_year, path=['year', 'v_cat'], values='count', title='Treemap of Categories by Year',branchvalues='total', width=1600, height=1200)
fig.update_traces(textinfo='label+value', selector=dict(type='treemap'),textfont_size=12, pathbar_textfont_size=12)
fig.show()

# Export a static image of the figure.
fig.write_image("test1.png",scale=6, width=1600, height=900)

In [None]:
# Display the (id, category, year) frame.
connected_cat

In [None]:
# Most frequent category of each year; when several categories tie, the first
# value returned by mode() is used.
most_common_category = connected_cat.groupby('year')['category'].agg(lambda x: x.mode()[0]).reset_index()

# Plot using Seaborn (this line used to be bare text, which is a SyntaxError).
# NOTE(review): x is numeric and y holds category names, so seaborn may draw
# horizontal bars whose length is the year — confirm the chart reads as intended.
plt.figure(figsize=(10, 6))
sns.barplot(x='year', y='category', data=most_common_category, palette='viridis')
plt.title('Most Common Category Each Year')
plt.xlabel('Year')
plt.ylabel('Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Group the (id, category, year) rows by paper id; each group holds the
# categories of one paper.
network_connected_cat = connected_cat.groupby('id')

In [None]:
import networkx as nx
import plotly.graph_objects as go
from itertools import combinations

# Build an undirected co-occurrence graph: two categories are connected when
# at least one paper is listed under both of them.
G = nx.Graph()

# Collect each paper's categories once and add every pair as an edge. This
# replaces repeated get_group() lookups inside the pair loop, which rebuilt the
# same per-paper frame for every single pair.
for paper_categories in network_connected_cat['category'].apply(list):
    if len(paper_categories) > 1:
        G.add_edges_from(combinations(paper_categories, 2))

# Calculate betweenness centrality for each node
node_betweenness = nx.betweenness_centrality(G)

# Sort nodes by betweenness centrality and keep the 40 highest
top_nodes = sorted(G.nodes(), key=lambda x: node_betweenness[x], reverse=True)[:40]

# Subgraph induced by the top nodes (only edges between two top nodes remain)
G_sub = G.subgraph(top_nodes)

# Spring layout with a large k for a sparser picture; the seed makes the node
# positions reproducible between runs.
pos_sub = nx.spring_layout(G_sub, k=2, seed=42)

# Edge coordinates: every edge contributes its two end points followed by None,
# so Plotly draws separate segments instead of one continuous line.
edge_x = []
edge_y = []
for source, target in G_sub.edges():
    x0, y0 = pos_sub[source]
    x1, y1 = pos_sub[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Node coordinates, labels and raw centrality values
node_x = []
node_y = []
node_text = []
node_size = []
for node in G_sub.nodes():
    x, y = pos_sub[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(node)
    node_size.append(node_betweenness[node])

# Marker size grows linearly with centrality, starting at min_size and capped at max_size
scaling_factor = 10000
min_size = 3
max_size = 100
scaled_node_size = [min(min_size + (s * scaling_factor), max_size) for s in node_size]

# Create edge trace
edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# Create node trace
node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    text=node_text,
    mode='markers+text',
    hoverinfo='none',
    marker=dict(
        color='#CB1111',
        size=scaled_node_size,
        line=dict(width=2)
    )
)

# Create figure. The title font is set through layout.title.font, which
# replaces the deprecated `titlefont` attribute.
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title=dict(text='Top 40 Connected Categories', font=dict(size=8)),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=True),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=True)
    )
)

# Show the Plotly figure (plt.show() only affects matplotlib figures).
fig.show()

In [None]:
# Export the network figure as a static PNG.
pio.write_image(
    fig,
    'top_40_connected_categories.png',
    scale=6,
    width=1100,
    height=700,
)

In [None]:
# Sort nodes based on betweenness centrality values, highest first
sorted_nodes = sorted(node_betweenness.items(), key=lambda x: x[1], reverse=True)

# Select top 10 nodes
top_10_nodes = sorted_nodes[:10]

# Extract node labels and betweenness centrality values
node_labels = [node[0] for node in top_10_nodes]
centrality_values = [node[1] for node in top_10_nodes]

# Plot bar graph
plt.figure(figsize=(10, 6))
plt.bar(node_labels, centrality_values, color='lightblue')
# Title spelling corrected (was 'Categoires').
plt.title('Top 10 Categories of Betweenness Centrality')
plt.xlabel('Category')
plt.ylabel('Betweenness Centrality')
plt.xticks(rotation=45)
plt.tight_layout()

# Save the figure
plt.savefig('top_10_nodes_betweenness_centrality.png')

# Show the figure
plt.show()

In [None]:
!pip install bertopic
!pip install nltk

In [None]:
# Rows whose category name is exactly 'Machine Learning', reduced to the
# columns needed for topic modelling.
# NOTE(review): a paper exploded into several codes that share this display
# name would appear once per code — confirm whether duplicates matter here.
is_machine_learning = df_exploded['category'] == 'Machine Learning'
df_exp_cs = df_exploded.loc[is_machine_learning, ['id', 'abstract', 'year', 'category']]

print(len(df_exp_cs))

In [None]:
# Display the 'Machine Learning' subset.
df_exp_cs

In [None]:
!pip install BERTopic

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bertopic import BERTopic
import vaex

# Download NLTK resources. word_tokenize relies on the Punkt tokenizer data:
# recent NLTK releases load it from 'punkt_tab', older ones from 'punkt', so
# both are requested (an unknown resource name only prints an error).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words, kept in a set for fast membership tests while filtering tokens
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lower-case ``text``, tokenize it and return the kept tokens joined by spaces.

    A token is kept only when it is purely alphabetic and is not contained in
    the module-level ``stop_words`` set.
    """
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Convert the pandas frame to a vaex DataFrame and add a 'cleaned_abstract'
# column defined by applying preprocess_text to the 'abstract' column.
vaex_arxiv_df = vaex.from_pandas(df_exp_cs)
vaex_arxiv_df['cleaned_abstract'] = vaex_arxiv_df['abstract'].apply(preprocess_text)

In [None]:
# Materialize the 'cleaned_abstract' column as a plain Python list, the input
# format used by the vectorizers and topic models below.
abstracts_list = vaex_arxiv_df['cleaned_abstract'].tolist()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import matplotlib.pyplot as plt

# Bag-of-words counts over the cleaned abstracts, with the vocabulary capped
# at 1000 terms and English stop words removed.
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(abstracts_list)

# Fit LDA using the online learning method.
num_topics = 10
batch_size = 1000
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    learning_method='online',
    batch_size=batch_size,
    random_state=42,
)
lda_model.fit(X)

# For every topic, the ten terms with the largest component weight, heaviest first.
feature_names = vectorizer.get_feature_names_out()
topic_words = [
    [feature_names[i] for i in topic.argsort()[::-1][:10]]
    for topic in lda_model.components_
]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Concatenate the top-word lists of all topics into one flat list.
all_topic_words = []
for topic in topic_words:
    all_topic_words.extend(topic)

# Number of topics in which each word appears among the top words.
word_counts = Counter(all_topic_words)

# (word, frequency) pairs ordered from most to least frequent.
sorted_words = word_counts.most_common()

# Split the first 20 pairs into parallel lists for plotting.
top_words = []
word_frequencies = []
for word, frequency in sorted_words[:20]:
    top_words.append(word)
    word_frequencies.append(frequency)

# Horizontal bar chart with the most frequent word at the top.
plt.figure(figsize=(10, 8))
plt.barh(top_words, word_frequencies, color='skyblue')
plt.title('Top common words across all topics in machine learning')
plt.xlabel('Frequency')
plt.ylabel('Words')
plt.gca().invert_yaxis()
plt.savefig('Most_common_topics_in_Machine_Learing.png')
plt.show()

In [None]:
from bertopic import BERTopic
import random


# Draw a reproducible random subset of at most 10,000 abstracts.
# random.sample already picks without replacement, so no prior shuffle is
# needed; skipping it also leaves abstracts_list in its original order for the
# cells that reuse it. The size is capped so a smaller corpus does not raise.
sample_size = min(10000, len(abstracts_list))
subset_abstracts_list = random.Random(42).sample(abstracts_list, sample_size)

# Initialize and fit the BERTopic model with the subset
topic_model = BERTopic(verbose=True, embedding_model="paraphrase-MiniLM-L12-v2", min_topic_size=50)
topics, _ = topic_model.fit_transform(subset_abstracts_list)

# Get information about the topics
topic_info = topic_model.get_topic_info()
print(len(topic_info))  # Number of rows in the topic table

In [None]:
# Display the first ten rows of the topic table.
topic_model.get_topic_info().head(10)

In [None]:
# Keep only the first ten rows of the topic table.
topic_info = topic_model.get_topic_info().head(10)  # Get the top 10 topic information

# Save those rows to a CSV file without the index column
topic_info.to_csv("topic_info.csv", index=False)

In [None]:
# Display the per-topic bar charts (top_n_topics=11).
topic_model.visualize_barchart(top_n_topics=11)

In [None]:
# Build the per-topic bar chart figure again (requires the fitted topic_model)
# so it can be exported.
barchart = topic_model.visualize_barchart(top_n_topics=11)

# Save the visualization as a 1920x1080 image file
barchart.write_image("topic_barchart.png", width=1920, height=1080)

In [None]:
# Display the topic visualization produced by BERTopic.
topic_model.visualize_topics()

In [None]:
# Build the topic visualization again so it can be exported.
visualization = topic_model.visualize_topics()

# Save the visualization as a 1920x1080 image file
visualization.write_image("topic_visualization.png", width=1920, height=1080)

In [None]:
# Full topic table (this rebinds topic_info, which an earlier cell had cut to ten rows).
topic_info = topic_model.get_topic_info()

# Save the table as a tab-separated text file without the index column
topic_info.to_csv("topic_data.txt", sep='\t', index=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

# Vectorize the cleaned abstracts (abstracts_list) using TF-IDF, with the
# vocabulary capped at 1000 terms and English stop words removed.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(abstracts_list)

# Apply LDA
num_topics = 10  # Adjust as needed
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(X)

# Per-topic term scores: each row of the LDA components multiplied by that row's maximum.
# NOTE(review): these are scaled LDA component weights rather than raw TF-IDF
# values; the existing axis/title wording is kept — confirm it is what is meant.
topic_term_tfidf = lda_model.components_ * lda_model.components_.max(axis=1)[:, np.newaxis]

# Look the vocabulary up once instead of once per topic.
feature_names = vectorizer.get_feature_names_out()
top_n_terms = 10

# One panel per topic. Drawing every topic on a single shared axes would put
# all ten bars of a topic at the same y position and overwrite the tick labels
# on each call, leaving an unreadable chart.
n_cols = 5
n_rows = -(-num_topics // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for topic_idx in range(num_topics):
    # Indices of the highest-scoring terms of this topic, best first.
    sorted_indices = topic_term_tfidf[topic_idx].argsort()[::-1][:top_n_terms]
    sorted_terms = [feature_names[idx] for idx in sorted_indices]
    sorted_scores = topic_term_tfidf[topic_idx, sorted_indices]

    ax = flat_axes[topic_idx]
    ax.barh(sorted_terms, sorted_scores, alpha=0.7)
    ax.invert_yaxis()  # highest-scoring term at the top
    ax.set_title(f"Topic {topic_idx}")
    ax.set_xlabel('TF-IDF Score')

# Hide panels that have no topic when num_topics is not a multiple of n_cols.
for unused_ax in flat_axes[num_topics:]:
    unused_ax.set_visible(False)

# Label the y axis once per row, on the left-most panel.
for row_axes in axes:
    row_axes[0].set_ylabel('Terms')

fig.suptitle('Term TF-IDF Distribution per Assigned Topic Category')
fig.tight_layout(rect=(0, 0, 1, 0.96))  # leave room for the figure title
fig.savefig('tf-idf.png')

plt.show()

Topic models