# Load data

In [1]:
import hopsworks

project = hopsworks.login()
fs = project.get_feature_store()

  from .autonotebook import tqdm as notebook_tqdm


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/196758
Connected. Call `.close()` to terminate connection gracefully.


## Load clustered papers

In [2]:
clustered_fg = fs.get_feature_group("acm_papers_clustered_last_year", 1)
papers_df = clustered_fg.read(read_options={"use_hive": True})
papers_df



Finished: Reading data from Hopsworks, using Hive (5.04s) 


Unnamed: 0,citation,abstract,publication_date,abstract_clean,cluster,x_coord,y_coord
0,"@article{10.1016/j.neucom.2023.01.006,\nauthor...",A growing worldwide consensus agrees that a gl...,2023-03-14,growing worldwide consensus agrees global ener...,2,26.580717,-23.146490
1,"@article{10.1016/j.knosys.2023.110323,\nauthor...",Cross-domain arrhythmia classification (CAC) a...,2023-03-15,cross domain arrhythmia cac aims transfer suff...,1,-14.391303,-13.226600
2,"@inproceedings{10.1145/3608298.3608302,\nautho...",The use of functional magnetic resonance imagi...,2023-10-18,use functional magnetic resonance imaging fmri...,0,7.290242,12.193990
3,"@article{10.1016/j.advengsoft.2022.103354,\nau...",Highlights\n•\nThe highlights of the article a...,2023-02-01,highlights • highlights article given kind per...,2,-0.356899,19.317638
4,"@article{10.1016/j.neunet.2022.11.006,\nauthor...",Gated spiking neural P (GSNP) model is a recen...,2023-01-01,gated spiking neural p gsnp recently developed...,7,5.531752,-16.695017
...,...,...,...,...,...,...,...
953,"@article{10.1016/j.compeleceng.2023.108604,\na...",Highlights\n•\nA new proposed algorithm for mu...,2023-03-01,highlights • new proposed multi recognition st...,6,2.876340,8.250215
954,"@inproceedings{10.1145/3573942.3573954,\nautho...",Ensembling is a popular and effective method f...,2023-05-16,ensembling popular effective improving ml prov...,5,-4.027958,23.535185
955,"@inproceedings{10.1145/3571560.3571565,\nautho...",Higher cognitive process efforts may result in...,2023-01-12,higher cognitive process efforts result mental...,5,8.739161,12.246270
956,"@article{10.1016/j.eswa.2022.119231,\nauthor =...",Approval of credit application is one of the c...,2023-04-01,approval credit application censorious busines...,2,25.051405,6.780630


### Parse citation to extract title and authors

In [3]:
import re


def extract_bibtex_field(bibtex_string, field):
    # Define the regular expression pattern for the specified field
    pattern = re.compile(rf"{field}\s*=\s*{{([^{{}}]*)}}", re.IGNORECASE)

    # Search for matches in the BibTeX string
    match = pattern.search(bibtex_string)

    # Extract the field value if a match is found
    value = match.group(1) if match else None

    return value


papers_df["title"] = papers_df["citation"].apply(
    lambda x: extract_bibtex_field(x, "title")
)
papers_df["author"] = papers_df["citation"].apply(
    lambda x: extract_bibtex_field(x, "author")
)
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   citation          958 non-null    object 
 1   abstract          958 non-null    object 
 2   publication_date  958 non-null    object 
 3   abstract_clean    958 non-null    object 
 4   cluster           958 non-null    int64  
 5   x_coord           958 non-null    float64
 6   y_coord           958 non-null    float64
 7   title             956 non-null    object 
 8   author            834 non-null    object 
dtypes: float64(2), int64(1), object(6)
memory usage: 67.5+ KB


## Load cluster topics

In [4]:
keywords_fg = fs.get_feature_group("acm_papers_cluster_keywords_last_year", 1)
keywords_df = keywords_fg.read(read_options={"use_hive": True})
# sort by cluster
keywords_df.sort_values(by=["cluster"], inplace=True)
topics = keywords_df["keywords"].values.tolist()
topics



Finished: Reading data from Hopsworks, using Hive (1.37s) 


['conclusion, signal, signals, processing, classify, section, skip, proposed, objective, extraction, background, different, research, algorithms, random',
 'domains, source, adaptation, time, distribution, challenge, unsupervised, compared, generalization, framework, different, use, multiple, pseudo, information, art, knowledge',
 'proposed, selection, support, svm, rule, traffic, new, time, different, problem, ensemble, non, clients, recognition, level, credit, examples, early, research, shot, adversarial, software, novel, shape, attributes, number, knowledge, defined',
 'graph, risk, decision, instances, domain, proposed, improvement, node, graphs, trees, making, decisions, pooling, unknown, probability, long, uncertainty, open, bayesian, function, semi, layer, estimation, level, different, parameters, multi, dependencies, identification, including, second, available',
 'deep, cnn, images, level, multi, proposed, wise, image, instance, different, attention, information, diagnosis, ar

# Plot the clustered data

In [5]:
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    CustomJS,
    Slider,
    TapTool,
    TextInput,
)
from bokeh.palettes import Category20
from bokeh.transform import linear_cmap
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import TextInput, Div, Paragraph
from bokeh.layouts import row, layout

# show on notebook
output_notebook()

# data sources
source = ColumnDataSource(
    data=dict(
        x=papers_df["x_coord"],
        y=papers_df["y_coord"],
        x_backup=papers_df["x_coord"],
        y_backup=papers_df["y_coord"],
        abstract=papers_df["abstract"],
        title=papers_df["title"],
        author=papers_df["author"],
        publication_date=papers_df["publication_date"],
        cluster=papers_df["cluster"],
        labels=["C-" + str(x) for x in papers_df["cluster"]],
    )
)

max_cluster_value = papers_df["cluster"].max()
min_cluster_value = papers_df["cluster"].min()
clusters_count = max_cluster_value - min_cluster_value + 1

# hover over information
hover = HoverTool(
    tooltips=[
        ("Abstract", "@abstract{safe}"),
        ("Publication Date", "@publication_date"),
        ("Title", "@title"),
        ("Author", "@author"),
        ("Cluster", "@cluster"),
    ],
    point_policy="follow_mouse",
)

# map colors
mapper = linear_cmap(
    field_name="cluster",
    palette=Category20[clusters_count],
    low=min_cluster_value,
    high=max_cluster_value,
)

# prepare the figure
plot = figure(
    width=500,
    height=500,
    tools=[hover, "pan", "wheel_zoom", "box_zoom", "reset", "save", "tap"],
    title="Clustering of the ACM papers on Supervised Learning by Classification",
    toolbar_location="above",
)

# plot settings
plot.scatter(
    "x",
    "y",
    size=5,
    source=source,
    fill_color=mapper,
    line_alpha=0.3,
    line_color="black",
    legend="labels",
)
plot.legend.background_fill_alpha = 0.6





### Widgets

In [6]:
from plot.callbacks import input_callback, selected_code

# Keywords
text_banner = Paragraph(
    text="Keywords: Slide to specific cluster to see the keywords.", height=25
)
input_callback_1 = input_callback(plot, source, text_banner, topics)

# currently selected article
div_curr = Div(text="""Click on a plot to see the info about the article.""", width=150)
callback_selected = CustomJS(
    args=dict(source=source, current_selection=div_curr), code=selected_code()
)
tap_tool = plot.select(type=TapTool)
tap_tool.callback = callback_selected

# WIDGETS
slider = Slider(
    start=0,
    end=clusters_count,
    value=clusters_count,
    step=1,
    title="Cluster #",
    callback=input_callback_1,
)
keyword = TextInput(title="Search:", callback=input_callback_1)

# pass call back arguments
input_callback_1.args["text"] = keyword
input_callback_1.args["slider"] = slider

### Style

In [7]:
from plot.plot_text import (
    header,
    description,
    description_search,
    description_slider,
)

header.sizing_mode = "stretch_width"
header.style = {"color": "#2e484c", "font-family": "Julius Sans One, sans-serif;"}
header.margin = 5

description.style = {
    "font-family": "Helvetica Neue, Helvetica, Arial, sans-serif;",
    "font-size": "1.1em",
}
description.sizing_mode = "stretch_width"
description.margin = 5

description_slider.style = {
    "font-family": "Helvetica Neue, Helvetica, Arial, sans-serif;",
    "font-size": "1.1em",
}
description_slider.sizing_mode = "stretch_width"

description_search.style = {
    "font-family": "Helvetica Neue, Helvetica, Arial, sans-serif;",
    "font-size": "1.1em",
}
description_search.sizing_mode = "stretch_width"
description_search.margin = 5

slider.sizing_mode = "stretch_width"
slider.margin = 15

keyword.sizing_mode = "scale_both"
keyword.margin = 15

div_curr.style = {
    "color": "#BF0A30",
    "font-family": "Helvetica Neue, Helvetica, Arial, sans-serif;",
    "font-size": "1.1em",
}
div_curr.sizing_mode = "scale_both"
div_curr.margin = 20

text_banner.style = {
    "color": "#0269A4",
    "font-family": "Helvetica Neue, Helvetica, Arial, sans-serif;",
    "font-size": "1.1em",
}
text_banner.sizing_mode = "stretch_width"
text_banner.margin = 20
text_banner.height = 75

plot.sizing_mode = "scale_both"
plot.margin = 5

r = row(div_curr, text_banner)
r.sizing_mode = "stretch_width"

### Layout

In [8]:
from bokeh.plotting import output_file, save


l = layout(
    [
        [header],
        [description],
        [description_slider, description_search],
        [slider, keyword],
        [text_banner],
        [plot],
        [div_curr],
    ]
)
output_file('docs/test.html')
show(l)

2024-01-13 20:47:55,915 INFO: Session output file 'docs/test.html' already exists, will be overwritten.


'/Users/edward.nagy/university/id2223-project/docs/test.html'