In [None]:
import json
import matplotlib.pyplot as plt 
import networkx as nx
import plotly.graph_objs as go
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
from ipywidgets import widgets, Output, VBox
import geopandas as gpd

In [None]:
%%html

<style>
    body {
        background-color: #f9f9f9;
        font-family: "Arial", sans-serif;
    }

    .plot-container {
        border: 1px solid #ccc;
        padding: 15px;
        border-radius: 5px;
        background-color: #fff;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1); /* Add shadow effect */
    }

    .plot-container .plotly {
        box-shadow: none; /* Remove shadow effect from Plotly graph itself */
    }

    .dash-db-content {
        font-family: 'Arial', sans-serif;
        color: #333;
    }

    h1, h2, h3 {
        color: #333;
    }

    .graph-box {
        border: 1px solid #ccc;
        border-radius: 5px;
        background-color: #fff;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1); /* Add shadow effect */
        margin-bottom: 20px; /* Add some spacing between graph boxes */
        padding: 15px; /* Add padding inside the graph box */
    }
</style>

In [140]:
# Load the JSON data
with open(r'professors_data.json', 'r') as file:
    data = json.load(file)

In [141]:
professors_data = data
professors_names = list(professors_data.keys())
publications_counts = [professors_data[name]['Paper_Count'] for name in professors_names]
citation_counts = [professors_data[prof]['Citation_Count'] for prof in professors_data]
professors_country = [{'Name': key, 'Country': value['Country']} for key, value in professors_data.items()]

# Graphs

## Dashboard 1: Professors

### Publication counts per professor

In [142]:
# Combine, sort by publication count in descending order, and unzip
combined = sorted(zip(professors_names, publications_counts), key=lambda x: x[1], reverse=True)
sorted_professors_names, sorted_publications_counts = zip(*combined)

fig = go.Figure(data=[
    go.Bar(x=sorted_professors_names, y=sorted_publications_counts, marker_color='skyblue')
])

fig.update_layout(
    title='Publication Counts per Professor',
    height=700, 
    xaxis_tickangle=-90,
    xaxis_title="Professor",
    yaxis_title="Number of Publications",
    template='plotly_white'
)

fig.show()

### H-index distribution

In [143]:
h_indices = [professors_data[name]['Index_H'] for name in professors_names]

fig = go.Figure(data=[
    go.Histogram(x=h_indices, nbinsx=10, marker_color='green', opacity=0.7)
])

fig.update_layout(
    title='Histogram of H-indices',
    xaxis_title="H-index",
    yaxis_title="Frequency",
    template='plotly_white',
    bargap=0  # Controls the gap between bars
)

fig.show()

### Citation count overtime

In [144]:
# Extracting year and citation data
publication_years = []
citation_counts = []
for prof_data in data.values():
    for paper in prof_data.get('Papers', []):
        year = paper.get('Year_of_Publication')
        if year:
            publication_years.append(year)
            citation_counts.append(paper.get('Citation_Count', 0))

# Calculate average citation count per year
avg_citation_counts = {}
for year, citation_count in zip(publication_years, citation_counts):
    if year in avg_citation_counts:
        avg_citation_counts[year].append(citation_count)
    else:
        avg_citation_counts[year] = [citation_count]

years = sorted(avg_citation_counts.keys())
avg_citations = [np.mean(avg_citation_counts[year]) for year in years]

# Create Plotly figure
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=years,
        y=avg_citations,
        mode='lines+markers',
        marker=dict(color='orange', size=8),
        line=dict(color='orange')
    )
)

fig.update_layout(
    title='Citation Impact Over Time',
    xaxis_title='Year',
    yaxis_title='Average Citation Count',
    template='plotly_white'
)

fig.show()


### Gender Distribution of Authors

In [145]:
# Extracting gender counts
gender_counts = {}
for prof_data in data.values():
    gender = prof_data.get('Gender')
    if gender:
        if gender in gender_counts:
            gender_counts[gender] += 1
        else:
            gender_counts[gender] = 1

# Create Plotly pie chart
fig = go.Figure(data=[go.Pie(labels=list(gender_counts.keys()), 
                             values=list(gender_counts.values()), 
                             hole=.3,  # Creates a donut-shaped pie chart
                             hoverinfo='label+percent', 
                             textinfo='value')])

fig.update_layout(
    title='Gender Distribution of Authors',
    template='plotly_white'
)

fig.show()


## Dashboard 2: Analysis of fields of study

In [146]:
# Publication Trends by Field
field_publications = {}
for prof_data in data.values():
    for paper in prof_data.get('Papers', []):
        fields = paper.get('Fields_of_Study', []) if paper.get('Fields_of_Study') is not None else []
        year = paper.get('Year_of_Publication')
        for field in fields:
            if field not in field_publications:
                field_publications[field] = {year: 1}
            else:
                if year in field_publications[field]:
                    field_publications[field][year] += 1
                else:
                    field_publications[field][year] = 1
                    
fields_growth_rate = {}
max_years = max(len(publications) for publications in field_publications.values())
for field, publications in field_publications.items():
    years = sorted(publications.keys())
    counts = [publications[year] for year in years]
    growth_rate = [(counts[i] - counts[i - 1]) / counts[i - 1] * 100 if i > 0 else 0 for i in range(len(counts))]
    growth_rate += [0] * (max_years - len(growth_rate))
    fields_growth_rate[field] = growth_rate[:max_years]

### Comparison between temporal analysis of fields growth 

In [147]:
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display

# Initialize the figure and output widget
fig = go.Figure()
plot_output = widgets.Output()
display(plot_output)  # This needs to be called only once

# Function to update the plot based on dropdown selections
def update_plot(field1, field2):
    with plot_output:
        plot_output.clear_output(wait=True)  # Clear the current output to ensure no old plots remain

        # Initialize a new figure to start fresh
        fig = go.Figure()

        years = list(range(1, max_years + 1))
        if field1:  # Add the first field if selected
            fig.add_trace(go.Scatter(x=years, y=fields_growth_rate[field1],
                                    mode='lines+markers', name=field1))
        if field2:  # Add the second field if selected
            fig.add_trace(go.Scatter(x=years, y=fields_growth_rate[field2],
                                    mode='lines+markers', name=field2))

        # Update layout only if at least one field is selected
        if field1 or field2:
            fig.update_layout(
                title='Temporal Analysis of Field Growth',
                xaxis_title='Year',
                yaxis_title='Growth Rate (%)',
                legend_title='Field',
                template='plotly_white'
            )
            fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightPink')
            fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightBlue')

        fig.show()

# Dropdown widgets for field selection
dropdown1 = widgets.Dropdown(options=[None] + list(fields_growth_rate.keys()), value=None, description='Field 1:', disabled=False)
dropdown2 = widgets.Dropdown(options=[None] + list(fields_growth_rate.keys()), value=None, description='Field 2:', disabled=False)

# Function to handle dropdown changes
def on_dropdown_change(change):
    update_plot(dropdown1.value, dropdown2.value)

# Observe changes in the dropdowns
dropdown1.observe(on_dropdown_change, names='value')
dropdown2.observe(on_dropdown_change, names='value')

# Arrange the dropdowns and the plot output in a vertical layout
dashboard = widgets.VBox([dropdown1, dropdown2])
display(dashboard)

# Initialize with no fields selected
update_plot(None, None)

Output()

VBox(children=(Dropdown(description='Field 1:', options=(None, 'Computer Science', 'Engineering', 'Mathematics…

### Citation count for each field of study

In [148]:
# Comparison of Citation Counts Across Fields
field_citation_counts = {}
for field, publications in field_publications.items():
    counts = sum(publications.values())
    field_citation_counts[field] = counts
    
# Sorting the fields by citation counts in descending order
sorted_fields = sorted(field_citation_counts.items(), key=lambda x: x[1], reverse=True)
sorted_field_names = [field[0] for field in sorted_fields]
sorted_citation_counts = [field[1] for field in sorted_fields]
    
# Creating the bar chart
fig = px.bar(x=sorted_field_names, y=sorted_citation_counts,
             labels={'x': 'Field of Study', 'y': 'Total Citation Counts'},
             title='Comparison of Citation Counts Across Fields')

# Enhancing the chart appearance
fig.update_layout(xaxis_tickangle=-45)

# Show the plot
fig.show()

In [150]:
import plotly.graph_objects as go
from ipywidgets import widgets, Output, VBox

# Plotting widget output
plot_output = Output()
display(plot_output)

def update_plot(selected_field):
    with plot_output:
        plot_output.clear_output(wait=True)  # Clear the current output
        fig = go.Figure()

        # Add data for selected field if any
        if selected_field:
            years = sorted(field_publications[selected_field].keys())
            counts = [field_publications[selected_field][year] for year in years]
            fig.add_trace(go.Scatter(x=years, y=counts, mode='lines+markers', name=selected_field))
        
        # Update plot layout
        fig.update_layout(
            title='Publication Trends by Field',
            xaxis_title='Year',
            yaxis_title='Number of Publications',
            legend_title='Field',
            template='plotly_white'
        )
        fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightPink')
        fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightBlue')

        fig.show()

# Dropdown widget for field selection
dropdown = widgets.Dropdown(
    options=[None] + list(field_publications.keys()),
    value=None,
    description='Field:',
    disabled=False
)

def on_dropdown_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        update_plot(change['new'])

# Observe changes in the dropdown
dropdown.observe(on_dropdown_change)

# Arrange the dropdown and the plot output in a vertical layout
dashboard = VBox([dropdown])
display(dashboard)

# Initialize with no field selected
update_plot(None)


Output()

VBox(children=(Dropdown(description='Field:', options=(None, 'Computer Science', 'Engineering', 'Mathematics',…

# Interactive Visualizations

### Ego-Network

In [111]:
import json
import networkx as nx
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display, clear_output

# Create an empty figure widget with plotly
fig = go.FigureWidget()

# Dropdown widget
dropdown = widgets.Dropdown(
    options=list(professors_data.keys()),
    value=list(professors_data.keys())[0],
    description='Professor:',
    disabled=False,
)

# Function to update the network graph based on selected professor
def update_network(change):
    selected_prof = change['new']
    G = nx.Graph()
    professor_data = professors_data.get(selected_prof, {})
    co_authors = professor_data.get('Co-authors', {})

    # Populate your graph with edges
    for co_author in co_authors.keys():
        G.add_edge(selected_prof, co_author)

    # Position nodes using one of the layout options in NetworkX
    pos = nx.spring_layout(G)

    with fig.batch_update():
        fig.data = []  # Clear existing data
        
        # Extract node positions for plotting
        edge_x = []
        edge_y = []
        for edge in G.edges():
            x0, y0 = pos[edge[0]]
            x1, y1 = pos[edge[1]]
            edge_x.extend([x0, x1, None])  # line breaks
            edge_y.extend([y0, y1, None])  # line breaks

        # Create edge traces
        fig.add_trace(go.Scatter(
            x=edge_x, y=edge_y,
            line=dict(width=0.5, color='#888'),
            hoverinfo='none',
            mode='lines'))

        # Create node traces
        node_x = []
        node_y = []
        for node in G.nodes():
            x, y = pos[node]
            node_x.append(x)
            node_y.append(y)

        fig.add_trace(go.Scatter(
            x=node_x, y=node_y,
            mode='markers',
            hoverinfo='text',
            marker=dict(
                showscale=True,
                color=[len(G.adj[node]) for node in G.nodes()],
                size=10,
                line=dict(width=2))))

        # Update layout
        fig.update_layout(
            title='<br>Network graph of co-authorships',
            titlefont_size=16,
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20,l=5,r=5,t=40),
            annotations=[ dict(
                text="This graph represents the co-author network of the selected professor.",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.005, y=-0.002 ) ],
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
        )

        # Set text for nodes
        for node in G.nodes():
            fig.data[1].text = list(G.nodes())

# Observe dropdown for changes
dropdown.observe(update_network, names='value')

# Display dropdown
display(dropdown)

# Call update function initially to display the initial graph
update_network({'new': dropdown.value})

# Display graph
display(fig)

Dropdown(description='Professor:', options=('Irene Finocchi', 'Michela Altieri', 'Lakshmi Balachandran Nair', …

FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color': '#888', 'width': 0.5},
              'mode': 'lines',
              'type': 'scatter',
              'uid': 'e6616e65-fbd1-4edb-a29b-1645c321a155',
              'x': [-0.006247990498508707, 0.10841492246363334, None,
                    -0.006247990498508707, -0.16551240261880937, None,
                    -0.006247990498508707, -0.9747547898599398, None,
                    -0.006247990498508707, -0.4818646180052734, None,
                    -0.006247990498508707, 0.34165144719826335, None,
                    -0.006247990498508707, 0.5608009772430983, None,
                    -0.006247990498508707, 0.862039699828944, None,
                    -0.006247990498508707, 0.6973161937088883, None,
                    -0.006247990498508707, 0.39061222009719687, None,
                    -0.006247990498508707, -0.4300285153141932, None,
                    -0.006247990498508707, 0.5096237549858077, None,
    

### Tree-map of citations distribution

In [112]:
rows = []
others_citations = 0

for professor_name, details in data.items():
    total_citations = details["Citation_Count"]

    # Track the total citations of papers to subtract from the professor's node
    paper_citations_sum = 0

    if "Papers" in details:
        for paper in details["Papers"]:
            paper_title = paper["Title"]
            paper_citations = paper.get("Citation_Count", 0)

            # Append a row for each paper
            rows.append({
                "labels": paper_title,
                "parents": professor_name,
                "values": paper_citations,
            })

            paper_citations_sum += paper_citations

    # Subtract paper citations from the professor's total to avoid double-counting
    remaining_citations = total_citations - paper_citations_sum

    # Check if the remaining citations are less than 100
    if remaining_citations < 100:
        others_citations += remaining_citations
    else:
        # Append the professor's node only if there are remaining citations
        rows.append({
            "labels": professor_name,
            "parents": "",
            "values": remaining_citations,
        })

# Add the "Others" category if there are citations
if others_citations > 0:
    rows.append({
        "labels": "Others",
        "parents": "",
        "values": others_citations,
    })

df = pd.DataFrame(rows)
df['values'].fillna(0, inplace=True)
df = df[df['values'] > 0]

fig = px.treemap(df, path=['parents', 'labels'], values='values',
                 color='values', hover_data=['labels'],
                 color_continuous_scale='RdBu',
                 title='Interactive Treemap of Citation Distribution')

fig.show()



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





### Geographical visualization of the professors origin

In [113]:
professors_country = [{'Name': key, 'Country': value['Country']} for key, value in professors_data.items()]
professor_df = pd.DataFrame(professors_country)
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
professor_df['Country'] = professor_df['Country'].replace({
    'Great Britain': 'United Kingdom',
    'USA': 'United States of America',
    'Korea': 'South Korea',
    'Bulgary': 'Bulgaria',
    'Holland': 'Netherlands'
})
merged = world.merge(professor_df, how="right", left_on="name", right_on="Country")
# Raggruppa per 'iso_a3' e conta i professori
professor_counts = merged.groupby('iso_a3').size().reset_index(name='Professor_Count')
# Unisci i conteggi al DataFrame originale
df_geo = merged.merge(professor_counts, on='iso_a3', how='left')


The geopandas.dataset module is deprecated and will be removed in GeoPandas 1.0. You can get the original 'naturalearth_lowres' data from https://www.naturalearthdata.com/downloads/110m-cultural-vectors/.



In [115]:
df_geo['latitude'] = None
df_geo['longitude'] = None 
df_geo.loc[df_geo['Country'] == "Albania", 'latitude'] = 41.153332
df_geo.loc[df_geo['Country'] == "Albania", 'longitude'] = 20.168331
df_geo.loc[df_geo['Country'] == "Greece", 'latitude'] = 39.074208
df_geo.loc[df_geo['Country'] == "Greece", 'longitude'] = 21.824312
df_geo.loc[df_geo['Country'] == "Italy", 'latitude'] = 41.871940
df_geo.loc[df_geo['Country'] == "Italy", 'longitude'] = 12.567380
df_geo.loc[df_geo['Country'] == "India", 'latitude'] = 20.595164
df_geo.loc[df_geo['Country'] == "India", 'longitude'] = 78.963060
df_geo.loc[df_geo['Country'] == "Israel", 'latitude'] = 31.046051
df_geo.loc[df_geo['Country'] == "Israel", 'longitude'] = 34.851612
df_geo.loc[df_geo['Country'] == "United States of America", 'latitude'] = 37.090240
df_geo.loc[df_geo['Country'] == "United States of America", 'longitude'] = -95.712891
df_geo.loc[df_geo['Country'] == "South Korea", 'latitude'] = 35.907757
df_geo.loc[df_geo['Country'] == "South Korea", 'longitude'] = 127.766922
df_geo.loc[df_geo['Country'] == "Germany", 'latitude'] = 51.165691
df_geo.loc[df_geo['Country'] == "Germany", 'longitude'] = 10.451526
df_geo.loc[df_geo['Country'] == "Turkey", 'latitude'] = 38.963745
df_geo.loc[df_geo['Country'] == "Turkey", 'longitude'] = 35.243322
df_geo.loc[df_geo['Country'] == "United Kingdom", 'latitude'] = 52.355518
df_geo.loc[df_geo['Country'] == "United Kingdom", 'longitude'] = -1.174320
df_geo.loc[df_geo['Country'] == "Bulgaria", 'latitude'] = 42.733883
df_geo.loc[df_geo['Country'] == "Bulgaria", 'longitude'] = 25.485830
df_geo.loc[df_geo['Country'] == "Netherlands", 'latitude'] = 52.132633
df_geo.loc[df_geo['Country'] == "Netherlands", 'longitude'] = 5.291266
df_geo.loc[df_geo['Country'] == "Chile", 'latitude'] = -35.675147
df_geo.loc[df_geo['Country'] == "Chile", 'longitude'] = -71.542969

In [117]:
# Crea il grafico iniziale
fig = go.FigureWidget(data=[go.Choropleth(
    locations=df_geo['iso_a3'],  # Usa il codice ISO A3 per i paesi
    z=df_geo['Professor_Count'],
    text=df_geo['Country'],
    colorscale='OrRd',
    autocolorscale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title='Number of Professors',
    zmin=0,  # Imposta il valore minimo della colorbar
    zmax=42,
    colorbar=dict(
        len=0.3,  # Imposta la lunghezza della colorbar a 50% dell'altezza del grafico
        lenmode='fraction'  # Lunghezza come frazione dell'altezza totale del grafico
    )
)])

fig.update_layout(
    width=1400,  # Larghezza del grafico in pixel
    height=1000,
    title_text='Global Distribution of Professors',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    )
)

# Dropdown per la selezione del paese, con opzione 'All'
country_dropdown = widgets.Dropdown(
    options=[('Select a Country', None), ('All', 'All')] + [(country, country) for country in df_geo['Country'].unique()],
    description='Country:',
)

# Widget di output per l'elenco dei professori
output_professors = widgets.Output()

def adjust_projection_scale(country):
    # Puoi definire una scala basata sull'area o altri parametri
    if country == 'India':
        return 4
    elif country == 'Chile':
        return 3
    elif country == 'United States of America':
        return 2   # Scala maggiore per grandi paesi
    elif country == 'Israel':
        return 15
    elif country == 'Albania':
        return 15
    elif country == 'Netherlands':
        return 10  # Scala maggiore per piccoli paesi
    else:
        return 5  # Scala di default

def on_country_change(change):
    with fig.batch_update():
        if change.new:
            if change.new == 'All':
                fig.data[0].locations = df_geo['iso_a3']
                fig.data[0].z = df_geo['Professor_Count']
                fig.data[0].text = df_geo['Country']
                fig.update_geos(center={"lat": 0, "lon": 0}, projection_scale=1)
            else:
                selected_country = df_geo[df_geo['Country'] == change.new]
                lat = selected_country['latitude'].values[0]
                lon = selected_country['longitude'].values[0]
                scale = adjust_projection_scale(change.new)
                fig.data[0].locations = selected_country['iso_a3']
                fig.data[0].z = selected_country['Professor_Count']
                fig.data[0].text = selected_country['Country']
                fig.update_geos(center={"lat": lat, "lon": lon}, projection_scale=scale)
    
    with output_professors:
        output_professors.clear_output()
        if change.new:
            if change.new == 'All':
                # Visualizza i dati di tutti i professori
                display(professor_df[['Name', 'Country']].style.hide(axis='index'))
            else:
                selected_professors = professor_df[professor_df['Country'] == change.new]
                display(selected_professors[['Name']].style.hide(axis='index'))

country_dropdown.observe(on_country_change, names='value')

# Disponi i widget orizzontalmente
display(widgets.HBox([widgets.VBox([country_dropdown, output_professors]), fig]))

HBox(children=(VBox(children=(Dropdown(description='Country:', options=(('Select a Country', None), ('All', 'A…

In [None]:
import mercury as mr

def custom_layout(widgets):
    # Custom layout logic goes here
    # Example: return a dictionary, grid, or any structure supported by Mercury
    layout = {
        "header": widgets['header'],
        "body": [widgets['input1'], widgets['output1']],
        "footer": widgets['footer']
    }
    return layout

app = mr.App(
    title="Data VIZ",
    description="Web project",
    layout=custom_layout,
    theme="dark",
    show_code=False,
    show_prompt=False,
    continuous_update=True,
    show_sidebar=True
)

app.run()