In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import json
from pathlib import Path

import joblib
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import matplotlib.transforms as mtransforms
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import tiktoken
from matplotlib.gridspec import GridSpec
from pySankey.sankey import sankey
from scipy import stats

from semanticscholar import SemanticScholar
from statannotations.Annotator import Annotator
from statsmodels.formula.api import glm, ols
from textwrap import wrap
from tqdm import tqdm
from run_analysis import cr_has_en, has_en_mapping, load_chatgpt_trends, p2mark
from utils import compare_correlations

# Apply the seaborn "ticks" theme globally for the figures drawn below.
sns.set_theme(style="ticks")
# tiktoken encoding object for the 'cl100k_base' vocabulary.
encoding = tiktoken.get_encoding('cl100k_base')
# Object whose predict_proba() is later applied to the Binoculars feature
# columns (presumably a fitted scikit-learn pipeline — TODO confirm).
# NOTE(review): joblib.load unpickles the file; only load it from a trusted source.
pipe = joblib.load('data/ai_index_pipe.pkl')

### Load data

In [None]:
# Mapping from raw category labels to broad research fields.
# Opened through a context manager so the file handle is closed deterministically.
with open('data/category_mapping.json') as mapping_file:
    category_mapping = json.load(mapping_file)

# Daily Google-Trends series for ChatGPT, indexed and sorted by date.
# Built as one chained expression (no in-place mutation) so re-running the
# cell always yields the same frame.
daily_trend_df = pd.read_csv('data/chatgpt_goog_trend_daily.csv')
daily_trend_df.columns = ['date', 'daily_trends']
daily_trend_df = (
    daily_trend_df
    .assign(date=pd.to_datetime(daily_trend_df['date']))
    .set_index('date')
    .sort_index()
)

# Second trends frame from the project helper; the plotting functions read
# its 'trends' column.
trend_df = load_chatgpt_trends()

# Output directory for figures; the printed flag shows whether it exists.
# NOTE(review): placeholder absolute path — point this at a real directory.
final_figdir = Path('/path/to/Downloads')
print(final_figdir.is_dir())

# Analysis window and key dates.
begin_date = '2022-01-01'
today = '2024-04-01'
gpt_release_date = pd.to_datetime('2022-11-30')
# Window length (in rows of the daily series) for the rolling statistics.
writing_days = 30

# NOTE(review): `dir` shadows the built-in of the same name; the name is kept
# because other cells may reference it.
dir = Path('data')
cls_dir = Path(dir / 'content_type')


def read_sdfs_raw():
    """Load every ``res*.pkl`` file under ``cls_dir`` and stack them row-wise.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all unpickled frames, in the order the
        glob yields them.
    """
    frames = [
        pd.read_pickle(pkl_path)
        for pkl_path in tqdm(cls_dir.glob('res*.pkl'))
    ]
    return pd.concat(frames, axis=0)

In [None]:
# Binoculars summary features passed to pipe.predict_proba() below.
feat_cols = [
    'bino_score_min',
    'bino_score_mean',
    'bino_score_var',
]

# Per-paper impact/journal information and per-paper metadata + scores.
# NOTE(review): read_pickle executes pickle loading; only use trusted files.
paperimpact_df = pd.read_pickle('data/df_articlejn_infos.pkl')
paperinfo_df = pd.read_hdf('data/allpaperinfo.hdf5')
# Drop papers whose country/region is the literal string 'None'.
paperinfo_df = paperinfo_df.loc[paperinfo_df['country_region'] != 'None'].copy(
)
# 1 for papers dated on/after the ChatGPT release, else 0.
paperinfo_df['afterChatGPT'] = (
    paperinfo_df['date'] >= gpt_release_date).astype(int)
# NOTE(review): category_mapping is re-read here (it was already loaded in the
# previous cell) and the file handle returned by open() is never closed.
category_mapping = json.load(open('data/category_mapping.json'))
# All distinct fields in the mapping, alphabetically, minus 'Interdisciplinary'.
fields_all = sorted(set(category_mapping.values()))
fields_all.remove('Interdisciplinary')
# Short display labels for each field (variable name is spelled "filed" throughout).
filed_names = {
    "Biological Sciences": "BioS",
    "Computer Science": "CompS",
    "Economics and Finance": "EcoF",
    "Engineering": "Eng",
    "Environmental Sciences": "EnvS",
    "Mathematical Sciences": "Math",
    "Medicine": "Med",
    "Neurosciences": "Neur",
    "Physical Sciences": "PhyS",
}


# Map every comma-separated category through category_mapping and keep the
# field of the first one. Because all categories are mapped before indexing,
# an unmapped category anywhere in the list raises KeyError.
paperinfo_df['fields'] = paperinfo_df['category'].apply(
    lambda x: [category_mapping[c] for c in x.split(',')][0])
# Keep only papers whose field is in fields_all (this removes 'Interdisciplinary').
paperinfo_df = paperinfo_df.loc[paperinfo_df['fields'].isin(fields_all)]


# Join paper metadata (keyed by its index) with impact data (keyed by '_id').
# NOTE(review): with left_index=True / right_on the row labels of the result
# are decided by pandas' merge rules; published_paperinfo_df below compares
# paperinfo_df.index with this frame's index, so confirm the merged index
# still carries the paper ids.
df_impact = paperinfo_df.merge(
    paperimpact_df,
    right_on='_id',
    left_index=True
)
# Second column of predict_proba (presumably the "written with GPT" class
# probability — TODO confirm the class order of `pipe`).
df_impact['bygpt_pred_prob'] = pipe.predict_proba(
    df_impact[feat_cols].values)[:, 1]

# Whole days between each paper's date and the `today` cut-off.
df_impact['date_passed'] = (pd.to_datetime(
    today) - df_impact['date']).dt.days


# Citations are used as the influence measure.
df_impact['influence_measure'] = df_impact['citations']
print('number of papers data (initial):', paperinfo_df.shape)
print('number of papers impact data (initial):', df_impact.shape)

# Trim fraction: 0.01 % (1e-4) per tail.
ratio2drop = 0.01/100
# Keep rows strictly inside the [q, 1-q] quantile range of bino_score_mean.
df_impact = df_impact.loc[
    (df_impact['bino_score_mean'] > df_impact['bino_score_mean'].quantile(ratio2drop)) &
    (df_impact['bino_score_mean'] <
     df_impact['bino_score_mean'].quantile(1-ratio2drop))
].copy()

# Drop the extreme upper tail of the influence measure (strictly below the 1-q quantile).
df_impact = df_impact.loc[
    df_impact['influence_measure'] < df_impact['influence_measure'].quantile(
        1-ratio2drop)
].copy()

print('number of papers impact data (final):', df_impact.shape)

# "Published" subset: rows with a non-empty journal string.
df_impact_published = df_impact[
    df_impact['journal'].str.len() > 0].copy()

# Journal-name fragments to exclude; joined into one regex alternation.
# NOTE(review): the '4' entry matches any journal name containing the
# character "4" — confirm this is intended.
jn2rm = ('bioRxiv', 'ArXiv', 'Unknown', 'medRxiv', '4')
pattern = '|'.join(jn2rm)

df_impact_published = df_impact_published.loc[
    ~df_impact_published['journal'].str.contains(pattern, na=False)
].copy()
# Side effect: writes the published subset to an Excel file.
df_impact_published.to_excel('data/df_impact_published.xlsx')

print('number of *publised* papers impact data (final):', df_impact_published.shape)

print('There are',
      len(df_impact_published['journal'].unique()),
      'journals'
      )

# Metadata rows whose index label also appears in df_impact_published.index.
published_paperinfo_df = paperinfo_df.loc[
    paperinfo_df.index.isin(df_impact_published.index)
].copy()

### Binoculars scores before and after the release of ChatGPT

In [None]:

def plot_fig1(
    df_paperinfo,
    gpt_release_date,
    daily_trend_df,
    weekl_trend_df,

    figname='fig1',
    final_figdir=final_figdir,
):
    """Plot Binoculars scores against ChatGPT search interest (3 x 3 grid).

    One row per daily-averaged score (variance, mean, min):

    * column 0 - forward-looking rolling mean with a +/- standard-error band,
      the release date as a red line and the ``trends`` series on a twin axis;
    * column 1 - before/after-release box plot, titled with the
      Mann-Whitney U significance marker;
    * column 2 - regression of the post-release daily score on
      ``daily_trends``, titled with the Pearson significance marker.

    Parameters
    ----------
    df_paperinfo : pandas.DataFrame
        One row per paper with a ``date`` column and the columns
        ``bino_score_min``, ``bino_score_mean`` and ``bino_score_var``.
    gpt_release_date : timestamp-like
        Dates on or after this value are labelled "After".
    daily_trend_df : pandas.DataFrame
        Date-indexed frame providing the ``daily_trends`` column.
    weekl_trend_df : pandas.DataFrame
        Date-indexed frame providing the ``trends`` column.
    figname : str
        File stem of the PNG written into ``final_figdir``.
    final_figdir : pathlib.Path
        Output directory.

    Returns
    -------
    matplotlib.figure.Figure
    """
    score_cols = ['bino_score_min', 'bino_score_mean', 'bino_score_var']

    # Daily mean of every score across the papers released on that date.
    score_df_bypaper_o = df_paperinfo.groupby('date').agg(
        {col: 'mean' for col in score_cols}
    ).reset_index().dropna().set_index('date')
    score_df_bypaper_o['afterChatGPT'] = (
        score_df_bypaper_o.index >= gpt_release_date).astype(int)

    # Forward-looking window: the value stored at row t summarises rows
    # t .. t + writing_days - 1 (shift back, then trailing rolling window).
    writing_days = 30
    for col in score_cols:
        window = score_df_bypaper_o[col].shift(
            -(writing_days - 1)).rolling(writing_days)
        score_df_bypaper_o[f'{col}_r_mean'] = window.mean()
        score_df_bypaper_o[f'{col}_r_std'] = window.std()

    # Align with the daily trends on shared dates.
    score_df_bypaper_daily = pd.concat(
        [score_df_bypaper_o, daily_trend_df],
        axis=1, join='inner',
    ).dropna()

    score_df_bypaper_o.dropna(inplace=True)

    # Align with the second trends frame (provides 'trends') on shared dates.
    score_df_bypaper_week = pd.concat(
        [score_df_bypaper_o, weekl_trend_df],
        axis=1, join='inner',
    ).dropna()

    is_after = score_df_bypaper_daily['afterChatGPT'] == 1
    score_df_bypaper_daily_after = score_df_bypaper_daily.loc[is_after]
    for col in score_df_bypaper_daily.columns.tolist():
        if col != 'daily_trends' and 'r_mean' in col:
            res = stats.pearsonr(
                score_df_bypaper_daily_after['daily_trends'].values,
                score_df_bypaper_daily_after[col].values
            )
            print(f'trends-{col}: {res}')

    # Box-plot grouping label. Added after the "after" slice was taken, so the
    # slice keeps exactly the columns it had before.
    tfm = {
        0: 'Before',
        1: 'After',
    }
    score_df_bypaper_daily['aftergpt'] = score_df_bypaper_daily[
        'afterChatGPT'].map(tfm)

    # Close any figure left over from an earlier cell. (No plt.clf() here:
    # after close() it would implicitly create an extra, empty figure.)
    plt.close()
    ylabels = [
        'bino_score_var',
        'bino_score_mean',
        'bino_score_min']

    colors = plt.rcParams['axes.prop_cycle'].by_key()['color'][:len(ylabels)]

    fig, axes = plt.subplots(len(ylabels), 3, figsize=[9, 5], sharex='col',
                             gridspec_kw={'width_ratios': [4.5, 1.2, 1.2]})

    df = score_df_bypaper_week
    c_trend = 'gray'

    for axidx, ylabel in enumerate(ylabels):
        ax = axes[axidx, 0]
        ax2 = ax.twinx()
        line_color = colors[axidx]

        # Rolling mean with a +/- standard-error band.
        ax.plot(
            df.index,
            df[f'{ylabel}_r_mean'].values,
            lw=2, color=line_color, label=ylabel,
        )
        ste = df[f'{ylabel}_r_std'] / np.sqrt(writing_days)
        ax.fill_between(df.index,
                        df[f'{ylabel}_r_mean'] - ste,
                        df[f'{ylabel}_r_mean'] + ste,
                        color=mcolors.to_rgba(line_color, alpha=0.1),
                        )
        ax.axvline(x=gpt_release_date, color='red')

        # Search-interest curve on the twin axis.
        ax2.plot(
            df.index,
            df['trends'].values,
            c=c_trend,
            lw=2,
            label='ChatGPT\n trends',
            zorder=0,
        )
        if axidx == 0:
            ax2.legend(frameon=False, loc='upper left')
        ax2.spines["right"].set_edgecolor(c_trend)
        ax2.tick_params(axis='y', colors=c_trend)

        ax.set_xlim(df.index[0], df.index[-1])

        # 'bino_score_mean' -> 'mean', etc. The mathtext part is a raw string
        # because '\i' is not a valid Python escape sequence.
        s_name = ylabel.rsplit('_', 1)[-1]
        ax.set_ylabel(s_name + ' of \n ' + r'$\it{Binoculars}$',
                      c=line_color)
        if axidx == 0:
            ax.annotate('The release of ChatGPT', color='red', xy=(
                gpt_release_date, 1.1),
                xycoords=('data', 'axes fraction'), ha='left', va='center')

        # Post-release correlation between daily trends and the raw daily score.
        corr = stats.pearsonr(
            score_df_bypaper_daily_after['daily_trends'].values,
            score_df_bypaper_daily_after[ylabel].values,
        )

        # Before/after box plot with a Mann-Whitney U test.
        ax_box = axes[axidx, 1]
        sns.boxplot(data=score_df_bypaper_daily,
                    x='aftergpt',
                    y=ylabel,
                    ax=ax_box,
                    dodge=False,
                    showfliers=False,
                    color=line_color,
                    )
        _, p = stats.mannwhitneyu(
            score_df_bypaper_daily.loc[~is_after, ylabel],
            score_df_bypaper_daily.loc[is_after, ylabel],
        )
        ax_box.set_title(f'{p2mark(p)}')

        # Scatter + regression on post-release days, clipped to the
        # 0.1 % - 99.9 % quantile range of the score.
        ax_sc = axes[axidx, 2]
        after_scores = score_df_bypaper_daily_after[ylabel]
        lower_bound = after_scores.quantile(0.001)
        upper_bound = after_scores.quantile(0.999)
        df_filtered = score_df_bypaper_daily_after[
            (after_scores >= lower_bound) & (after_scores <= upper_bound)]

        sns.regplot(data=df_filtered,
                    x='daily_trends', y=ylabel, ax=ax_sc,
                    line_kws=dict(color=line_color,),
                    scatter_kws={'alpha': 0.1, 's': 10, 'color': line_color}
                    )

        ax_sc.set_title(f'{p2mark(corr.pvalue)}')
        ax_sc.set_xlabel('')
        ax_box.set_xlabel('')
        ax_sc.set_ylabel('')
        ax_box.set_ylabel('')

        if axidx == 2:
            ax_sc.set_xlabel('Daily trends\n(After release)')
            ax_box.set_xlabel('ChatGPT\nrelease')
        ax.tick_params(axis='x', labelrotation=90)

    fig.tight_layout()
    fig.savefig(final_figdir / f'{figname}.png', dpi=400,
                bbox_inches='tight', transparent=True)
    return fig


def plot_fig14all(
    df_paperinfo,
    gpt_release_date,
    daily_trend_df,
    weekl_trend_df,
    figname='fig1',
    final_figdir=final_figdir,
):
    """Draw the composite overview figure (panels a-f) and save it as PNG and PDF.

    Layout (five grid rows):

    * a, b - the images ``workflow.png`` and ``binocular_indicator.png`` read
      from ``final_figdir``;
    * c, d, e - pie charts of ``platform``, ``fields`` and ``country_region``
      (the most frequent regions, the rest pooled as 'Others');
    * f - three rows (variance, mean, min of the Binoculars score), each with
      a rolling-mean time series, a before/after box plot and a
      post-release regression on ``daily_trends``.

    All panels are computed from ``df_paperinfo``. The frame is not modified:
    the 'Others' pooling for panel e is done on a derived Series, so calling
    the function repeatedly gives the same figure and leaves the caller's
    ``country_region`` column untouched.

    Parameters
    ----------
    df_paperinfo : pandas.DataFrame
        One row per paper, with ``date``, ``bino_score_min``,
        ``bino_score_mean``, ``bino_score_var``, ``platform``, ``fields`` and
        ``country_region`` columns. ``reset_index()`` must yield an ``_id``
        column (it is used for counting papers per region).
    gpt_release_date : timestamp-like
        Dates on or after this value are labelled "After".
    daily_trend_df : pandas.DataFrame
        Date-indexed frame providing the ``daily_trends`` column.
    weekl_trend_df : pandas.DataFrame
        Date-indexed frame providing the ``trends`` column.
    figname : str
        File stem for the ``.png`` and ``.pdf`` outputs.
    final_figdir : pathlib.Path
        Directory holding the two input images and receiving the outputs.

    Returns
    -------
    matplotlib.figure.Figure
    """
    score_cols = ['bino_score_min', 'bino_score_mean', 'bino_score_var']

    # Daily mean of every score across the papers released on that date.
    score_df_bypaper_o = df_paperinfo.groupby('date').agg(
        {col: 'mean' for col in score_cols}
    ).reset_index().dropna().set_index('date')
    score_df_bypaper_o['afterChatGPT'] = (
        score_df_bypaper_o.index >= gpt_release_date).astype(int)

    # Forward-looking window: the value stored at row t summarises rows
    # t .. t + writing_days - 1 (shift back, then trailing rolling window).
    writing_days = 30
    for col in score_cols:
        window = score_df_bypaper_o[col].shift(
            -(writing_days - 1)).rolling(writing_days)
        score_df_bypaper_o[f'{col}_r_mean'] = window.mean()
        score_df_bypaper_o[f'{col}_r_std'] = window.std()

    score_df_bypaper_daily = pd.concat(
        [score_df_bypaper_o, daily_trend_df],
        axis=1, join='inner',
    ).dropna()

    score_df_bypaper_o.dropna(inplace=True)

    score_df_bypaper_week = pd.concat(
        [score_df_bypaper_o, weekl_trend_df],
        axis=1, join='inner',
    ).dropna()

    is_after = score_df_bypaper_daily['afterChatGPT'] == 1
    score_df_bypaper_daily_after = score_df_bypaper_daily.loc[is_after]
    for col in score_df_bypaper_daily.columns.tolist():
        if col != 'daily_trends' and 'r_mean' in col:
            res = stats.pearsonr(
                score_df_bypaper_daily_after['daily_trends'].values,
                score_df_bypaper_daily_after[col].values
            )
            print(f'trends-{col}: {res}')

    # Box-plot grouping label. Added after the "after" slice was taken, so the
    # slice keeps exactly the columns it had before.
    tfm = {
        0: 'Before',
        1: 'After',
    }
    score_df_bypaper_daily['aftergpt'] = score_df_bypaper_daily[
        'afterChatGPT'].map(tfm)

    # Close any figure left over from an earlier cell. (No plt.clf() here:
    # after close() it would implicitly create an extra, empty figure.)
    plt.close()
    label_cfg = dict(fontsize='large', va='bottom',  fontweight='bold')
    ylabels = [
        'bino_score_var',
        'bino_score_mean',
        'bino_score_min']

    colors = plt.rcParams['axes.prop_cycle'].by_key()['color'][:len(ylabels)]
    fig = plt.figure(figsize=(9, 5*2.2))
    # Offset (in points) that places the panel letters above-left of each axes.
    trans = mtransforms.ScaledTranslation(-20/72, 7/72, fig.dpi_scale_trans)

    # Three grids over the same five rows, differing only in column layout.
    height_ratios = [1.5, 1.5, 1, 1, 1]
    gs1 = GridSpec(5, 2, width_ratios=[
                   3, 1], height_ratios=height_ratios, wspace=.5, hspace=0.25)
    gs2 = GridSpec(5, 3, width_ratios=[
                   1, 1, 1], height_ratios=height_ratios, wspace=.5, hspace=0.25)
    gs3 = GridSpec(5, 3, width_ratios=[
                   3, 1, 1], height_ratios=height_ratios, wspace=.75, hspace=0.25)

    # ---- panels a, b: static images -------------------------------------
    ax = fig.add_subplot(gs1[0, 0])
    ax_indicator = fig.add_subplot(gs1[0, 1])

    ax.text(0.0, 1.0, 'a', transform=ax.transAxes + trans, **label_cfg)
    ax.imshow(plt.imread(final_figdir/'workflow.png'))
    ax.axis('off')

    ax_indicator.imshow(plt.imread(final_figdir/'binocular_indicator.png'))
    ax_indicator.axis('off')
    ax_indicator.text(0.0, 1.0, 'b',
                      transform=ax_indicator.transAxes + trans, **label_cfg)

    # ---- panels c, d, e: dataset composition pies -------------------------
    axes4dsdescribe = [fig.add_subplot(gs2[1, col]) for col in range(3)]
    bbox_to_anchor = (0.85, 0, 0.15, 1)

    def _draw_pie(pie_ax, counts, **pie_kws):
        """Draw a percentage-labelled pie and return its wedges."""
        wedges, _texts, _autotexts = pie_ax.pie(
            counts,
            autopct='%1.1f%%',
            textprops={'color': "w"},
            **pie_kws,
        )
        return wedges

    def _finish_pie(pie_ax, wedges, counts, title, panel_label, offset):
        """Shift the pie left by `offset`, add its legend and panel letter."""
        pos = pie_ax.get_position()
        pie_ax.set_position([pos.x0 - offset, pos.y0, pos.width, pos.height])
        pie_ax.legend(wedges, [f'{label}' for label in counts.index],
                      title=title,
                      loc="center left",
                      bbox_to_anchor=bbox_to_anchor,
                      frameon=False,
                      fontsize='small', labelspacing=0.1)
        pie_ax.text(0.0, 1.0, panel_label,
                    transform=pie_ax.transAxes + trans, **label_cfg)

    # c: platforms. The horizontal shift applied to all three pies is derived
    # from this first axes' position.
    ax = axes4dsdescribe[0]
    pie_platforms = df_paperinfo['platform'].value_counts()
    wedges = _draw_pie(ax, pie_platforms)
    pos_origin = ax.get_position()
    offset = pos_origin.x0 / 1.5
    print('original_pos', pos_origin)
    _finish_pie(ax, wedges, pie_platforms, "Platforms", 'c', offset)

    # d: research domains, shown with their short labels.
    # NOTE(review): `colors` only holds len(ylabels) == 3 entries, so wedge
    # colours repeat across domains — confirm this is intended.
    ax = axes4dsdescribe[1]
    pie_fields = df_paperinfo['fields'].value_counts()
    pie_fields.index = [filed_names[idx] for idx in pie_fields.index]
    wedges = _draw_pie(ax, pie_fields, colors=colors)
    _finish_pie(ax, wedges, pie_fields, "Domains", 'd', offset)
    ax.set_ylabel('')

    # e: most frequent countries/regions, everything else pooled as 'Others'.
    ax = axes4dsdescribe[2]
    topk_countries = 8
    country_regions = df_paperinfo.reset_index().groupby('country_region').agg({
        '_id': 'size',
    }).reset_index().sort_values('_id', ascending=False).iloc[:topk_countries].country_region.values.tolist()
    if 'Others' in country_regions:
        country_regions.remove('Others')

    # Pool on a derived Series so the caller's frame is left unchanged.
    region_col = df_paperinfo['country_region']
    pooled_regions = region_col.where(
        region_col.isin(country_regions), 'Others')
    pie_country_regions = pooled_regions.value_counts()
    wedges = _draw_pie(ax, pie_country_regions)
    _finish_pie(ax, wedges, pie_country_regions,
                "Countries/\nRegions", 'e', offset)
    ax.set_ylabel('')

    # ---- panel f: 3 x 3 score grid ---------------------------------------
    axes = np.array([
        [fig.add_subplot(gs3[row, col]) for col in range(3)]
        for row in (2, 3, 4)
    ])
    df = score_df_bypaper_week
    c_trend = 'gray'

    ax = axes[0, 0]
    ax.text(0.0, 1.0, 'f', transform=ax.transAxes + trans, **label_cfg)

    for axidx, ylabel in enumerate(ylabels):
        ax = axes[axidx, 0]
        ax2 = ax.twinx()
        line_color = colors[axidx]

        # Rolling mean with a +/- standard-error band.
        ax.plot(
            df.index,
            df[f'{ylabel}_r_mean'].values,
            lw=2, color=line_color, label=ylabel,
        )
        ste = df[f'{ylabel}_r_std'] / np.sqrt(writing_days)
        ax.fill_between(df.index,
                        df[f'{ylabel}_r_mean'] - ste,
                        df[f'{ylabel}_r_mean'] + ste,
                        color=mcolors.to_rgba(line_color, alpha=0.1),
                        )
        ax.axvline(x=gpt_release_date, color='red')

        # Search-interest curve on the twin axis.
        ax2.plot(
            df.index,
            df['trends'].values,
            c=c_trend,
            lw=2,
            label='ChatGPT\n trends',
            zorder=0,
        )
        if axidx == 0:
            ax2.legend(frameon=False, loc='upper left', fontsize='small')
        ax2.spines["right"].set_edgecolor(c_trend)
        ax2.tick_params(axis='y', colors=c_trend)

        ax.set_xlim(df.index[0], df.index[-1])

        # 'bino_score_mean' -> 'mean', etc. The mathtext part is a raw string
        # because '\i' is not a valid Python escape sequence.
        s_name = ylabel.rsplit('_', 1)[-1]
        ax.set_ylabel(s_name + ' of \n ' + r'$\it{Binoculars}$',
                      c=line_color)
        if axidx == 0:
            ax.annotate('The release of ChatGPT', color='red', xy=(
                gpt_release_date, 1.1),
                xycoords=('data', 'axes fraction'), ha='left', va='center')

        # Post-release correlation between daily trends and the raw daily score.
        corr = stats.pearsonr(
            score_df_bypaper_daily_after['daily_trends'].values,
            score_df_bypaper_daily_after[ylabel].values,
        )

        # Before/after box plot with a Mann-Whitney U test.
        ax_box = axes[axidx, 1]
        sns.boxplot(data=score_df_bypaper_daily,
                    x='aftergpt',
                    y=ylabel,
                    ax=ax_box,
                    dodge=False,
                    showfliers=False,
                    color=line_color,
                    )
        _, p = stats.mannwhitneyu(
            score_df_bypaper_daily.loc[~is_after, ylabel],
            score_df_bypaper_daily.loc[is_after, ylabel],
        )
        ax_box.set_title(f'{p2mark(p)}')

        # Scatter + regression on post-release days, clipped to the
        # 0.1 % - 99.9 % quantile range of the score.
        ax_sc = axes[axidx, 2]
        after_scores = score_df_bypaper_daily_after[ylabel]
        lower_bound = after_scores.quantile(0.001)
        upper_bound = after_scores.quantile(0.999)
        df_filtered = score_df_bypaper_daily_after[
            (after_scores >= lower_bound) & (after_scores <= upper_bound)]

        sns.regplot(data=df_filtered,
                    x='daily_trends', y=ylabel, ax=ax_sc,
                    line_kws=dict(color=line_color,),
                    scatter_kws={'alpha': 0.1, 's': 10, 'color': line_color}
                    )

        ax_sc.set_title(f'{p2mark(corr.pvalue)}')
        ax_sc.set_xlabel('')
        ax_box.set_xlabel('')
        ax_sc.set_ylabel('')
        ax_box.set_ylabel('')

        if axidx == 2:
            ax_sc.set_xlabel('Daily trends\n(After release)')
            ax_box.set_xlabel('ChatGPT\nrelease')
        else:
            # Only the bottom row keeps its x tick labels.
            ax.set_xticklabels([])
            ax_box.set_xticklabels([])
            ax_sc.set_xticklabels([])

        ax.tick_params(axis='x', labelrotation=90)

    fig.tight_layout()
    fig.savefig(final_figdir / f'{figname}.png', dpi=400,
                bbox_inches='tight', transparent=True)
    fig.savefig(final_figdir / f'{figname}.pdf', dpi=400,
                bbox_inches='tight')
    return fig


# Overview figure built from all preprints.
fig = plot_fig14all(
    df_paperinfo=paperinfo_df,
    gpt_release_date=gpt_release_date,
    daily_trend_df=daily_trend_df,
    weekl_trend_df=trend_df,
    figname='fig1_allpreprints',
)
plt.show()

In [None]:
# (rows, columns) of the published-papers subset.
published_paperinfo_df.shape

In [None]:
# Same score/trend analysis, restricted to the published-papers subset.
fig = plot_fig1(
    df_paperinfo=published_paperinfo_df,
    gpt_release_date=gpt_release_date,
    daily_trend_df=daily_trend_df,
    weekl_trend_df=trend_df,
    figname='fig1_published',
)
plt.show()

In [None]:
def breakax_ax2(ax, ax2):
    """Style two stacked axes as one plot with a broken y-axis.

    ``ax`` is the upper panel and ``ax2`` the lower one. The facing spines
    are hidden, both existing legends are retitled and restyled, axis labels
    are cleared and short diagonal marks are drawn at the break.

    Parameters
    ----------
    ax, ax2 : matplotlib.axes.Axes
        Upper and lower axes; each must already carry a legend.
    """
    # Hide the spines that face each other across the break.
    ax.spines['bottom'].set_visible(False)
    ax2.spines['top'].set_visible(False)

    # Same legend styling for both panels; only the title differs.
    fontsize = '8'
    legend_titles = ((ax, 'Mean of Binoculars'), (ax2, 'Min of Binoculars'))
    for panel, legend_title in legend_titles:
        legend = panel.get_legend()
        legend.set_title(legend_title)
        legend.set_bbox_to_anchor((1, 1))
        legend.set_frame_on(False)
        legend.get_title().set_fontsize(fontsize)
        plt.setp(legend.get_texts(), fontsize=fontsize)

    ax.set_xlabel('')
    ax.set_ylabel('')
    ax2.set_ylabel('')

    # Ticks on the outer edges only; the upper panel shows no tick labels on top.
    ax.xaxis.tick_top()
    ax.tick_params(labeltop=False)
    ax2.xaxis.tick_bottom()

    # Diagonal break marks: bottom corners of `ax`, top corners of `ax2`,
    # drawn in axes coordinates and allowed to extend past the axes box.
    d = .015
    for panel, y_edge in ((ax, 0), (ax2, 1)):
        for x_edge in (0, 1):
            panel.plot(
                (x_edge - d, x_edge + d),
                (y_edge - d, y_edge + d),
                transform=panel.transAxes, color='k', clip_on=False,
            )


def _rolling_daily_scores(papers, gpt_release_date, writing_days):
    """Return daily mean Binoculars scores with forward-looking rolling mean/std columns."""
    score_cols = ['bino_score_min', 'bino_score_mean', 'bino_score_var']
    daily = papers.groupby('date').agg(
        {col: 'mean' for col in score_cols}
    ).reset_index().dropna().set_index('date')
    daily['afterChatGPT'] = (daily.index >= gpt_release_date).astype(int)
    for col in score_cols:
        # The value stored at row t summarises rows t .. t + writing_days - 1.
        window = daily[col].shift(-(writing_days - 1)).rolling(writing_days)
        daily[f'{col}_r_mean'] = window.mean()
        daily[f'{col}_r_std'] = window.std()
    # Rows without a complete window are dropped.
    return daily.dropna()


def _annotated_boxplot(ax, data, x, y, pairs, annotator_cfg, **boxplot_kws):
    """Draw a before/after box plot on `ax` and annotate `pairs` with test results."""
    sns.boxplot(data=data, x=x, y=y, hue='aftergpt', ax=ax,
                showfliers=False, **boxplot_kws)
    annotator = Annotator(ax, pairs, data=data, x=x, y=y, hue='aftergpt')
    annotator.configure(**annotator_cfg)
    annotator.apply_and_annotate()


def plot_fig_cr_lang(
    paperinfo_df,
    daily_trend_df,
    gpt_release_date,
    fields_all,
    filed_names,
    writing_days,
    figname='fig2_cr_lang',
):
    """Compare Binoculars scores before/after ChatGPT by domain, region and language.

    Panels:

    * a - rolling mean of the daily mean score (top) and min score (bottom)
      per research domain, drawn as a broken-axis pair;
    * b - the same for the most frequent countries/regions plus 'Others';
    * c - daily min (left) and mean (right) scores grouped by the
      ``cr_has_en`` classification of the country/region.

    Every before/after pair is annotated with a Mann-Whitney test. The figure
    is written to the module-level ``final_figdir`` as PNG and PDF.

    Side effects
    ------------
    ``paperinfo_df`` is modified in place, exactly as before this refactor:
    regions outside the top ``topk_countries`` are overwritten with 'Others'
    in ``country_region`` and a ``has_en`` column is added (computed from the
    already-pooled ``country_region``).
    NOTE(review): kept because other cells may rely on these columns; pass a
    copy if the caller's frame must stay intact.

    Parameters
    ----------
    paperinfo_df : pandas.DataFrame
        One row per paper with ``date``, ``fields``, ``country_region``,
        ``afterChatGPT`` and the ``bino_score_*`` columns; ``reset_index()``
        must yield an ``_id`` column.
    daily_trend_df : pandas.DataFrame
        Date-indexed daily trends, inner-joined in panel c.
    gpt_release_date : timestamp-like
        Dates on or after this value count as "After ChatGPT".
    fields_all : list of str
        Domain names to plot in panel a.
    filed_names : dict
        Maps each domain name to its short display label.
    writing_days : int
        Window length of the rolling statistics.
    figname : str
        File stem for the outputs.

    Returns
    -------
    matplotlib.figure.Figure
    """
    annotator_cfg = dict(test='Mann-Whitney', text_format='star', loc='inside')
    # Defined once up front: it is needed both inside the loops and for the
    # annotation pairs built after them.
    tfm = {
        0: 'Before ChatGPT',
        1: 'After ChatGPT',
    }

    # ---- per-domain daily series ----------------------------------------
    score_df_bypaper_daily_cats = []
    for cr in fields_all:
        paperinfo_df_field = paperinfo_df.loc[paperinfo_df['fields'].apply(
            lambda x: cr in x)]
        print(cr, paperinfo_df_field.shape)

        daily_cat = _rolling_daily_scores(
            paperinfo_df_field, gpt_release_date, writing_days)
        daily_cat['aftergpt'] = daily_cat['afterChatGPT'].map(tfm)
        daily_cat['domain'] = filed_names[cr]
        score_df_bypaper_daily_cats.append(daily_cat)
    score_df_bypaper_daily_cats = pd.concat(
        score_df_bypaper_daily_cats, axis=0)

    # Close any figure left over from an earlier cell. (No plt.clf() here:
    # after close() it would implicitly create an extra, empty figure.)
    plt.close()
    label_cfg = dict(fontsize='large', va='bottom',  fontweight='bold')
    fig = plt.figure(figsize=(8, 5*2))
    trans = mtransforms.ScaledTranslation(-20/72, 7/72, fig.dpi_scale_trans)
    # Rows 2 and 5 are spacer rows between the panel groups.
    height_ratios = [1, 1, 0.5, 1, 1, 0.5, 2]
    hspace = 0.15
    wspace = .3
    gs1 = GridSpec(len(height_ratios), 1,
                   height_ratios=height_ratios,
                   wspace=wspace,
                   hspace=hspace
                   )
    width_ratios = [1, 1, 1]
    gs2 = GridSpec(len(height_ratios), len(width_ratios),
                   width_ratios=width_ratios,
                   height_ratios=height_ratios, wspace=wspace,
                   hspace=hspace
                   )

    # ---- panel a: domains -------------------------------------------------
    domain_pairs = [
        ((domain, tfm[0]), (domain, tfm[1]))
        for domain in score_df_bypaper_daily_cats['domain'].unique()
    ]
    ax = fig.add_subplot(gs1[0])
    ax.text(0.0, 1.0, 'a', transform=ax.transAxes + trans, **label_cfg)
    _annotated_boxplot(ax, score_df_bypaper_daily_cats, 'domain',
                       'bino_score_mean_r_mean', domain_pairs, annotator_cfg,
                       palette="Set3")

    ax2 = fig.add_subplot(gs1[1])
    _annotated_boxplot(ax2, score_df_bypaper_daily_cats, 'domain',
                       'bino_score_min_r_mean', domain_pairs, annotator_cfg)

    breakax_ax2(ax, ax2)
    ax2.set_xlabel('Domains')

    # ---- per-region daily series -----------------------------------------
    topk_countries = 9
    country_regions = paperinfo_df.reset_index().groupby('country_region').agg({
        '_id': 'size',
    }).reset_index().sort_values('_id', ascending=False).iloc[:topk_countries].country_region.values.tolist()
    if 'Others' in country_regions:
        country_regions.remove('Others')

    unique_crs = country_regions + ['Others']
    # In-place pooling of the remaining regions (documented side effect).
    paperinfo_df.loc[~paperinfo_df['country_region'].isin(
        country_regions), 'country_region'] = 'Others'

    score_df_bypaper_daily_crs = []
    for cr in unique_crs:
        paperinfo_df_cr = paperinfo_df.loc[paperinfo_df['country_region'] == cr]
        print(cr, paperinfo_df_cr.shape)

        daily_cr = _rolling_daily_scores(
            paperinfo_df_cr, gpt_release_date, writing_days)
        daily_cr['cr'] = cr
        daily_cr['aftergpt'] = daily_cr['afterChatGPT'].map(tfm)
        score_df_bypaper_daily_crs.append(daily_cr)
    score_df_bypaper_daily_crs = pd.concat(
        score_df_bypaper_daily_crs, axis=0
    )

    # ---- panel b: countries/regions --------------------------------------
    cr_pairs = [
        ((cr, tfm[0]), (cr, tfm[1]))
        for cr in score_df_bypaper_daily_crs['cr'].unique()
    ]
    ax = fig.add_subplot(gs1[3])
    ax.text(0.0, 1.0, 'b', transform=ax.transAxes + trans, **label_cfg)
    _annotated_boxplot(ax, score_df_bypaper_daily_crs, 'cr',
                       'bino_score_mean_r_mean', cr_pairs, annotator_cfg,
                       palette="Set3")

    ax2 = fig.add_subplot(gs1[4])
    _annotated_boxplot(ax2, score_df_bypaper_daily_crs, 'cr',
                       'bino_score_min_r_mean', cr_pairs, annotator_cfg)

    breakax_ax2(ax, ax2)
    ax2.set_xlabel('Country/regions')
    # Panel a already shows the legends.
    ax.legend_.remove()
    ax2.legend_.remove()

    # ---- per-language-group daily series ---------------------------------
    # NOTE(review): has_en is derived from country_region *after* the
    # 'Others' pooling above — confirm cr_has_en handles 'Others' as intended.
    paperinfo_df['has_en'] = paperinfo_df['country_region'].apply(
        cr_has_en)
    score_df_bypaper_lang = []
    for has_en in sorted(paperinfo_df['has_en'].unique()):
        print(has_en)
        df_tmp = paperinfo_df.loc[paperinfo_df['has_en'] == has_en]
        score_df_bypaper_day = df_tmp.groupby('date').agg(
            {
                'bino_score_min': 'mean',
                'bino_score_mean': 'mean',
                'afterChatGPT': 'min'
            }
        ).sort_index()

        # Keep only the dates that also have a trends value.
        score_df_bypaper_day = pd.concat(
            [score_df_bypaper_day, daily_trend_df],
            axis=1, join='inner',
        ).dropna()
        score_df_bypaper_day['has_en'] = has_en_mapping[has_en]
        score_df_bypaper_lang.append(score_df_bypaper_day)
    score_df_bypaper_lang_df = pd.concat(score_df_bypaper_lang, axis=0)
    score_df_bypaper_lang_df['aftergpt'] = score_df_bypaper_lang_df[
        'afterChatGPT'].map(tfm)

    # ---- panel c: language groups ----------------------------------------
    # Within-group before/after comparisons plus between-group comparisons
    # for each period.
    lang_pairs = [
        (
            (has_en_mapping[0], "After ChatGPT"),
            (has_en_mapping[0], "Before ChatGPT"),
        ),
        (
            (has_en_mapping[1], "After ChatGPT"),
            (has_en_mapping[1], "Before ChatGPT"),
        ),
        (
            (has_en_mapping[0], "After ChatGPT"),
            (has_en_mapping[1], "After ChatGPT"),
        ),
        (
            (has_en_mapping[0], "Before ChatGPT"),
            (has_en_mapping[1], "Before ChatGPT"),
        ),
    ]

    ax = fig.add_subplot(gs2[6, 0])
    ax.text(0.0, 1.0, 'c', transform=ax.transAxes + trans, **label_cfg)
    _annotated_boxplot(ax, score_df_bypaper_lang_df, 'has_en',
                       'bino_score_min', lang_pairs, annotator_cfg)

    ax2 = fig.add_subplot(gs2[6, 1])
    _annotated_boxplot(ax2, score_df_bypaper_lang_df, 'has_en',
                       'bino_score_mean', lang_pairs, annotator_cfg,
                       palette="Set3")

    ax.legend_.remove()
    ax2.legend_.remove()
    ax.set_ylabel('')
    ax2.set_ylabel('')
    ax.set_xlabel('')
    ax2.set_xlabel('')

    fig.savefig(final_figdir / f'{figname}.png', dpi=400,
                bbox_inches='tight', transparent=True)
    fig.savefig(final_figdir / f'{figname}.pdf', dpi=400,
                bbox_inches='tight')
    return fig


# Domain / region / language comparison figure for all preprints.
fig = plot_fig_cr_lang(
    paperinfo_df=paperinfo_df,
    daily_trend_df=daily_trend_df,
    gpt_release_date=gpt_release_date,
    fields_all=fields_all,
    filed_names=filed_names,
    writing_days=writing_days,
    figname='fig2_cr_lang',
)
plt.show()

### Content type

In [None]:
def plot_content_types(
        df: pd.DataFrame,
        final_figdir,
        figname='fig_ct',
        text_max_length=30,
        dpi=400):
    """Draw a two-panel dumbbell chart of Binoculars scores per content type.

    Left panel: for each content type, its share among rows whose
    ``bino_score`` is above the overall mean versus its share among rows at
    or below the mean. Right panel: mean ``bino_score`` per content type
    before versus after the module-level ``gpt_release_date``, annotated with
    the significance mark of a two-sided Mann-Whitney U test.

    Content types are ordered by (mean before - mean after), descending.
    Only content types that occur in the low-score group are plotted.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the columns ``date``, ``bino_score`` and
        ``content_type``. The frame is only read; no columns are added.
    final_figdir : pathlib.Path
        Directory that receives ``<figname>.png`` and ``<figname>.pdf``.
    figname : str
        File stem of the saved figure.
    text_max_length : int
        Maximum line width used when wrapping content-type labels.
    dpi : int
        Resolution passed to ``Figure.savefig`` for both output files.

    Returns
    -------
    matplotlib.figure.Figure
        The created figure.
    """
    # Local import: only this function needs the tick formatter.
    from matplotlib.ticker import FuncFormatter

    # Boolean masks replace the helper columns that used to be written into
    # the caller's frame, so re-running the cell never sees stale columns.
    is_after = df['date'] >= gpt_release_date
    bino_mean = df['bino_score'].mean()
    print(bino_mean)
    is_high = df['bino_score'] > bino_mean

    # Share of every content type inside the low-score / high-score groups.
    ct_low = df.loc[~is_high, 'content_type'].value_counts(
        normalize=True).sort_values()
    base_order = ct_low.index
    ct_high = df.loc[is_high, 'content_type'].value_counts(
        normalize=True).reindex(base_order)

    # Mean score per content type on each side of the release date.
    ct_before = df.loc[~is_after].groupby(
        'content_type')['bino_score'].mean().reindex(base_order)
    ct_after = df.loc[is_after].groupby(
        'content_type')['bino_score'].mean().reindex(base_order)

    # Final display order: largest before-minus-after drop first.
    ctdiff = (ct_before - ct_after).sort_values(ascending=False)
    ct_ord = ctdiff.index
    print(ct_ord)

    ct_low = ct_low.reindex(ct_ord)
    ct_high = ct_high.reindex(ct_ord)
    ct_before = ct_before.reindex(ct_ord)
    ct_after = ct_after.reindex(ct_ord)

    ctpvals = []
    for ct in ct_ord:
        in_ct = df['content_type'] == ct
        scores_before = df.loc[in_ct & ~is_after, 'bino_score'].values
        scores_after = df.loc[in_ct & is_after, 'bino_score'].values
        if len(scores_before) == 0 or len(scores_after) == 0:
            # mannwhitneyu raises on an empty sample; leave the row unmarked.
            ctpvals.append('')
            print(ct, 'p=n/a (no samples on one side of the release date)')
            continue
        _, p = stats.mannwhitneyu(scores_before, scores_after)
        ctpvals.append(p2mark(p))
        print(ct, f'p={p}')

    # Both panels share the same (wrapped) category labels on the y axis.
    wrapped_labels = [
        '\n'.join(wrap(l.capitalize(), text_max_length)) for l in ct_ord]

    fig, axes = plt.subplots(
        1, 2, figsize=[6.5, 3], sharey=True)

    ax = axes[0]
    s = 50
    bms = f"{bino_mean:.2f}"
    # Raw strings: '\i' and '\l' are not valid Python escapes.
    ax.scatter(ct_high.values, wrapped_labels,
               label=r'$\it{Binoculars}>$' + bms,
               marker='D', s=s)
    ax.scatter(ct_low.values, wrapped_labels,
               label=r'$\it{Binoculars}\leq$' + bms,
               marker='D', s=s)
    ax.hlines(y=wrapped_labels,
              xmin=ct_high.values,
              xmax=ct_low.values, color='#666666')
    ax.legend(frameon=False, loc=(0, 1.05))
    # A formatter (rather than frozen tick labels) keeps the percent labels
    # correct if the locator picks new ticks during tight_layout/savefig.
    ax.xaxis.set_major_formatter(
        FuncFormatter(lambda x, _pos: '{:,.0%}'.format(x)))
    ax.set_xlabel('Percentage in Collection')

    ax = axes[1]
    ax.scatter(ct_before.values, wrapped_labels,
               label='Before ChatGPT release', s=s)
    ax.scatter(ct_after.values, wrapped_labels,
               label='After ChatGPT release', s=s)
    ax.hlines(y=wrapped_labels,
              xmin=ct_before.values,
              xmax=ct_after.values, color='#666666')
    # Significance marks sit at the midpoint of each dumbbell.
    midpoints = (ct_before.values + ct_after.values) / 2
    for mid, mark, lbl in zip(midpoints, ctpvals, wrapped_labels):
        if mark == '':
            continue
        ax.annotate(mark, xy=(mid, lbl),
                    ha='center',
                    )

    ax.legend(frameon=False, loc=(0, 1.05))
    ax.set_xlabel('Average Binoculars')

    fig.tight_layout()
    fig.savefig(final_figdir / f'{figname}.png', dpi=dpi,
                bbox_inches='tight', transparent=True)
    fig.savefig(final_figdir / f'{figname}.pdf', dpi=dpi,
                bbox_inches='tight')
    return fig


# Load the pickled content-type result tables, then draw and save the figure.
dfc = read_sdfs_raw()
fig = plot_content_types(
    dfc, final_figdir,
    figname='fig_ct', text_max_length=30, dpi=400)
plt.show()

### Citations and Binoculars scores

In [None]:
def plot_fig3(
    df_impact,
    gpt_release_date,
    final_figdir,
    figname='fig2',
    bins=10, alpha=0.15,
    hspace=2.0,
):
    """Plot how the Binoculars/influence correlation changes around a date.

    Panels:
      a) log-scaled histograms of ``citations`` for rows with
         ``afterChatGPT == 0`` and ``afterChatGPT == 1``;
      b) mean ``influence_measure`` per ``date``;
      c) Pearson correlation between ``bino_score_mean`` and
         ``influence_measure`` in consecutive 30-day windows, with a vertical
         line at ``gpt_release_date``;
      d) box plot of the windowed correlations before vs. after that date,
         titled with the mark of a one-sided Mann-Whitney U test
         (alternative: before > after).

    The overall before/after correlations and their comparison (via
    ``compare_correlations``) are printed as a one-row table.

    Parameters
    ----------
    df_impact : pd.DataFrame
        Must provide ``afterChatGPT``, ``date``, ``bino_score_mean``,
        ``influence_measure`` and ``citations``.
    gpt_release_date : pd.Timestamp
        Date used to split windows into before/after and to draw the marker.
    final_figdir : pathlib.Path
        Directory that receives ``<figname>.png`` and ``<figname>.pdf``.
    figname : str
        File stem of the saved figure.
    bins : int
        Number of histogram bins in panel a.
    alpha : float
        Opacity of the positive/negative background bands in panels c and d.
    hspace : float
        Vertical spacing forwarded to ``fig.subplots_adjust``.

    Returns
    -------
    matplotlib.figure.Figure
        The created figure.
    """
    label_cfg = dict(fontsize='large', va='bottom',  fontweight='bold')
    key = 'bino_score_mean'

    df_impact_before = df_impact.loc[df_impact['afterChatGPT'] == 0]
    df_impact_after = df_impact.loc[df_impact['afterChatGPT'] == 1]

    # Overall correlation on each side of the release and their difference.
    corr_left, p_left = stats.pearsonr(
        df_impact_before[key],
        df_impact_before['influence_measure']
    )
    corr_right, p_right = stats.pearsonr(
        df_impact_after[key],
        df_impact_after['influence_measure']
    )
    corr_diff, pv = compare_correlations(len(df_impact_before), corr_left,
                                         len(df_impact_after), corr_right)

    stat_res = pd.DataFrame(
        [(gpt_release_date, corr_left, p_left,
          corr_right, p_right, corr_diff, pv)],
        columns=['date',
                 'corr_left', 'pv_left',
                 'corr_right', 'pv_right',
                 'stat', 'p-value']).set_index('date')
    print(stat_res)

    # Correlation inside consecutive 30-day windows.
    window_rows = []
    monthpoints = pd.date_range(
        start='2021-12-31', end='2024-03-01', freq='30D')
    for p1, p2 in zip(monthpoints[:-1], monthpoints[1:]):
        dfi = df_impact.loc[
            (df_impact['date'] >= p1) &
            (df_impact['date'] < p2)]
        if len(dfi) < 2:
            # pearsonr needs at least two observations; skip sparse windows
            # instead of aborting the whole figure.
            continue
        corr, pv = stats.pearsonr(
            dfi[key],
            dfi['influence_measure']
        )
        window_rows.append((p1, corr, pv))

    stat_resbym = pd.DataFrame(
        window_rows, columns=['date',
                              'corr', 'pv',
                              ]).set_index('date')
    is_after = stat_resbym.index >= gpt_release_date
    stat_resbym['afterChatGPT'] = is_after
    stat_resbym['aftergpt'] = np.where(is_after, 'After', 'Before')

    fig = plt.figure(figsize=(9, 6))

    # Offset (in points) used to place the panel letters above-left of axes.
    trans = mtransforms.ScaledTranslation(-20/72, 7/72, fig.dpi_scale_trans)
    gs1 = GridSpec(2, 2, width_ratios=[1, 3])
    gs2 = GridSpec(2, 2, width_ratios=[4, 1], hspace=0.5)

    # Panel a: citation histograms.
    ax1 = fig.add_subplot(gs1[0, 0])
    ax = ax1
    df_impact_before['citations'].hist(ax=ax, bins=bins, label='before')
    df_impact_after['citations'].hist(ax=ax, bins=bins, label='after')
    ax.set_yscale('log')
    ax.legend(frameon=False)
    ax.set_ylabel('Counts')
    ax.set_xlabel('Citations')
    ax.text(0.0, 1.0, 'a', transform=ax.transAxes + trans, **label_cfg)

    # Panel b: mean influence measure per date.
    ax2 = fig.add_subplot(gs1[0, 1])
    ax = ax2
    df_impact.groupby('date').agg({
        'influence_measure': 'mean'
    }).plot(ax=ax)
    ax.legend_.remove()
    ax.set_ylabel('Citations (avg)')
    ax.set_xlabel('')
    ax.tick_params(axis='x', labelrotation=90)
    ax.text(0.0, 1.0, 'b', transform=ax.transAxes + trans, **label_cfg)

    # Panel c: windowed correlation over time.
    ax3 = fig.add_subplot(gs2[1, 0])
    ax = ax3
    ax.plot(
        stat_resbym.index,
        stat_resbym['corr'],
        'o-',
        color='dimgray'
    )
    ax.axvline(
        x=gpt_release_date,
        color='red'
    )
    ax.tick_params(axis='x', labelrotation=90)
    ax.annotate('The release of ChatGPT', color='red', xy=(
        gpt_release_date, 1.1),
        xycoords=('data', 'axes fraction'), ha='left', va='center')
    ax.set_ylabel('Corr. between\nBinoculars mean\n&\ncitations')
    # The legend for this panel is created once, after the labelled
    # background bands have been added below.
    ax.text(0.0, 1.0, 'c', transform=ax.transAxes + trans, **label_cfg)

    # Panel d: distribution of windowed correlations before vs. after.
    ax4 = fig.add_subplot(gs2[1, 1], sharey=ax3)
    ax = ax4
    ylabel = 'corr'
    sns.boxplot(data=stat_resbym,
                x='aftergpt',
                y=ylabel,
                ax=ax,
                dodge=False,
                showfliers=False,
                zorder=100,
                )

    _, p = stats.mannwhitneyu(
        stat_resbym.loc[~is_after, ylabel],
        stat_resbym.loc[is_after, ylabel],
        alternative='greater'
    )

    ax.set_ylabel('')
    ax.set_title(f'{p2mark(p)}')
    ax.set_xlabel('ChatGPT\nrelease')

    # Pad the y range by 30% but always keep zero inside it: scaling a
    # positive minimum (or a negative maximum) by 1.3 would otherwise push
    # the limit past the data and detach the bands from the zero line.
    y_low = min(stat_resbym['corr'].min(), 0) * 1.3
    y_high = max(stat_resbym['corr'].max(), 0) * 1.3
    ax3.set_ylim(y_low, y_high)
    for axi in (ax3, ax4):
        axi.axhspan(0, axi.get_ylim()[1], color='lightgreen',
                    edgecolor=None,
                    alpha=alpha,
                    label='Positive: LLM-generated↑, Binoculars↓, Citations↓')
        axi.axhspan(axi.get_ylim()[0], 0, color='orange',
                    edgecolor=None,
                    alpha=alpha,
                    label='Negative: LLM-generated↑, Binoculars↓, Citations↑')
    ax3.legend(loc='upper left', bbox_to_anchor=(0, 1.5), frameon=False)

    fig.subplots_adjust(hspace=hspace)
    fig.savefig(final_figdir / f'{figname}.png', dpi=400,
                bbox_inches='tight', transparent=True)
    fig.savefig(final_figdir / f'{figname}.pdf', dpi=400,
                bbox_inches='tight')
    return fig




In [None]:
# Correlation figure for `df_impact`, saved under the stem 'fig_impact'.
fig2 = plot_fig3(
    df_impact, gpt_release_date, final_figdir,
    figname='fig_impact', bins=10, hspace=2.5)
plt.show()

In [None]:
# Same figure for `df_impact_published`, saved under the stem 'fig2_published'.
fig2 = plot_fig3(
    df_impact_published, gpt_release_date, final_figdir,
    figname='fig2_published', bins=10, hspace=2.5)
plt.show()