In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import joblib
from sklearn.pipeline import Pipeline
import sklearn.linear_model as lm
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import scipy.stats as stats
from sklearn.svm import SVC
import sklearn.metrics as mt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import scipy.stats as stats
import json
# Content-type labels; a label's position in this list is the index of its
# "hot" entry in the one-hot vectors built below.
categories = [
    "phenomenon description",
    "hypothesis formulation",
    "methodology explanation",
    "data presentation",
    "logical deduction",
    "result interpretation",
    "literature review",
    "comparative analysis",
    "conclusion summarization",
    "future work suggestions",
]

# label -> float vector of length len(categories) that is 1.0 at the label's
# own position and 0.0 everywhere else.
categories_mapping = {
    label: (np.arange(len(categories)) == position).astype(float)
    for position, label in enumerate(categories)
}

# Directory that the figure-saving cells write into.
final_figdir = Path('/path/to/Downloads')
# Cut-off date used to split papers into before/after groups.
gpt_release_date = pd.to_datetime('2022-11-30')

In [None]:
def trend_slope(x):
    """Return the slope of a least-squares line fitted through ``x``.

    The values are regressed on their own positions ``0 .. len(x) - 1``
    with ``sklearn.linear_model.LinearRegression``.

    Args:
        x: Sequence of numeric values, in order.

    Returns:
        The first (and only) fitted coefficient, i.e. the change in value
        per position step.
    """
    positions = np.arange(len(x)).reshape(-1, 1)  # one feature column
    values = np.array(x)
    regression = lm.LinearRegression()
    regression.fit(positions, values)
    return regression.coef_[0]


def rolling_difference_mean(x, window=2):
    """Return the mean of the moving-averaged first differences of ``x``.

    Args:
        x: Sequence of numeric values, in order.
        window: Number of consecutive differences averaged together.

    Returns:
        The mean over all moving-average values. Inputs that yield fewer
        than ``window`` differences get no special treatment here.
    """
    steps = np.diff(x, n=1)
    # Equal weights summing to one turn the convolution into a moving average.
    kernel = np.ones(window) / window
    smoothed = np.convolve(steps, kernel, mode='valid')
    return np.mean(smoothed)

def percentage_decreases(x):
    """Return the share of consecutive steps in ``x`` that go down.

    Args:
        x: Sequence of numeric values, in order.

    Returns:
        Number of strictly negative first differences divided by the total
        number of differences, as a fraction (not multiplied by 100).
    """
    step_is_drop = np.diff(x) < 0
    return step_is_drop.sum() / len(step_is_drop)

In [None]:
# Load the scored samples.
# NOTE(review): unpickling can execute code stored in the file, so this path
# must point at a trusted artifact.
dfo = pd.read_pickle('data/content_type/modified_scored_df_gptzero.pkl')
# Hold every row's scores as a NumPy array.
dfo['bino_score'] = dfo['bino_score'].map(lambda scores: np.array(scores))
# Swap every row's sequence of content-type labels for an array with one
# one-hot row per label (looked up in `categories_mapping`).
dfo['content_type'] = dfo['content_type'].map(
    lambda labels: np.array([categories_mapping[label] for label in labels])
)

In [None]:
# Display the column labels of the loaded frame.
dfo.columns

In [None]:
# Work on a copy of the loaded frame and reduce each row's per-row sequence
# to three scalar features: mean, variance (NumPy's default, ddof=0) and
# minimum. `bino_score` holds Binoculars scores; `gptzerolabels` is
# presumably the matching GPTZero output -- TODO confirm its exact meaning.
df = dfo.copy().assign(
    bino_score_mean=lambda frame: frame['bino_score'].map(np.mean),
    bino_score_var=lambda frame: frame['bino_score'].map(np.var),
    bino_score_min=lambda frame: frame['bino_score'].map(np.min),
    gptz_mean=lambda frame: frame['gptzerolabels'].map(np.mean),
    gptz_var=lambda frame: frame['gptzerolabels'].map(np.var),
    gptz_min=lambda frame: frame['gptzerolabels'].map(np.min),
)



In [None]:
sns.set_theme(style="ticks")

# Summary features fed to each detector's classifier, keyed by detector name.
feat_cols_all = {
    'GPTZero': ['gptz_mean', 'gptz_var', 'gptz_min'],
    'Binoculars': [
        'bino_score_min',
        'bino_score_mean',
        'bino_score_var',
    ]
}
# Detector whose fitted classifier and ROC curve the later cells reuse through
# the names `pipe`, `feat_cols`, `fpr`, `tpr` and `roc_auc`. Stating it here
# keeps those cells correct even if the entries above are reordered or
# extended, instead of relying on whichever detector the loop visited last.
downstream_model = 'Binoculars'
fitted_models = {}

# One ROC panel per detector, side by side.
fig, axes = plt.subplots(
    1, 2, figsize=[6, 3],
    sharex=True, sharey=True)
for axidx, (model_name, feat_cols) in enumerate(feat_cols_all.items()):

    # Report how strongly each single feature correlates with the label.
    for col in feat_cols:
        res = stats.pearsonr(df['modified_bygpt'], df[col])
        print(
            f'correlation between {col} and is_modified_by_ChatGPT: corr={res.statistic:.3f} pv={res.pvalue:.3f}')

    X = df[feat_cols].values
    y = df['modified_bygpt'].values
    # NOTE(review): test_size=0.8 fits on 20% of the rows and evaluates on the
    # other 80% -- confirm this is intended rather than train_size=0.8.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8,
                                                        random_state=2022)

    # NOTE(review): the step is called 'svc' although it wraps a logistic
    # regression; the name is left as is because the pipeline is pickled in a
    # later cell and consumers may look the step up by name.
    pipe = Pipeline([
        ('svc', lm.LogisticRegression(C=2.))
    ])
    pipe.fit(X_train, y_train)
    # Probability of the positive class for every held-out row.
    y_score = pipe.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = mt.roc_curve(y_test, y_score)
    roc_auc = mt.auc(fpr, tpr)
    fitted_models[model_name] = {
        'pipe': pipe,
        'feat_cols': feat_cols,
        'fpr': fpr,
        'tpr': tpr,
        'roc_auc': roc_auc,
    }

    ax = axes[axidx]
    ax.plot(fpr, tpr, color='darkorange', lw=2,
            label='ROC(area=%0.2f)' % roc_auc)
    # Diagonal reference line (a classifier no better than chance).
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_title(model_name)
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.set_aspect('equal')
    ax.legend(loc="lower right", frameon=False)

# Re-bind the names read by later cells to the chosen detector explicitly.
selected = fitted_models[downstream_model]
pipe = selected['pipe']
feat_cols = selected['feat_cols']
fpr, tpr, roc_auc = selected['fpr'], selected['tpr'], selected['roc_auc']

fig.savefig(final_figdir / 'gptzero_binocualrs_comparsion.pdf', dpi=400,
            bbox_inches='tight', transparent=True)
plt.show()

In [None]:

# Directory holding the paper-level inputs.
# NOTE(review): `dir` shadows the built-in function of the same name; the name
# is kept because a later cell reads it.
dir = Path('data/')
paperinfo_df = pd.read_hdf(dir / 'allpaperinfo.hdf5')
# Keep only rows whose region is not the literal string 'None'; copy so that
# later column assignments act on an independent frame.
paperinfo_df = paperinfo_df[paperinfo_df['country_region'].ne('None')].copy()
paperinfo_df.head(3)

In [None]:
# Lookup table loaded from JSON; its values are research-field names.
# Path.read_text opens and closes the file itself, so no file handle is left
# open the way a bare open() passed straight to json.load would leave one.
category_mapping = json.loads((dir / 'category_mapping.json').read_text())
# Alphabetical list of the distinct fields present in the mapping.
fields_all = sorted(set(category_mapping.values()))
fields_all.remove('Interdisciplinary')  # sample size too small
# Short display labels for the fields.
# NOTE(review): the variable name looks like a typo of `field_names`; it is
# left unchanged because other cells may refer to it.
filed_names = {
    "Biological Sciences": "BioS",
    "Computer Science": "CompS",
    "Economics and Finance": "EcoF",
    "Engineering": "Eng",
    "Environmental Sciences": "EnvS",
    "Mathematical Sciences": "Math",
    "Medicine": "Med",
    "Neurosciences": "Neur",
    "Physical Sciences": "PhyS",
}


In [None]:
# Ratio of each paper's minimum score to its mean / variance, shifted by -1.
paperinfo_df['bino_score_min2mean'] = (
    paperinfo_df['bino_score_min'].div(paperinfo_df['bino_score_mean']).sub(1)
)
paperinfo_df['bino_score_min2var'] = (
    paperinfo_df['bino_score_min'].div(paperinfo_df['bino_score_var']).sub(1)
)
# 1 for papers dated strictly after `gpt_release_date`, otherwise 0.
paperinfo_df['afterChatGPT'] = (
    paperinfo_df['date'].gt(gpt_release_date).astype(int)
)

# Every comma-separated category code is translated through
# `category_mapping` (so an unmapped code raises KeyError), but only the
# translation of the first code is stored.
paperinfo_df['domains'] = paperinfo_df['category'].map(
    lambda raw: [category_mapping[code] for code in raw.split(',')][0]
)

In [None]:
# Classify every paper with the pipeline fitted in the ROC cell, using the
# feature columns that cell left in `feat_cols`.
paperinfo_df['bygpt_pred'] = pipe.predict(paperinfo_df[feat_cols].to_numpy())

# Per date: share of papers predicted positive, plus the after-release flag.
paperinfo_wcp_df = paperinfo_df.groupby('date').agg(
    bygpt_pred=('bygpt_pred', 'mean'),
    afterChatGPT=('afterChatGPT', 'min'),
)

# Forward-looking moving average: each row averages its own value and the
# following `writing_days - 1` rows. The window counts rows, not calendar
# days. NOTE(review): this equals a day window only if every date is present.
writing_days = 30
paperinfo_wcp_df['bygpt_pred_ma'] = (
    paperinfo_wcp_df['bygpt_pred']
    .shift(1 - writing_days)
    .rolling(window=writing_days)
    .mean()
)
# Discard the rows that have no complete window (or any other missing value).
paperinfo_wcp_df = paperinfo_wcp_df.dropna()

# Paper-level correlation between the after-release flag and the prediction.
print(stats.pearsonr(
    paperinfo_df['afterChatGPT'].to_numpy(),
    paperinfo_df['bygpt_pred'].to_numpy()
))

# Quick look at the unsmoothed per-date share.
fig, ax = plt.subplots(figsize=[6, 3])
ax.plot(paperinfo_wcp_df.index, paperinfo_wcp_df['bygpt_pred'].to_numpy())
plt.show()

In [None]:
# Baseline correction: the 99th percentile of the smoothed share over dates
# flagged afterChatGPT == 0 is treated as the detector's background level.
bias = paperinfo_wcp_df.loc[paperinfo_wcp_df['afterChatGPT']==0]['bygpt_pred_ma'].quantile(0.99)

# Subtract the baseline and floor at zero so the index cannot go negative.
paperinfo_wcp_df['bygpt_pred_adjusted'] = np.clip(paperinfo_wcp_df['bygpt_pred_ma'].values-bias,0,None)

# No plt.clf() here: it would act on pyplot's implicit "current figure"
# (a leftover from an earlier cell, or a new empty figure created on the
# spot), never on the figure created below, and can emit a stray blank figure.
sns.set_theme(style="ticks")
fig, axes=plt.subplots(1,2, figsize=[8,2.5],
                       gridspec_kw={'width_ratios': [1, 2]}
                      )

# Left panel: ROC curve from `fpr`, `tpr` and `roc_auc` computed in the
# earlier ROC cell.
ax = axes[0]
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_aspect(1)

# Right panel: baseline-adjusted index over time, shown in percent.
ax = axes[1]
ax.plot(
    paperinfo_wcp_df.index,
    paperinfo_wcp_df['bygpt_pred_adjusted'].values*100,
    lw=2,
    label='adjusted'
)
ax.tick_params(axis='x', labelrotation=90)
index_label = 'AI-content Index (%)'
ax.set_ylabel(index_label)
fig.tight_layout()
fig.savefig(final_figdir / 'preds.png', dpi=400,
            bbox_inches='tight', transparent=True)
plt.show()

In [None]:

# Serialize the fitted pipeline held in `pipe` to disk with joblib.
joblib.dump(pipe, 'data/ai_index_pipe.pkl')