In [25]:
# basic
import sys
import os
import json
import requests
from tqdm import tqdm
import ast
import numpy as np
import pandas as pd
import datetime
from collections import defaultdict
from sklearn.model_selection import train_test_split
# debug
import pdb
from loguru import logger

# custom
from parser import work_parser, author_parser, venue_parser, institution_parser
from scraper import oa_work_scraper, oa_author_scraper


# basic
import json
import pdb
import ast
from tqdm import tqdm
from collections import Counter

import numpy as np
import pandas as pd

# plotting
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics

# regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor
from sklearn.neural_network import MLPRegressor

# classification models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from warnings import filterwarnings
filterwarnings(action='ignore', category=DeprecationWarning,  message='`np.bool` is a deprecated alias')
filterwarnings(action='ignore', category=DeprecationWarning,  message='`np.int` is a deprecated alias' )
filterwarnings(action='ignore', category=DeprecationWarning,  message='`np.object` is a deprecated alias')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from collections import namedtuple
from nltk.tokenize import word_tokenize
import time

SEED = 0

In [2]:
input_types = ['works', 'authors', 'insts', 'venues']

data = {}
for input_type in input_types:
    DATA_FILE = './data/data.' + input_type + '.2012.v1.txt'
    with open(DATA_FILE, 'r') as f:
        input_json = [ast.literal_eval(work) for work in tqdm(f.readlines())]
        input_dict = [j for i in input_json for j in i]
        data[input_type] = pd.DataFrame.from_dict(input_dict).set_index('id')
#         print(data[input_type].head())

100%|██████████| 823/823 [02:16<00:00,  6.02it/s]
100%|██████████| 5653/5653 [03:20<00:00, 28.19it/s]
100%|██████████| 241/241 [00:14<00:00, 16.58it/s]
100%|██████████| 35/35 [00:01<00:00, 18.34it/s]


In [3]:
for k in data:
    print(k)
    print(data[k].columns)
    print("=====================")

works
Index(['doi', 'title', 'type', 'publication_date', 'host_venue',
       'open_access_is_oa', 'open_access_oa_status', 'authorships',
       'page_count', 'cited_by_count', 'concepts', 'referenced_works',
       'abstract', 'counts_by_year'],
      dtype='object')
authors
Index(['orchid', 'display_name', 'works_count', 'cited_by_count',
       'created_date', 'concepts', 'counts_by_year'],
      dtype='object')
insts
Index(['display_name', 'country_code', 'type', 'homepage_url', 'works_count',
       'cited_by_count', 'associated_institutions', 'concepts',
       'counts_by_year', 'created_date'],
      dtype='object')
venues
Index(['display_name', 'works_count', 'cited_by_count', 'is_oa', 'type',
       'created_date', 'concepts', 'counts_by_year'],
      dtype='object')


In [4]:
df = pd.DataFrame(data['works'])
# df=df[:1000]
df['no_of_authors'] = df['authorships'].map(lambda x: len(x))
df['no_of_referenced_works'] = df['referenced_works'].map(len)
# FEATURES
# df['yearly_citation_count']=pd.DataFrame(df['cited_by_count']/(pd.to_datetime(df['publication_date']).rsub(pd.to_datetime('2022-10-31')).dt.days/365.25)).round()

#(2) Journal and Publisher relevant features
venues_data = data['venues']
venue_significance =defaultdict(lambda: 0,  (venues_data['cited_by_count']/venues_data['works_count']).to_dict())
df['venue_significance'] = df['host_venue'].map(lambda x: venue_significance[x])
# venue_works =venues_data['works_count'].to_dict()
venue_works =defaultdict(lambda: 0,venues_data['works_count'].to_dict())
df['venue_works'] = df['host_venue'].map(lambda x: venue_works[x])
# venue_citations =venues_data['cited_by_count'].to_dict()
venue_citations =defaultdict(lambda: 0,venues_data['cited_by_count'].to_dict())
df['venue_citations'] = df['host_venue'].map(lambda x: venue_citations[x])

# (3)Author-specific Features
# barcc = df['no_of_referenced_works'].mean()
thresh_author_citation_prominent = data['authors']['cited_by_count'].mean()
# author_citation = data['authors']["cited_by_count"].to_dict()
author_citation = defaultdict(lambda: 0, data['authors']["cited_by_count"].to_dict())
df['author_prominency'] = df['authorships'].map(lambda x: 1 if max([author_citation[i[0]] for i in x])-thresh_author_citation_prominent>=0 else 0)
df['authors_mean_citations'] = df['authorships'].map(lambda x: np.mean([author_citation[i[0]] for i in x]))

# author_work_count = data['authors']["works_count"].to_dict()
author_work_count = defaultdict(lambda: 0, data['authors']["works_count"].to_dict())
df['authors_mean_works'] = df['authorships'].map(lambda x: np.mean([author_work_count[i[0]] for i in x]))


# (7)Insti-specific Features
# insts_citation = data['insts']["cited_by_count"].to_dict()
insts_citation = defaultdict(lambda: 0, data['insts']["cited_by_count"].to_dict())
df['insts_mean_citations'] = df['authorships'].map(lambda x: np.mean([insts_citation[i[1][0]] for i in x if len(i[1])>0]))

# insts_work_count = data['insts']["works_count"].to_dict()
insts_work_count = defaultdict(lambda: 0, data['insts']["works_count"].to_dict())
df['insts_mean_works'] = df['authorships'].map(lambda x: np.mean([insts_work_count[i[1][0]] for i in x if len(i[1])>0]))


#(4)Page_count AA: a lot are -1 in the datase. The feature might be not so relevant
df['page_count']=df['page_count'].map(lambda x: x if isinstance(x, int ) else x[0])

#(5) Publication month
df['publication_month'] = df['publication_date'].map(lambda x: int(x.split('-')[1]))


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [5]:
df_new = df.drop(columns=['open_access_oa_status','doi', 'title', 'type','publication_date','host_venue','authorships', 'referenced_works', 'concepts','counts_by_year'])
df_new = df_new.fillna(0)
X_train, X_test = train_test_split(df_new, test_size=0.2)
np.save(open('train_id.npy', 'wb'), np.array(list(X_train.index)))
np.save(open('test_id.npy', 'wb'), np.array(list(X_test.index)))
# aa=np.load('test.npy')#, 'r')

# (6) Textual features
t=time.time()
docs = list(X_train['abstract'])
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(docs)]
model = Doc2Vec(tagged_data, vector_size = 20, window = 3, min_count = 1, epochs = 10, workers = 4)
model.save("Abstract_doc2Vec")
# model2 = model.load("Abstract_doc2Vec")
print(time.time()-t)
t=time.time()
X_train["abstract_vector"] = X_train["abstract"].map(lambda x: model.infer_vector(word_tokenize(x)))
X_test["abstract_vector"] = X_test["abstract"].map(lambda x: model.infer_vector(word_tokenize(x)))
columns = ["abstract_feature_"+str(i) for i in range(1,21)]
X_train[columns] = pd.DataFrame(X_train["abstract_vector"].tolist(), index= X_train.index)
X_test[columns] = pd.DataFrame(X_test["abstract_vector"].tolist(), index= X_test.index)
print(time.time()-t)

print(X_train.columns)

# target variable
y_train=X_train['cited_by_count']
y_test = X_test['cited_by_count']
X_train = X_train.drop(columns=['abstract', 'abstract_vector'])
X_test = X_test.drop(columns=['abstract','abstract_vector'])
X_train.to_csv("Training_data.csv")#, index=False)
X_test.to_csv("Test_data.csv")#, index=False)
X_train = X_train.drop(columns=['cited_by_count'])
X_test = X_test.drop(columns=['cited_by_count'])

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
SEED = 0

1179.4758884906769


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


849.8859283924103
Index(['open_access_is_oa', 'page_count', 'cited_by_count', 'abstract',
       'no_of_authors', 'no_of_referenced_works', 'venue_significance',
       'venue_works', 'venue_citations', 'author_prominency',
       'authors_mean_citations', 'authors_mean_works', 'insts_mean_citations',
       'insts_mean_works', 'publication_month', 'abstract_vector',
       'abstract_feature_1', 'abstract_feature_2', 'abstract_feature_3',
       'abstract_feature_4', 'abstract_feature_5', 'abstract_feature_6',
       'abstract_feature_7', 'abstract_feature_8', 'abstract_feature_9',
       'abstract_feature_10', 'abstract_feature_11', 'abstract_feature_12',
       'abstract_feature_13', 'abstract_feature_14', 'abstract_feature_15',
       'abstract_feature_16', 'abstract_feature_17', 'abstract_feature_18',
       'abstract_feature_19', 'abstract_feature_20'],
      dtype='object')
(102477, 33)
(102477,)
(25620, 33)
(25620,)


In [49]:
# regression analysis
# X_train = X_train.drop(columns=['open_access_oa_status'])
# X_test = X_test.drop(columns=['open_access_oa_status'])
# X_train = X_train.drop(columns=['abstract_vector'])
# X_test = X_test.drop(columns=['abstract_vector'])


def print_metric(metric_name, metric_list):
    mean, std = np.mean(metric_list), np.std(metric_list)
    print (f"  - {metric_name}: {np.round(mean, 3)} +/- {np.round(std, 3)}")
    return

reg_models = {
    'Linear Regression': LinearRegression(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
#     'SGDRegressor': SGDRegressor(),
    'MLPRegressor': MLPRegressor(alpha=0, random_state=SEED, hidden_layer_sizes=1,max_iter=100, learning_rate_init =0.0001),
    'XGBRegressor': XGBRegressor(),
}

for model_name, model in reg_models.items():
#     kf = KFold(n_splits=5, random_state=2, shuffle=True)
        train_rmse, test_rmse = [], []
        print (f"{model_name}:===========")
    
#     for train_index, test_index in kf.split(X):
#         X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#         y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#         # normalize
#         scaler = StandardScaler()
#         X_train = scaler.fit_transform(X_train)
#         X_test = scaler.transform(X_test)

        model.fit(X_train,y_train)
        y_pred = model.predict(X_train)
        rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred))
#         rmse = mean_squared_error(y_true=y_train, y_pred=y_pred, squared=False)
        train_rmse.append(rmse)
        train_abs_error = mean_absolute_error(y_train, y_pred)

        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
#         rmse = mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)
        test_rmse.append(rmse)
        test_abs_error = mean_absolute_error(y_test, y_pred)

        print_metric('Train RMSE', train_rmse)
        print_metric('Test RMSE', test_rmse)
        print("Train Mean absolute error: ", train_abs_error)
        print("Test Mean absolute error: ", test_abs_error)
        if model_name[:5]=="Linea":
            print(model.coef_)
            print(model.get_params())
            print(model.score(X_train,y_train))
            print(model.score(X_test,y_test))
        elif model_name[:3]=="SGD" :
            print(model.coef_)
            print(model.get_params())
            print(model.score(X_train,y_train))
            print(model.score(X_test,y_test))
        elif model_name[:4]=="Grad":#
            print(model.get_params())
            print(model.score(X_train,y_train))
            print(model.score(X_test,y_test))
        elif model_name[:3]=="XGB":
            print(model.get_params())
            print(model.score(X_train,y_train))
            print(model.score(X_test,y_test))
        else:
            print(model.get_params())
            print(model.score(X_train,y_train))
            print(model.score(X_test,y_test))


  - Train RMSE: 25.755 +/- 0.0
  - Test RMSE: 47.069 +/- 0.0
Train Mean absolute error:  7.867497940751983
Test Mean absolute error:  8.27016541579083
[ 4.22616493e+00 -8.73679535e-01  3.39464316e-01  1.64043378e-01
  8.87633418e-01  1.63614624e-05 -5.79082800e-06  2.85417181e+00
  1.35737100e-03 -2.62724760e-02  3.79743895e-07 -2.33656427e-05
 -4.82785240e-02  4.38302174e-01  2.53482784e-01  2.08403721e+00
 -1.07956290e+00 -2.37552127e+00 -1.95784353e-01  7.94044047e-01
 -2.03907194e+00 -2.02551605e+00 -2.83709809e+00 -1.44459208e+00
 -3.44047451e-01 -6.51918879e+00 -1.37184051e+00  9.03161866e-01
 -9.30952728e-02 -1.21290830e+00  2.35497603e+00  1.12841701e+00
  2.81649928e+00]
{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}
0.1364806726872041
0.0591526263990334
  - Train RMSE: 22.442 +/- 0.0
  - Test RMSE: 46.805 +/- 0.0
Train Mean absolute error:  7.096495132209163
Test Mean absolute error:  7.728794565962442
{'alpha': 0.9, 'criterion': 'friedman_mse', '



  - Train RMSE: 28.097 +/- 0.0
  - Test RMSE: 48.786 +/- 0.0
Train Mean absolute error:  7.122628015130462
Test Mean absolute error:  7.511123345029008
{'activation': 'relu', 'alpha': 0, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': 1, 'learning_rate': 'constant', 'learning_rate_init': 0.0001, 'max_iter': 100, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': 0, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
-0.027736915769789094
-0.010736355797493546
  - Train RMSE: 12.542 +/- 0.0
  - Test RMSE: 47.548 +/- 0.0
Train Mean absolute error:  5.6441358425818375
Test Mean absolute error:  7.839350211244021
{'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'callbacks': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'early_stopping_rounds': None, 'en