# General Linear Model with Log(c+1)

We fit a *general linear model* (LM_log) to log(c+1), where c is the citation count (TC), to estimate the effects of having a **National Multi-affiliated author (NM)** or an **International Multi-affiliated author (IM)** on the citation count.

We also include, as control variables, several factors that have been reported to be associated with citation counts.
The *R-style* regression equation is
```R
Log(TC+1) ~ NM_mark + IM_mark + N_ins + N_c + N_refs + N_a
```
where

- NM_mark: 1 for having 1 or more NM authors, otherwise 0
- IM_mark: 1 for having 1 or more IM authors, otherwise 0
- N_ins: number of institutions
- N_c: number of countries
- N_refs: number of references
- N_a: number of authors*

\*We only consider papers with no more than 10 authors.

In [12]:
# folders: every input/output location is derived from the project name
project = 'MultipleAffiliations'
project_root = 'D:/Data/' + project
data_dir = project_root + '/data/'
result_dir = project_root + '/result/'
# result_dir already ends with '/', so this path contains a doubled slash
regression_result_dir = result_dir + '/LM_log/'

In [13]:
# import packages
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import glob
import os
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

In [14]:
# pmark(c, p): append significance stars to an already formatted coefficient
def pmark(c,p):
    """Return ``c`` followed by the significance marker for p-value ``p``.

    The marker is '***' for p < 0.001, '**' for 0.001 <= p < 0.01,
    '*' for 0.01 <= p < 0.05, and empty otherwise (including a NaN p-value,
    for which every comparison is false).
    """
    # Thresholds are tested from the strictest upwards, so each branch only
    # needs its upper bound.
    if p < 0.001:
        stars = '***'
    elif p < 0.01:
        stars = '**'
    elif p < 0.05:
        stars = '*'
    else:
        stars = ''
    return f'{c}{stars}'

In [15]:
# fitting function: fit_model(data,cate,IVs,reg_file)
def fit_model(data,cate,IVs,reg_file):
    """Fit an OLS model of ``LogTC`` on ``IVs`` and return one summary row.

    Parameters
    ----------
    data : table passed to ``smf.ols``; must provide ``LogTC`` and every
        column named in ``IVs``.
    cate : str
        Label of the subset being fitted; any '/' is replaced by '&' in the
        returned label.
    IVs : list of str
        Names of the independent variables, in output order.
    reg_file : str
        Path of the text file that receives the full regression summary.
        Its parent folder is created if it does not exist yet.

    Returns
    -------
    list
        ``[label, intercept, <one entry per IV>, adj. R-squared, AIC, BIC]``.
        Coefficients are strings with two decimals plus the stars added by
        ``pmark``; adjusted R-squared is a two-decimal string; AIC and BIC
        are the raw values from the fitted result.
    """
    mod = smf.ols(f"LogTC ~ {'+ '.join(IVs)}", data)
    res = mod.fit()
    label = cate.replace('/','&')

    # open(..., 'w') does not create missing folders, so make sure the target
    # folder exists instead of failing after the model has been fitted.
    out_dir = os.path.dirname(reg_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(reg_file,'w') as fw:
        print(res.summary(),file=fw)

    # Intercept first, then the IVs in the order given by the caller.
    terms = ['Intercept'] + list(IVs)
    coefs = [pmark(f"{res.params[t]:.2f}", res.pvalues[t]) for t in terms]
    return [label] + coefs + [f"{res.rsquared_adj:.2f}"] + [res.aic, res.bic]

## Example

Take immunology (IMM) papers as an example.

In [16]:
# load the regression table and give its columns the names used below
reg_columns = ['UT', 'TC', 'NM_mark', 'IM_mark', 'S_mark',
               'N_refs', 'N_ins', 'N_c', 'N_a', 'Subject']
df = pd.read_csv(f"{data_dir}/IIC_reg_19subject.csv")
df.columns = reg_columns

In [17]:
# dependent variable: log(TC + 1)
df['LogTC'] = np.log(df['TC']+1)

# Recode the flag columns as 1/0 dummies: 'Y' -> 1, anything else -> 0.
# A value that is already 1 stays 1, so re-running this cell no longer
# resets every indicator to 0 (the recoded values are never equal to 'Y').
for iv in ['NM','IM','S']:
    df[f'{iv}_mark'] = df[f'{iv}_mark'].isin(['Y', 1]).astype('int64')

In [18]:
cate = 'IMM'
data = df.loc[df['Subject'] == cate]
n = len(data)                        # papers in the subject before filtering
data = data.loc[data['N_a'] <= 10]   # we only consider papers with <= 10 authors
m = len(data)                        # papers kept for the regression
n, m

(18586, 13653)

### VIF test

Variance Inflation Factor is used to test the multicollinearity among independent variables.

In [19]:
IVs = ['NM_mark','IM_mark','N_refs','N_ins','N_c','N_a']
# design frame = the IVs plus the constant column added by add_constant
X = add_constant(data[IVs])
design = X.values
# one VIF per design column, prefixed with the subject label
vif = [cate] + [variance_inflation_factor(design, col) for col in range(design.shape[1])]
df_vif = pd.DataFrame([vif], columns=['Category', *X.columns])
df_vif

Unnamed: 0,Category,const,NM_mark,IM_mark,N_refs,N_ins,N_c,N_a
0,IMM,20.164208,1.413633,1.023131,1.063238,2.174727,1.532999,1.212383


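All VIFs of the independent variables are below 2.2 (the large value in the `const` column belongs to the intercept, not to a regressor), so the variables are not highly correlated.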

### Fitting

In [20]:
# regressors, in the order they appear in the regression formula
IVs = [
    'NM_mark', 'IM_mark',
    'N_refs', 'N_ins', 'N_c', 'N_a',
]

In [21]:
# build the formula "LogTC ~ iv1+ iv2+ ..." from the IV list and fit it by OLS
formula = "LogTC ~ " + '+ '.join(IVs)
mod = smf.ols(formula, data)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  LogTC   R-squared:                       0.121
Model:                            OLS   Adj. R-squared:                  0.120
Method:                 Least Squares   F-statistic:                     312.1
Date:                Tue, 14 Jan 2020   Prob (F-statistic):               0.00
Time:                        21:03:34   Log-Likelihood:                -17604.
No. Observations:               13653   AIC:                         3.522e+04
Df Residuals:                   13646   BIC:                         3.528e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.0834      0.034     32.080      0.0

## Across Disciplines

### Institutional Collaboration

In [22]:
# load data
df = pd.read_csv(f"{data_dir}/Factor_IC.csv")
df.columns = ['UT', 'TC', 'NM_mark', 'IM_mark', 'S_mark', 'N_refs', 'N_ins',
       'N_c', 'N_a', 'Subject']
# Keep papers with at most 10 authors. .copy() makes the subset an independent
# frame, so the column assignments in the following cell do not write into a
# slice of the original table (SettingWithCopyWarning).
df = df[df['N_a']<=10].copy()

In [23]:
# dependent variable: log(TC + 1)
df['LogTC'] = np.log(df['TC']+1)

# Recode the flag columns as 1/0 dummies: 'Y' -> 1, anything else -> 0.
# A value that is already 1 stays 1, so re-running this cell no longer
# resets every indicator to 0 (the recoded values are never equal to 'Y').
for iv in ['NM','IM','S']:
    df[f'{iv}_mark'] = df[f'{iv}_mark'].isin(['Y', 1]).astype('int64')

#### VIF test

In [24]:
# vif test: one row of VIFs (constant + each IV) per subject
IVs = ['NM_mark','IM_mark','N_refs','N_ins','N_c','N_a']
vifs = []
for cate, data in df.groupby('Subject'):
    X = add_constant(data[IVs])
    design = X.values
    vif = [variance_inflation_factor(design, col) for col in range(design.shape[1])]
    vifs.append([cate, *vif])

# X still refers to the last group's design frame; its columns label the VIFs
df_vif = pd.DataFrame(vifs, columns=['Category', *X.columns])
df_vif

Unnamed: 0,Category,const,NM_mark,IM_mark,N_refs,N_ins,N_c,N_a
0,AGR,14.801898,1.111909,1.313493,1.025844,1.446552,1.565893,1.167522
1,BIO,15.305312,1.219432,1.380594,1.024301,1.56012,1.709745,1.134153
2,CHE,16.103251,1.184236,1.360849,1.011681,1.538391,1.743881,1.124487
3,CLI,13.305229,1.123049,1.286456,1.017947,1.434947,1.561606,1.116686
4,COM,14.254099,1.190514,1.211634,1.012692,1.713483,1.601197,1.207819
5,ENG,15.503698,1.140109,1.247712,1.020795,1.486757,1.540843,1.140294
6,ENV,11.609784,1.176832,1.374624,1.031189,1.873638,1.850714,1.270242
7,GEO,10.610943,1.194339,1.268281,1.044108,2.119532,1.811895,1.403498
8,IMM,15.026661,1.201426,1.313674,1.052732,1.606743,1.681364,1.172332
9,MATE,15.687966,1.162498,1.379841,1.029935,1.501608,1.730371,1.132541


In [25]:
# Make sure the result folder exists before writing into it, then export the
# VIF table. The context manager saves and closes the workbook even if
# to_excel raises.
os.makedirs(regression_result_dir, exist_ok=True)
with pd.ExcelWriter(f'{regression_result_dir}/DIS_VIF.xlsx') as excel:
    df_vif.to_excel(excel,index=True)

#### Fitting

In [26]:
# fitting: one OLS model per subject, summarised as one row per subject
IVs = ['NM_mark','IM_mark','N_refs','N_ins','N_c','N_a']
# The per-subject summaries are written here; create the folder up front
# because writing a file does not create missing folders.
discipline_dir = f"{regression_result_dir}/Discipline"
os.makedirs(discipline_dir, exist_ok=True)
models = [fit_model(data,cate,IVs,f"{discipline_dir}/{cate}.txt") for cate,data in df.groupby('Subject')]
cols = ['Subject','Intercept']+IVs+['R-Squared']+['AIC','BIC']
models = pd.DataFrame(models, columns=cols)

In [27]:
# Index the summary by subject. The guard keeps the cell re-runnable: once
# 'Subject' is the index it is no longer a column, and setting it again would
# raise a KeyError.
if models.index.name != 'Subject':
    models = models.set_index('Subject')
# display order of the subjects
idx = ['SPA','NEU','PSY','IMM','CLI','PHA','PHY','MOL','BIO','MIC','PLA','ENV','GEO','CHE','AGR','MATE','COM','ENG','MATH']
models.loc[idx]

Unnamed: 0_level_0,Intercept,NM_mark,IM_mark,N_refs,N_ins,N_c,N_a,R-Squared,AIC,BIC
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SPA,1.06***,-0.02,0.04**,0.01***,0.03***,0.04***,0.01***,0.17,65309.456183,65366.524185
NEU,0.81***,0.07***,0.04***,0.01***,-0.01***,0.11***,0.05***,0.14,195762.764419,195827.638157
PSY,0.46***,0.08***,0.03*,0.01***,-0.00,0.08***,0.06***,0.19,62803.157563,62860.383645
IMM,0.94***,0.08***,0.03*,0.01***,0.00,0.10***,0.02***,0.13,90934.851855,90994.176486
CLI,0.42***,0.08***,0.03***,0.01***,0.01***,0.13***,0.06***,0.15,956516.548891,956592.428207
PHA,0.84***,0.10***,0.04**,0.01***,-0.02***,0.11***,0.04***,0.13,153800.366738,153863.588235
PHY,0.60***,0.11***,0.11***,0.01***,-0.04***,0.17***,0.04***,0.15,427426.550567,427496.558915
MOL,0.82***,0.10***,0.07***,0.01***,-0.04***,0.11***,0.06***,0.15,147856.103757,147918.597527
BIO,0.93***,0.09***,0.05***,0.01***,-0.05***,0.14***,0.04***,0.11,270357.567158,270424.540843
MIC,0.82***,0.05***,0.01,0.01***,-0.01**,0.12***,0.04***,0.15,74490.278902,74548.538661


In [28]:
# Export the summary in display order; the context manager saves and closes
# the workbook even if to_excel raises.
with pd.ExcelWriter(f'{regression_result_dir}/DIS.xlsx') as excel:
    models.loc[idx].to_excel(excel,index=True)

## Across Countries and Disciplines

### Institutional Collaboration 

In [29]:
# glob returns matches in arbitrary order; sort them so the countries are
# processed (and the result rows appear) in the same order on every run
files = sorted(glob.glob(f"{data_dir}/country/*.csv"))

#### Number of records

In [30]:
# read every country file and tag its rows with the country code
frames = []
for file in files:
    df = pd.read_csv(file)
    df.columns = ['UT', 'TC', 'DomesticNM_mark', 'DomesticIM_mark', 'ForeignNM_mark','ForeignIM_mark', 'N_refs', 'N_ins',
       'N_c', 'N_a', 'Subject']
    # The country code is the file name without its extension. os.path.basename
    # handles the platform's path separators, whereas splitting on '\\' only
    # works for Windows-style paths.
    country = os.path.basename(file).split('.')[0]
    df['country'] = country
    frames.append(df)


In [31]:
# stack the per-country frames into one table; each row keeps its 'country' label
df = pd.concat(frames)

In [32]:
# number of papers (UT count) per subject and country, papers with <= 10 authors only
df = df.loc[df['N_a'] <= 10]
df = df.pivot_table(values='UT', index='Subject', columns='country', aggfunc='count')
df.head(2)

country,BR,CA,CN,DE,FR,IN,IT,JP,RU,UK,US,ZA
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AGR,8215,2654,9921,3171,3367,3145,3441,2971,331,2645,11985,698
BIO,3494,4606,17774,9215,6034,4522,5471,8180,1835,8308,30880,530


In [33]:
# Export the record counts; the context manager saves and closes the workbook
# even if to_excel raises.
with pd.ExcelWriter(f"{regression_result_dir}/number_obs_country_discipline.xlsx") as excel:
    df.to_excel(excel)

#### VIF test

In [34]:
# vif test: one row of VIFs (constant + each IV) per country and subject
vifs = []
IVs = ['DomesticNM_mark', 'DomesticIM_mark', 'ForeignNM_mark','ForeignIM_mark', 'N_refs', 'N_ins',
       'N_c', 'N_a']
for file in files:
    df = pd.read_csv(file)
    df.columns = ['UT', 'TC', 'DomesticNM_mark', 'DomesticIM_mark', 'ForeignNM_mark','ForeignIM_mark', 'N_refs', 'N_ins',
       'N_c', 'N_a', 'Subject']

    # only consider authors <= 10; .copy() gives an independent frame so the
    # dummy recoding below does not write into a slice of the loaded table
    df = df[df['N_a']<=10].copy()
    # Country code = file name without extension. os.path.basename handles the
    # platform's path separators; splitting on '\\' only works on Windows.
    country = os.path.basename(file).split('.')[0]
    # recode the flag columns as 1/0 dummies: 'Y' -> 1, anything else -> 0
    for iv in ['DomesticNM_mark', 'DomesticIM_mark', 'ForeignNM_mark', 'ForeignIM_mark']:
        df[iv] = (df[iv] == 'Y').astype('int64')
    for cate,data in df.groupby('Subject'):
        X = add_constant(data[IVs])
        vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
        vifs.append([country,cate]+vif)

# X still refers to the last fitted group; its columns label the VIF columns
df_vif = pd.DataFrame(vifs, columns=['Country','Category']+list(X.columns))
df_vif

Unnamed: 0,Country,Category,const,DomesticNM_mark,DomesticIM_mark,ForeignNM_mark,ForeignIM_mark,N_refs,N_ins,N_c,N_a
0,BR,AGR,18.530956,1.060037,1.211888,1.108075,1.241542,1.159229,1.296338,1.713922,1.119690
1,BR,BIO,16.399821,1.188910,1.245379,1.258686,1.284104,1.028372,1.797475,2.064364,1.106289
2,BR,CHE,17.467469,1.177435,1.226529,1.179001,1.305664,1.021160,1.732024,1.883072,1.202658
3,BR,CLI,15.308445,1.222377,1.176643,1.255745,1.209599,1.036266,1.812964,1.996822,1.109240
4,BR,COM,14.365915,1.215298,1.093997,1.133444,1.259180,1.025046,1.997995,1.885732,1.269868
5,BR,ENG,15.430703,1.129447,1.124476,1.160198,1.171233,1.050383,1.805027,1.658848,1.223371
6,BR,ENV,12.444946,1.181756,1.238318,1.194680,1.325276,1.125296,1.958191,2.198936,1.242411
7,BR,GEO,11.336970,1.149493,1.088253,1.246704,1.288549,1.139872,2.528635,2.072367,1.507647
8,BR,IMM,17.057812,1.276073,1.184368,1.264594,1.282007,1.044671,1.977626,2.192313,1.109057
9,BR,MATE,15.896360,1.158964,1.236561,1.235553,1.239276,1.062306,1.769546,1.855672,1.255964


In [35]:
# Export the country/subject VIF table; the context manager saves and closes
# the workbook even if to_excel raises.
with pd.ExcelWriter(f'{regression_result_dir}/country_discipline_VIF.xlsx') as excel:
    df_vif.to_excel(excel,index=False)

In [36]:
# column-wise maximum over all country/subject rows: the largest VIF observed per variable
df_vif.max()

Country                 ZA
Category               SPA
const              23.9502
DomesticNM_mark     1.7696
DomesticIM_mark    1.80785
ForeignNM_mark     2.27201
ForeignIM_mark     1.69554
N_refs             1.37947
N_ins              4.75107
N_c                3.92743
N_a                2.62247
dtype: object

#### Fitting

For each country and subject, we fit the regression model.

In [37]:
# fitting: for every country file, fit one OLS model per subject and save
# (a) the full summary per subject and (b) one summary workbook per country
IVs = ['DomesticNM_mark', 'DomesticIM_mark', 'ForeignNM_mark', 'ForeignIM_mark',
       'N_refs', 'N_ins', 'N_c', 'N_a']
# Loop-invariant pieces are prepared once instead of once per country.
cols = ['Subject','Intercept']+IVs+['R-Squared']+['AIC','BIC']
flag_cols = ['DomesticNM_mark', 'DomesticIM_mark', 'ForeignNM_mark', 'ForeignIM_mark']
summary_dir = f"{regression_result_dir}/countries_summary"
os.makedirs(summary_dir, exist_ok=True)

for file in files:
    df = pd.read_csv(file)
    df.columns = ['UT', 'TC', 'DomesticNM_mark', 'DomesticIM_mark', 'ForeignNM_mark','ForeignIM_mark', 'N_refs', 'N_ins',
       'N_c', 'N_a', 'Subject']
    # .copy() gives an independent frame so the column assignments below do
    # not write into a slice of the loaded table (SettingWithCopyWarning)
    df = df[df['N_a']<=10].copy()
    df['LogTC'] = np.log(df['TC']+1)
    # Country code = file name without extension. os.path.basename handles the
    # platform's path separators; splitting on '\\' only works on Windows.
    country = os.path.basename(file).split('.')[0]
    country_dir = f"{regression_result_dir}/IC/{country}"
    os.makedirs(country_dir, exist_ok=True)
    print(country,end='\r')  # progress indicator, overwritten in place
    # recode the flag columns as 1/0 dummies: 'Y' -> 1, anything else -> 0
    for iv in flag_cols:
        df[iv] = (df[iv] == 'Y').astype('int64')
    models = [fit_model(data,cate,IVs,f"{country_dir}/{cate}.txt") for cate,data in df.groupby('Subject')]
    models = pd.DataFrame(models, columns=cols)
    # the context manager saves and closes the workbook even if to_excel raises
    with pd.ExcelWriter(f'{summary_dir}/IC_{country}.xlsx') as excel:
        models.to_excel(excel,index=False)

ZA

#### Merge tables

In [38]:
# glob returns matches in arbitrary order; sort them so the country columns of
# the merged tables are built in the same order on every run
files = sorted(glob.glob(f"{regression_result_dir}/countries_summary/IC_*.xlsx"))

In [39]:
# show the per-country summary workbooks matched by the glob
files

['D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_BR.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_CA.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_CN.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_DE.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_FR.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_IN.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_IT.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_JP.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_RU.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_UK.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_US.xlsx',
 'D:/Data/MultipleAffiliations/result//LM_log//countries_summary\\IC_ZA.xlsx']

In [40]:
# columns to merge across countries: four affiliation dummies, then the controls
IVs = [
    'DomesticNM_mark', 'DomesticIM_mark',
    'ForeignNM_mark', 'ForeignIM_mark',
    'N_refs', 'N_ins', 'N_c', 'N_a',
]

In [41]:
# preview the first summary workbook in the list
df = pd.read_excel(files[0])
df.head(2)

Unnamed: 0,Subject,Intercept,DomesticNM_mark,DomesticIM_mark,ForeignNM_mark,ForeignIM_mark,N_refs,N_ins,N_c,N_a,R-Squared,AIC,BIC
0,AGR,0.07*,0.20***,0.22***,0.22***,0.06,0.02***,-0.07***,0.19***,0.03***,0.28,17133.811059,17196.934512
1,BIO,0.51***,0.05,0.13**,0.08,0.01,0.01***,-0.07***,0.25***,0.04***,0.19,8312.868148,8368.29737


In [42]:
from collections import defaultdict

# frames[iv] collects, for every summary workbook in turn, a one-column frame
# holding that IV's coefficients indexed by subject
frames = defaultdict(list)

for file in files:
    df = pd.read_excel(file)
    by_subject = df.set_index('Subject')
    for iv in IVs:
        frames[iv].append(by_subject[[iv]])

In [43]:
# country code = text between the last '_' and the first following '.' of each path
countries = [file.rsplit('_', 1)[-1].split('.', 1)[0] for file in files]

In [44]:
# Row (subject) and column (country) order of the exported tables; these do
# not depend on the IV, so they are defined once outside the loop.
idx = ['SPA','NEU','PSY','IMM','CLI','PHA','PHY','MOL','BIO','MIC','PLA','ENV','GEO','CHE','AGR','MATE','COM','ENG','MATH']
cols = ['CA','DE','FR','UK','IT','JP','US','BR','CN','IN','RU','ZA']

for iv in IVs:
    # one column per country, in the order the summary files were read
    df = pd.concat(frames[iv], axis=1)
    df.columns = countries
    # the context manager saves and closes the workbook even if to_excel raises
    with pd.ExcelWriter(f"{regression_result_dir}/IC/IC_{iv}.xlsx") as excel:
        df.loc[idx,cols].to_excel(excel)