In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import os
import numpy as np

In [2]:
df = pd.read_csv('../data/raw_data/all.csv')
# Because journal rank and country rank data are only available between 2000 and 2017
# df= df[(df['PY'] >=2000) &(df['PY'] <=2017)]    haven't decided ###############
# fill missing key words with blank
df['DE'] = df['DE'].fillna('')
# concate title and key word
df['Topic'] = df['TI'] + df['DE']

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# tokenize and filter stop words and lemmatize
stop = set(stopwords.words('english'))
wordnet = WordNetLemmatizer()
wordlist = set(words.words())
def nlp_preprocess(doc):
    tokenized = word_tokenize(doc.lower())
    tokenized = [word for word in tokenized if word not in stop]
    docs_lemma = ' '.join([wordnet.lemmatize(word) for word in tokenized])
    return docs_lemma

df['Topic_2'] = df['Topic'].apply(nlp_preprocess)

In [6]:
# extract 15 topics
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=1000,
                                stop_words='english',
                                max_df = 0.5,
                                min_df = 10)
tf = tf_vectorizer.fit_transform(df.Topic_2)

n_topics = 15
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=15, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [7]:
# print 20 top words for each topic
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
n_top_words = 20
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
surface protein silk modification fibroin adsorption film chemical plasma polymer chemistry printing silica low biomaterials interaction functionalization mesh doped biocompatible
Topic #1:
polymer poly biodegradable membrane glycol ethylene biomaterials polymerization phase based ray process functional shape polyethylene grafting memory injectable orthopedic kinetics
Topic #2:
alloy titanium coating corrosion ti fiber magnesium biomaterials porous hydroxyapatite microstructure deposition behavior surface treatment ion situ plasma infection resistance
Topic #3:
hydrogel delivery drug synthesis chitosan release based characterization controlled polymer fabrication poly acid gelatin responsive cross nanoparticles biomaterials preparation network
Topic #4:
vitro biocompatibility model evaluation biological vivo study biomaterials layer metal activity antimicrobial interface cytotoxicity silver coated bio modeling biomaterial testing
Topic #5:
cell stem human adhesion mesenchymal

In [8]:
# For each documents choose the most relevant topic
doc_topic = lda.transform(tf)
_ = []
for n in range(doc_topic.shape[0]):
    topic_most_pr = doc_topic[n].argmax()
    _.append(topic_most_pr)
df['Cluster_Topic'] = _

In [9]:
topic_num = df.groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

#Because we are predicting topic number next year
topic_num.PY = topic_num.PY.astype(int)
topic_num['PY'] = topic_num['PY'] +1
topic_num.PY = topic_num.PY.astype(str)
topic_num.Cluster_Topic = topic_num.Cluster_Topic.astype(str)
# create key for join format: topic_year
topic_num['Topic_Year'] = topic_num[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)

topic_num = topic_num[['Topic_Year','Topic_num']]



In [44]:
#calculate avg citation percent of the year per topic
citation = df.groupby(['PY','Cluster_Topic'])['Z9'].sum().unstack().unstack().reset_index().rename(columns = {0:'Sum_Citation'})
citation.PY = citation.PY.astype(int)
citation.PY = citation.PY.astype(str)
citation.Cluster_Topic = citation.Cluster_Topic.astype(str)
citation['Topic_Year'] = citation[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)
# citation = citation[['Topic_Year','Sum_Citation']]
map_dict = citation.groupby(['PY'])['Sum_Citation'].sum().reset_index().set_index('PY').to_dict()['Sum_Citation']
citation['Year_Sum'] = citation['PY'].map(map_dict)

result = pd.merge(topic_num,citation)

result['Citation_feature'] = (result['Sum_Citation']/result['Year_Sum'])/result['Topic_num']

In [46]:
result = result[['Topic_Year','Citation_feature','Topic_num']]

In [47]:
#For each topic in each year, calculate the ratio of number of paper
#published this year to the number of paper published in latest five years
tmplist = []
for year in range(1996,2019):
    tmp = df[df['PY'].between(year-4,year)].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmp2 = df[df['PY'] == year].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmpdf = tmp2['Topic_num']/tmp.groupby(['Cluster_Topic'])['Topic_num'].sum().tolist()

    year = year
    tmpyear = [str(i) + '_' + str(year) for i in range(15)]

    tmplist.append(pd.DataFrame({'Topic_Year':tmpyear,'Five_Year_Percent':tmpdf}))


In [48]:
five_year_percent = pd.concat(tmplist,ignore_index=True)

result = pd.merge(result,five_year_percent)

In [49]:
#For each topic in each year, calculate the ratio of number of paper
#published this year to the number of paper published in latest five years
tmplist = []
for year in range(1996,2019):
    tmp = df[df['PY'].between(year-2,year)].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmp2 = df[df['PY'] == year].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmpdf = tmp2['Topic_num']/tmp.groupby(['Cluster_Topic'])['Topic_num'].sum().tolist()

    year = year
    tmpyear = [str(i) + '_' + str(year) for i in range(15)]

    tmplist.append(pd.DataFrame({'Topic_Year':tmpyear,'Three_Year_Percent':tmpdf}))


In [50]:
three_year_percent = pd.concat(tmplist,ignore_index=True)

result = pd.merge(result,three_year_percent)

In [51]:
sjr_dir = '../data/sjrdata'
files = os.listdir(sjr_dir)

def lower_dict(d):
    data_SJR_dict = d.set_index('Title')['SJR'].to_dict()
    new_dict = dict((k.lower(), v) for k, v in data_SJR_dict.items())
    return new_dict

for file in files:
    year = file.split(' ')[1].split('.')[0]

    tmpdf = pd.read_csv(os.path.join(sjr_dir,file), delimiter=';',usecols = ['Title','SJR'])

    df.loc[df['PY'] == int(year),'SJR'] = df['SO'].str.lower().map(lower_dict(tmpdf))

df['SJR'] = df['SJR'].fillna(0)
df['SJR'] = df['SJR'].apply(lambda x: float(str(x).replace(',','.')))

  interactivity=interactivity, compiler=compiler, result=result)


In [52]:
#Sum SJR
sjr = df.groupby(['PY','Cluster_Topic'])['SJR'].sum().unstack().unstack().reset_index().rename(columns = {0:'Sum_SJR'})
sjr.PY = sjr.PY.astype(int)
sjr.PY = sjr.PY.astype(str)
sjr.Cluster_Topic = sjr.Cluster_Topic.astype(str)
# create key for join format: topic_year
sjr['Topic_Year'] = sjr[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)
sjr = sjr[['Topic_Year','Sum_SJR']]

In [53]:
result = pd.merge(result,sjr)

In [54]:
#Average SJR
sjr = df.groupby(['PY','Cluster_Topic'])['SJR'].mean().unstack().unstack().reset_index().rename(columns = {0:'Avg_SJR'})
sjr.PY = sjr.PY.astype(int)
sjr.PY = sjr.PY.astype(str)
sjr.Cluster_Topic = sjr.Cluster_Topic.astype(str)
# create key for join format: topic_year
sjr['Topic_Year'] = sjr[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)
sjr = sjr[['Topic_Year','Avg_SJR']]
#merge
result = pd.merge(result,sjr)

In [56]:
result['Year'] = result['Topic_Year'].apply(lambda x:x.split('_')[1])
result['Topic'] = result['Topic_Year'].apply(lambda x:x.split('_')[0])
# calculate certain topic's growth rate by year
result['Growth_Rate'] = result['Topic_num'].pct_change()
# calculate certain topic's citation_growth_rate by year
result['Citation_Growth_Rate'] = result['Citation_feature'].pct_change()
#set first column to nan
result.loc[result['Year'] == '2002','Growth_Rate'] = np.NaN
#set first column to nan
result.loc[result['Year'] == '2002','Citation_Growth_Rate'] = np.NaN

In [57]:
# calculate growth rate by year
map_dict = result.groupby(['Year'])['Topic_num'].sum().pct_change().reset_index().set_index('Year').to_dict()

map_dict= map_dict['Topic_num']

result['Year_Growth_Rate'] = result['Year'].map(map_dict)

result.loc[result['Year'] == '2002','Year_Growth_Rate'] = np.NaN

In [58]:
# if topic's growth rate is larger than the growth rate of that year then set target to 1 else 0
result['Target'] = result['Growth_Rate']  > result['Year_Growth_Rate']

result['Target'] =result.Target.apply(lambda x: int(x==True))

In [59]:
#shift feature by year
def shift_col_by_year(col,year,df):
    new_col = '_'.join([col,str(year)])
    df[new_col] = df[col].shift(year)
    for i in range(2002,2002+year):
        df.loc[df['Year'] == str(i),new_col] = np.NaN

In [60]:
shift_col_by_year('Avg_SJR',1,result)
shift_col_by_year('Avg_SJR',2,result)
shift_col_by_year('Avg_SJR',3,result)
shift_col_by_year('Sum_SJR',1,result)
shift_col_by_year('Sum_SJR',2,result)
shift_col_by_year('Sum_SJR',3,result)

In [62]:
result.to_csv('../output/result.csv',index=False)

In [61]:
result.head()

Unnamed: 0,Topic_Year,Citation_feature,Topic_num,Five_Year_Percent,Three_Year_Percent,Sum_SJR,Avg_SJR,Year,Topic,Growth_Rate,Citation_Growth_Rate,Year_Growth_Rate,Target,Avg_SJR_1,Avg_SJR_2,Avg_SJR_3,Sum_SJR_1,Sum_SJR_2,Sum_SJR_3
0,0_1996,0.006168,25.0,0.338583,0.462366,0.0,0.0,1996,0,,,,0,,,,,,
1,0_1997,0.002142,43.0,0.234483,0.333333,0.0,0.0,1997,0,0.72,-0.6527,0.180272,1,0.0,,,0.0,,
2,0_1998,0.003411,34.0,0.22561,0.324561,0.0,0.0,1998,0,-0.209302,0.592603,0.083573,0,0.0,0.0,,0.0,0.0,
3,0_1999,0.00336,37.0,0.223464,0.36036,37.14,0.9285,1999,0,0.088235,-0.015133,0.191489,0,0.0,0.0,0.0,0.0,0.0,0.0
4,0_2000,0.001865,40.0,0.206186,0.34188,45.009,1.125225,2000,0,0.081081,-0.445003,0.100446,0,0.9285,0.0,0.0,37.14,0.0,0.0
