In [61]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import os
import numpy as np
from pytrends.request import TrendReq
from collections import Counter

In [2]:
df = pd.read_csv('../data/raw_data/all.csv')
# Because journal rank and country rank data are only available between 2000 and 2017
# df= df[(df['PY'] >=2000) &(df['PY'] <=2017)]    haven't decided ###############
# fill missing key words with blank
df['DE'] = df['DE'].fillna('')
# concate title and key word
df['Topic'] = df['TI'] + df['DE']

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# tokenize and filter stop words and lemmatize
stop = set(stopwords.words('english'))
wordnet = WordNetLemmatizer()
wordlist = set(words.words())
def nlp_preprocess(doc):
    tokenized = word_tokenize(doc.lower())
    tokenized = [word for word in tokenized if word not in stop]
    docs_lemma = ' '.join([wordnet.lemmatize(word) for word in tokenized])
    return docs_lemma

df['Topic_2'] = df['Topic'].apply(nlp_preprocess)

In [4]:
# extract 15 topics
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=1000,
                                stop_words='english',
                                max_df = 0.5,
                                min_df = 10)
tf = tf_vectorizer.fit_transform(df.Topic_2)

n_topics = 15
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=15, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [5]:
# print 20 top words for each topic
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
n_top_words = 20
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
surface protein silk modification fibroin adsorption film chemical plasma polymer chemistry printing silica low biomaterials interaction functionalization mesh doped biocompatible
Topic #1:
polymer poly biodegradable membrane glycol ethylene biomaterials polymerization phase based ray process functional shape polyethylene grafting memory injectable orthopedic kinetics
Topic #2:
alloy titanium coating corrosion ti fiber magnesium biomaterials porous hydroxyapatite microstructure deposition behavior surface treatment ion situ plasma infection resistance
Topic #3:
hydrogel delivery drug synthesis chitosan release based characterization controlled polymer fabrication poly acid gelatin responsive cross nanoparticles biomaterials preparation network
Topic #4:
vitro biocompatibility model evaluation biological vivo study biomaterials layer metal activity antimicrobial interface cytotoxicity silver coated bio modeling biomaterial testing
Topic #5:
cell stem human adhesion mesenchymal

In [6]:
# For each documents choose the most relevant topic
doc_topic = lda.transform(tf)
_ = []
for n in range(doc_topic.shape[0]):
    topic_most_pr = doc_topic[n].argmax()
    _.append(topic_most_pr)
df['Cluster_Topic'] = _

In [7]:
topic_num = df.groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

#Because we are predicting topic number next year
topic_num.PY = topic_num.PY.astype(int)
topic_num['PY'] = topic_num['PY'] +1
topic_num.PY = topic_num.PY.astype(str)
topic_num.Cluster_Topic = topic_num.Cluster_Topic.astype(str)
# create key for join format: topic_year
topic_num['Topic_Year'] = topic_num[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)

topic_num = topic_num[['Topic_Year','Topic_num']]



In [8]:
#calculate avg citation percent of the year per topic
citation = df.groupby(['PY','Cluster_Topic'])['Z9'].sum().unstack().unstack().reset_index().rename(columns = {0:'Sum_Citation'})
citation.PY = citation.PY.astype(int)
citation.PY = citation.PY.astype(str)
citation.Cluster_Topic = citation.Cluster_Topic.astype(str)
citation['Topic_Year'] = citation[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)
# citation = citation[['Topic_Year','Sum_Citation']]
map_dict = citation.groupby(['PY'])['Sum_Citation'].sum().reset_index().set_index('PY').to_dict()['Sum_Citation']
citation['Year_Sum'] = citation['PY'].map(map_dict)

result = pd.merge(topic_num,citation)

result['Citation_feature'] = (result['Sum_Citation']/result['Year_Sum'])/result['Topic_num']

In [9]:
result = result[['Topic_Year','Citation_feature','Topic_num']]

In [10]:
#For each topic in each year, calculate the ratio of number of paper
#published this year to the number of paper published in latest five years
tmplist = []
for year in range(1996,2019):
    tmp = df[df['PY'].between(year-4,year)].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmp2 = df[df['PY'] == year].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmpdf = tmp2['Topic_num']/tmp.groupby(['Cluster_Topic'])['Topic_num'].sum().tolist()

    year = year
    tmpyear = [str(i) + '_' + str(year) for i in range(15)]

    tmplist.append(pd.DataFrame({'Topic_Year':tmpyear,'Five_Year_Percent':tmpdf}))


In [11]:
five_year_percent = pd.concat(tmplist,ignore_index=True)

result = pd.merge(result,five_year_percent)

In [12]:
#For each topic in each year, calculate the ratio of number of paper
#published this year to the number of paper published in latest five years
tmplist = []
for year in range(1996,2019):
    tmp = df[df['PY'].between(year-2,year)].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmp2 = df[df['PY'] == year].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmpdf = tmp2['Topic_num']/tmp.groupby(['Cluster_Topic'])['Topic_num'].sum().tolist()

    year = year
    tmpyear = [str(i) + '_' + str(year) for i in range(15)]

    tmplist.append(pd.DataFrame({'Topic_Year':tmpyear,'Three_Year_Percent':tmpdf}))


In [13]:
three_year_percent = pd.concat(tmplist,ignore_index=True)

result = pd.merge(result,three_year_percent)

In [14]:
sjr_dir = '../data/sjrdata'
files = os.listdir(sjr_dir)

def lower_dict(d):
    data_SJR_dict = d.set_index('Title')['SJR'].to_dict()
    new_dict = dict((k.lower(), v) for k, v in data_SJR_dict.items())
    return new_dict

for file in files:
    year = file.split(' ')[1].split('.')[0]

    tmpdf = pd.read_csv(os.path.join(sjr_dir,file), delimiter=';',usecols = ['Title','SJR'])

    df.loc[df['PY'] == int(year),'SJR'] = df['SO'].str.lower().map(lower_dict(tmpdf))

df['SJR'] = df['SJR'].fillna(0)
df['SJR'] = df['SJR'].apply(lambda x: float(str(x).replace(',','.')))

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
#Sum SJR
sjr = df.groupby(['PY','Cluster_Topic'])['SJR'].sum().unstack().unstack().reset_index().rename(columns = {0:'Sum_SJR'})
sjr.PY = sjr.PY.astype(int)
sjr.PY = sjr.PY.astype(str)
sjr.Cluster_Topic = sjr.Cluster_Topic.astype(str)
# create key for join format: topic_year
sjr['Topic_Year'] = sjr[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)
sjr = sjr[['Topic_Year','Sum_SJR']]

In [16]:
result = pd.merge(result,sjr)

In [17]:
#Average SJR
sjr = df.groupby(['PY','Cluster_Topic'])['SJR'].mean().unstack().unstack().reset_index().rename(columns = {0:'Avg_SJR'})
sjr.PY = sjr.PY.astype(int)
sjr.PY = sjr.PY.astype(str)
sjr.Cluster_Topic = sjr.Cluster_Topic.astype(str)
# create key for join format: topic_year
sjr['Topic_Year'] = sjr[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)
sjr = sjr[['Topic_Year','Avg_SJR']]
#merge
result = pd.merge(result,sjr)

In [18]:
result['Year'] = result['Topic_Year'].apply(lambda x:x.split('_')[1])
result['Topic'] = result['Topic_Year'].apply(lambda x:x.split('_')[0])
# calculate certain topic's growth rate by year
result['Growth_Rate'] = result['Topic_num'].pct_change()
# calculate certain topic's citation_growth_rate by year
result['Citation_Growth_Rate'] = result['Citation_feature'].pct_change()
#set first column to nan
result.loc[result['Year'] == '2002','Growth_Rate'] = np.NaN
#set first column to nan
result.loc[result['Year'] == '2002','Citation_Growth_Rate'] = np.NaN

In [19]:
# calculate growth rate by year
map_dict = result.groupby(['Year'])['Topic_num'].sum().pct_change().reset_index().set_index('Year').to_dict()

map_dict= map_dict['Topic_num']

result['Year_Growth_Rate'] = result['Year'].map(map_dict)

result.loc[result['Year'] == '2002','Year_Growth_Rate'] = np.NaN

In [20]:
# if topic's growth rate is larger than the growth rate of that year then set target to 1 else 0
result['Target'] = result['Growth_Rate']  > result['Year_Growth_Rate']

result['Target'] =result.Target.apply(lambda x: int(x==True))

In [21]:
#shift feature by year
def shift_col_by_year(col,year,df):
    new_col = '_'.join([col,str(year)])
    df[new_col] = df[col].shift(year)
    for i in range(2002,2002+year):
        df.loc[df['Year'] == str(i),new_col] = np.NaN

In [22]:
shift_col_by_year('Avg_SJR',1,result)
shift_col_by_year('Avg_SJR',2,result)
shift_col_by_year('Avg_SJR',3,result)
shift_col_by_year('Sum_SJR',1,result)
shift_col_by_year('Sum_SJR',2,result)
shift_col_by_year('Sum_SJR',3,result)

## Google Trend feature

In [73]:
def calculate_topic_most_k_common(topic_num,k):
    tmp = df[df['Cluster_Topic'] == topic_num]
    cunter = Counter([i.strip().lower() for i in ';'.join(tmp.DE.values).split(';') if i]).most_common(k)
    return [i[0] for i in cunter if i[0] != 'biomaterials' and i[0] != 'biomaterial']

keyword_dict = {}
for topic_num in range(15):
    keyword_dict[topic_num] = calculate_topic_most_k_common(topic_num,10)[:4] #only save 4 words each topic

In [137]:
def get_topic_trend(topic,keyword_dict):
    pytrends = TrendReq(hl='en-US', tz=360)
    #get google trends for 5 words for each topic. 5 words contain 4 top key words and 'biomaterial' as benchmark
    kw_list = keyword_dict[topic] + ['biomaterials'] #############
    pytrends.build_payload(kw_list, timeframe='2004-01-01 2018-12-31')
    # Interest Over Time
    key_trends = pytrends.interest_over_time().reset_index()
    key_trends['Year'] = key_trends['date'].apply(lambda x:x.year)
    key_trends = key_trends[kw_list + ['Year']] #only keep words columns
    key_trends['Topic_trend'] = key_trends[keyword_dict[topic]].sum(axis = 1) #sum up 4 key words's trends 
    tmp = (key_trends.groupby('Year')['Topic_trend'].sum() / key_trends.groupby('Year')['biomaterials'].sum()) \
        .reset_index().rename(columns = {0:'Google Trend','Year':'Topic_Year'})
    tmp['Topic_Year'] = tmp['Topic_Year'].apply(lambda x: '_'.join([str(topic),str(x)])) ####
    return tmp

In [138]:
tmplist = []
for i in range(15):
    tmplist.append(get_topic_trend(i,keyword_dict))
google_trend = pd.concat(tmplist,ignore_index=True)

In [156]:
#Count number of reviews each year per topic
reviews = df[df['DT'] == 'Review']
reviews = reviews.groupby(['PY','Cluster_Topic'])['DT'].count().unstack().unstack().reset_index().rename(columns = {0:'Review_AMT'})
reviews.PY = reviews.PY.astype(int)
reviews.PY = reviews.PY.astype(str)
reviews.Cluster_Topic = reviews.Cluster_Topic.astype(str)
# create key for join format: topic_year
reviews['Topic_Year'] = reviews[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)
reviews = reviews[['Topic_Year','Review_AMT']]
reviews = reviews.fillna(0)

result = pd.merge(result,reviews)

In [159]:
result.to_csv('../output/result.csv',index=False)

In [158]:
result.head()

Unnamed: 0,Topic_Year,Citation_feature,Topic_num,Five_Year_Percent,Three_Year_Percent,Sum_SJR,Avg_SJR,Year,Topic,Growth_Rate,...,Year_Growth_Rate,Target,Avg_SJR_1,Avg_SJR_2,Avg_SJR_3,Sum_SJR_1,Sum_SJR_2,Sum_SJR_3,Google Trend,Review_AMT
0,0_2004,0.001447,69.0,0.245223,0.351598,62.657,0.813727,2004,0,-0.054795,...,0.160839,0,1.020058,1.056329,,70.384,77.112,,0.480263,4.0
1,0_2005,0.001029,77.0,0.259459,0.396694,94.646,0.985896,2005,0,0.115942,...,0.173494,0,0.813727,1.020058,1.056329,62.657,70.384,77.112,0.362663,5.0
2,0_2006,0.000724,96.0,0.202532,0.316206,78.86,0.98575,2006,0,0.246753,...,0.361396,0,0.985896,0.813727,1.020058,94.646,62.657,70.384,0.395802,13.0
3,0_2007,0.001357,80.0,0.329167,0.473054,183.868,1.163722,2007,0,-0.166667,...,-0.136501,0,0.98575,0.985896,0.813727,78.86,94.646,62.657,0.424408,13.0
4,0_2008,0.000527,158.0,0.25,0.365333,163.017,1.189905,2008,0,0.975,...,0.461135,1,1.163722,0.98575,0.985896,183.868,78.86,94.646,0.379377,14.0
