In [36]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [179]:
df = pd.read_csv('../data/raw_data/all.csv')
# Because journal rank and country rank data are only available between 2000 and 2017
# df= df[(df['PY'] >=2000) &(df['PY'] <=2017)]    haven't decided ###############
# fill missing key words with blank
df['DE'] = df['DE'].fillna('')
# concate title and key word
df['Topic'] = df['TI'] + df['DE']

  interactivity=interactivity, compiler=compiler, result=result)


In [180]:
# tokenize and filter stop words and lemmatize
stop = set(stopwords.words('english'))
wordnet = WordNetLemmatizer()
wordlist = set(words.words())
def nlp_preprocess(doc):
    tokenized = word_tokenize(doc.lower())
    tokenized = [word for word in tokenized if word not in stop]
    docs_lemma = ' '.join([wordnet.lemmatize(word) for word in tokenized])
    return docs_lemma

df['Topic_2'] = df['Topic'].apply(nlp_preprocess)

In [181]:
# extract 15 topics
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=1000,
                                stop_words='english',
                                max_df = 0.5,
                                min_df = 10)
tf = tf_vectorizer.fit_transform(df.Topic_2)

n_topics = 15
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=15, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [182]:
# print 20 top words for each topic
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
n_top_words = 20
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
surface poly porous modification adhesion adsorption protein plasma treatment polymer chemical modified glycol ethylene bacterial coating caprolactone titanium epsilon functionalization
Topic #1:
model fiber polymerization induced printing using polyester biomaterials steel stainless small animal bioprinting disease nanostructures nanomaterials fluorescence long simulation migration
Topic #2:
hydroxyapatite phosphate bone calcium composite glass bioactive ceramic alginate formation nerve natural bioactivity cement ph crystal tricalcium apatite sintering fracture
Topic #3:
film laser microscopy activity spectroscopy chemistry deposition body fibroblast force morphology electron cancer biomaterials polysaccharide alcohol vinyl optical ray enzyme
Topic #4:
property mechanical alloy corrosion coating biomaterials behavior titanium ti polyurethane composite effect antibacterial magnesium thermal based microstructure biomedical material nanocomposites
Topic #5:
chitosan vitro osteo

In [183]:
# For each documents choose the most relevant topic
doc_topic = lda.transform(tf)
_ = []
for n in range(doc_topic.shape[0]):
    topic_most_pr = doc_topic[n].argmax()
    _.append(topic_most_pr)
df['Cluster_Topic'] = _

In [221]:
topic_num = df.groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})
citation = df.groupby(['PY','Cluster_Topic'])['Z9'].sum().unstack().unstack().reset_index().rename(columns = {0:'Sum_Citation'})
#Because we are predicting topic number next year
topic_num.PY = topic_num.PY.astype(int)
topic_num['PY'] = topic_num['PY'] +1
topic_num.PY = topic_num.PY.astype(str)
topic_num.Cluster_Topic = topic_num.Cluster_Topic.astype(str)
citation.PY = citation.PY.astype(int)
citation.PY = citation.PY.astype(str)
citation.Cluster_Topic = citation.Cluster_Topic.astype(str)
# create key for join format: topic_year
topic_num['Topic_Year'] = topic_num[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)
citation['Topic_Year'] = citation[['Cluster_Topic','PY']].apply(lambda x: '_'.join(x),axis =1)
topic_num = topic_num[['Topic_Year','Topic_num']]
citation = citation[['Topic_Year','Sum_Citation']]

In [222]:
result = pd.merge(topic_num,citation)

In [241]:
#For each topic in each year, calculate the ratio of number of paper
#published this year to the number of paper published in latest five years
tmplist = []
for year in range(2001,2019):
    tmp = df[df['PY'].between(year-4,year)].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmp.groupby(['Cluster_Topic'])['Topic_num'].sum()

    tmp2 = df[df['PY'] == year].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmpdf = tmp2['Topic_num']/tmp.groupby(['Cluster_Topic'])['Topic_num'].sum().tolist()

    year = year
    tmpyear = [str(i) + '_' + str(year) for i in range(15)]

    tmplist.append(pd.DataFrame({'Topic_Year':tmpyear,'Five_Year_Percent':tmpdf}))


In [None]:
five_year_percent = pd.concat(tmplist,ignore_index=True)

result = pd.merge(result,five_year_percent)

In [250]:
#For each topic in each year, calculate the ratio of number of paper
#published this year to the number of paper published in latest five years
tmplist = []
for year in range(2001,2019):
    tmp = df[df['PY'].between(year-2,year)].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmp.groupby(['Cluster_Topic'])['Topic_num'].sum()

    tmp2 = df[df['PY'] == year].groupby(['PY'])['Cluster_Topic'].value_counts().unstack().unstack().reset_index().rename(columns = {0:'Topic_num'})

    tmpdf = tmp2['Topic_num']/tmp.groupby(['Cluster_Topic'])['Topic_num'].sum().tolist()

    year = year
    tmpyear = [str(i) + '_' + str(year) for i in range(15)]

    tmplist.append(pd.DataFrame({'Topic_Year':tmpyear,'Three_Year_Percent':tmpdf}))


In [251]:
three_year_percent = pd.concat(tmplist,ignore_index=True)

result = pd.merge(result,three_year_percent)

In [252]:
result.head()

Unnamed: 0,Topic_Year,Topic_num,Sum_Citation,Five_Year_Percent,Three_Year_Percent
0,0_2002,72.0,7219.0,0.341549,0.573964
1,0_2003,97.0,6510.0,0.332353,0.400709
2,0_2004,113.0,6138.0,0.296758,0.361702
3,0_2005,119.0,7152.0,0.251866,0.367847
4,0_2006,135.0,4255.0,0.222781,0.343669


In [253]:
result.to_csv('result.csv',index=False)