Based on the ranking of words, pick the top ten and recommend a journal for each word. 

In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from util import clean_journal_ref

In [2]:
word_rankings = pd.read_csv("word_rankings.csv")
top_10_words = word_rankings['title'][:10]
top_10_words

0      model
1    quantum
2     system
3     theori
4      field
5     effect
6      gener
7      dynam
8        use
9    network
Name: title, dtype: object

In [3]:
cluster = LocalCluster() 
cluster.scale(8) 

# Sets the number of workers 
cluster.adapt(minimum=1, maximum=8) 

# Allows the cluster to auto scale to 10 when tasks are computed 
client = Client(cluster)

defined_dtypes = {'id': str, 'title': str, 'comments':  str, 'journal-ref':  str, 'journal-ref_cleaned':  str,
                  'categories':  str, 'categories_original':str, "journal-ref_original":str,
                  'feild':str, 'category':str, 'sub_category':str}

df = dd.read_csv('without_covid_cleaned.csv', blocksize="256 MiB", dtype=defined_dtypes)
df.head(5)



Unnamed: 0,id,title,comments,journal-ref_original,categories_original,journal-ref_cleaned,first_category,category,sub_category,feild
0,704.0001,calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","phys.rev.d76:013009,2007",hep-ph,physics.review,hep-ph,hep-ph,,physics
1,704.0002,sparsity-certifying graph decompositions,to appear in graphs and combinatorics,,math.co cs.cg,,math.co,math,co,math
2,704.0003,the evolution of the earth-moon system based o...,"23 pages, 3 figures",,physics.gen-ph,,physics.gen-ph,physics,gen-ph,physics
3,704.0004,a determinant of stirling cycle numbers counts...,11 pages,,math.co,,math.co,math,co,math
4,704.0006,bosonic characters of atomic cooper pairs acro...,"6 pages, 4 figures, accepted by pra",,cond-mat.mes-hall,,cond-mat.mes-hall,cond-mat,mes-hall,physics


In [5]:
journal_totals = df['journal-ref_cleaned'].value_counts().compute()
journal_totals_dict = journal_totals.to_dict()
print(journal_totals_dict)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [6]:
import nltk
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def lemmatize_sentence(sentence):
    # initial lemmatiser
    stemmer = PorterStemmer()
    # Tokenize the sentence into individual words and apply lemmatiser
    lemmatized_words = [stemmer.stem(word) for word in nltk.word_tokenize(sentence)]
    # return words reconnected as a sentence
    return ' '.join(lemmatized_words)

# First we have to apply the same lemmatisation to the titles as we did to the key words
titles_lemmatised = df['title'].apply(lemmatize_sentence, meta=('title', str)).compute()

In [11]:
def reccommend_journals(keyword):    
    #select only rows where title contains keyword
    keyword_df = df[titles_lemmatised.str.contains(keyword)]

    #group by journal-ref
    journal_frequency = keyword_df.groupby('journal-ref_cleaned').size()

    return journal_frequency

In [12]:
writer = pd.ExcelWriter('journal_reccomendations.xlsx')

for keyword in top_10_words:
    # apply reccommend_journals to return the no. articles containing the keyword for each journal-ref
    top_journals = reccommend_journals(keyword).compute()

    # Create an empty dataframe to populate
    top_journals_df = pd.DataFrame({'top_journals':top_journals.index, 'count':top_journals.values})

    # create a total column for the number of articles containing the keyword
    top_journals_df['total']= top_journals_df['top_journals'].map(journal_totals_dict)
    
    # create a percentage column that divides total by the total no. articles in that journal-ref
    top_journals_df['perc']= (top_journals_df['count']/top_journals_df['total'])*100

    # only return journals with more than 100 articles.
    valid_journals = top_journals_df[top_journals_df['total']>=100]
    
    # save the resulting datafram to a sheet with the keyword as the sheetname
    valid_journals.to_excel(writer, sheet_name=keyword, index=False)
    
    print(keyword, 'done :)')

  meta = self._meta[_extract_meta(key)]


model done :)


  meta = self._meta[_extract_meta(key)]


quantum done :)


  meta = self._meta[_extract_meta(key)]


system done :)


  meta = self._meta[_extract_meta(key)]


theori done :)


  meta = self._meta[_extract_meta(key)]


field done :)


  meta = self._meta[_extract_meta(key)]


effect done :)


  meta = self._meta[_extract_meta(key)]


gener done :)


  meta = self._meta[_extract_meta(key)]


dynam done :)


  meta = self._meta[_extract_meta(key)]


use done :)


  meta = self._meta[_extract_meta(key)]


network done :)


In [None]:
#client.close()

In [13]:
writer.close()