# Imports

In [1]:
import gzip
import joblib
import re
import numpy as np
import csv        
import os
# Change directory to the root of the folder (this script was launched from the subfolder python_scripts)
# All utils presuppose that we are working from the root directory of the github folder
os.chdir("../")
from tqdm.notebook import tqdm
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from datetime import datetime
import jsonlines
import pandas as pd
import cld3
from iso639 import languages

# Collecting data from Corpus

Download corpus from
https://api.semanticscholar.org/corpus/download/

Release downloaded: 2022-01-01 release

To download the corpus, before continuing with this notebook, from the root folder, run the following commands:

```
mkdir -p data
mkdir -p data/semanticscholar
mkdir -p data/semanticscholar/2022-01-01
mkdir -p data/semanticscholar/2022-01-01/corpus
cd data/semanticscholar/2022-01-01/corpus/
wget https://s3-us-west-2.amazonaws.com/ai2-s2-research-public/open-corpus/2022-01-01/manifest.txt
wget -B https://s3-us-west-2.amazonaws.com/ai2-s2-research-public/open-corpus/2022-01-01/ -i manifest.txt
```

In [2]:
# Release of the Semantic Scholar corpus this notebook works on.
corpus_version = '2022-01-01'
# Both the input corpus and the produced data live under the same versioned folder.
release_folder = os.path.join('./data/semanticscholar/', corpus_version)
# where you have the corpus
corpus_folder = os.path.join(release_folder, 'corpus')
# where you save the data
data_folder = os.path.join(release_folder, 'data')
os.makedirs(data_folder, exist_ok=True)

In [3]:
def get_numbers_list(string):
    '''
        Takes a string and returns the list of the non-negative integers it contains,
        in order of appearance.

        Every maximal run of ASCII digits (0-9) is converted with int(); any other
        character only acts as a separator. Returns an empty list when the string
        contains no digit.

        Example: '12(3), pp. 45-67' -> [12, 3, 45, 67]
    '''
    # [0-9] rather than \d: in Python 3, \d would also match non-ASCII Unicode digits,
    # which the original character-by-character scan did not treat as digits.
    return [int(digits) for digits in re.findall(r'[0-9]+', string)]

def get_year_volume_issue_firstPage(paper):
    '''
        Takes a paper record (a dictionary) and returns the tuple
        (year, volume, issue, firstPage).

        - year is the value stored under paper['year'], returned as is, or -1 when
          the key is missing.
        - volume and issue are the first and second numbers found in
          paper['journalVolume'], or -1 when there is no such number.
        - firstPage is the first number found in paper['journalPages'], or -1 when
          there is none.

        Raises KeyError if 'journalVolume' or 'journalPages' is missing.
    '''
    # Only a missing key is expected here, so catch exactly that instead of everything.
    try:
        year = paper['year']
    except KeyError:
        year = -1

    volume_numbers = get_numbers_list(paper['journalVolume'])
    volume = volume_numbers[0] if len(volume_numbers) > 0 else -1
    issue = volume_numbers[1] if len(volume_numbers) > 1 else -1

    page_numbers = get_numbers_list(paper['journalPages'])
    firstPage = page_numbers[0] if len(page_numbers) > 0 else -1

    return (year, volume, issue, firstPage)

In [4]:
# Helpers to prune text from punctuation and junk.
# A token is a run of word characters containing at least one ASCII letter,
# optionally introduced by '#'.
pattern = re.compile(r'\B#\w*[A-Za-z]+\w*|\b\w*[A-Za-z]+\w*', re.UNICODE)
wnl = WordNetLemmatizer()
def lemmatize(doc):
    '''
        Takes a string doc, extracts its tokens with `pattern`, lemmatizes each of them
        and returns the list of lower-cased lemmas that are longer than one character,
        in order of appearance.
    '''
    kept_words = []
    for token in pattern.findall(doc):
        lemma = wnl.lemmatize(token)
        # the length filter applies to the lemma, not to the raw token
        if len(lemma) > 1:
            kept_words.append(lemma.lower())
    return kept_words

In [None]:
# Set to False to reuse the artefacts already dumped in data_folder instead of
# re-reading the whole corpus. The default (True) always recollects, which is what
# the previous unconditional `raise KeyError` at the top of the try block did.
force_recollect = True
# The corpus files are named s2-corpus-<ID>.gz, with ID in range(num_corpus_files).
num_corpus_files = 6000


def _collected_path(name):
    '''
        Returns the path, inside data_folder, of the gzip-compressed dump called name.
    '''
    return os.path.join(data_folder, name + '.pkl.gz')


def _load_collected(name):
    '''
        Loads and returns the object dumped under name in data_folder.
        NOTE: joblib.load unpickles its input, so only load dumps you produced yourself.
    '''
    with gzip.open(_collected_path(name), 'rb') as fp:
        return joblib.load(fp)


def _dump_collected(name, obj):
    '''
        Dumps obj under name in data_folder, overwriting any previous dump.
    '''
    with gzip.open(_collected_path(name), 'wb') as fp:
        joblib.dump(obj, fp)


collected_data_loaded = False
if not force_recollect:
    try:
        print('Trying to load collected data...')
        all_fieldsOfStudy_singles_set = _load_collected('all_fieldsOfStudy_singles_set')
        all_fieldsOfStudy_couples_set = _load_collected('all_fieldsOfStudy_couples_set')
        all_fieldsOfStudy_tuples_set = _load_collected('all_fieldsOfStudy_tuples_set')
        count_by_fieldsOfStudy_tuple_all_journalName_dict = _load_collected('count_by_fieldsOfStudy_tuple_all_journalName_dict')
        count_by_fieldsOfStudy_couple_all_journalName_dict = _load_collected('count_by_fieldsOfStudy_couple_all_journalName_dict')
        count_by_fieldsOfStudy_single_all_journalName_dict = _load_collected('count_by_fieldsOfStudy_single_all_journalName_dict')
        count_all_journalName_dict = _load_collected('count_all_journalName_dict')
        sample_all_journalName_dict = _load_collected('sample_all_journalName_dict')
        all_journalName_set = _load_collected('all_journalName_set')
        all_titles_dict = _load_collected('all_titles_dict')
        all_journal_dict = _load_collected('all_journal_dict')
        all_fieldsOfStudy_dict = _load_collected('all_fieldsOfStudy_dict')
        all_year_volume_issue_firstPage_dict = _load_collected('all_year_volume_issue_firstPage_dict')
        all_paperId_by_journal_by_first_fieldsOfStudy = _load_collected('all_paperId_by_journal_by_first_fieldsOfStudy')
        all_paperId_by_author = _load_collected('all_paperId_by_author')
        count_papers_by_author = _load_collected('count_papers_by_author')
        count_papers_by_author_by_first_fieldsOfStudy = _load_collected('count_papers_by_author_by_first_fieldsOfStudy')
        print('Loaded all data')
        collected_data_loaded = True
    except Exception as load_error:
        # A missing or unreadable dump is not fatal: fall back to recollecting everything.
        print(f'Could not load collected data ({load_error!r})')

if not collected_data_loaded:
    print('Collecting data...')
    start = datetime.now()
    sample_all_journalName_dict = {}                         # (journalName, venue) -> first paper record seen
    all_titles_dict = {}                                     # paper_id -> lower-cased title
    all_fieldsOfStudy_dict = {}                              # paper_id -> tuple of fields of study
    all_journal_dict = {}                                    # paper_id -> (journalName, venue)
    all_year_volume_issue_firstPage_dict = {}                # paper_id -> (year, volume, issue, firstPage)
    all_paperId_by_journal_by_first_fieldsOfStudy = {}       # first field -> (journalName, venue) -> set of paper_id
    all_paperId_by_author = {}                               # author_id -> set of paper_id
    count_papers_by_author = {}                              # author_id -> number of papers
    count_papers_by_author_by_first_fieldsOfStudy = {}       # first field -> author_id -> number of papers
    count_all_journalName_dict = {}                          # (journalName, venue) -> number of papers
    count_by_fieldsOfStudy_tuple_all_journalName_dict = {}   # full tuple of fields -> (journalName, venue) -> count
    count_by_fieldsOfStudy_couple_all_journalName_dict = {}  # first two fields -> (journalName, venue) -> count
    count_by_fieldsOfStudy_single_all_journalName_dict = {}  # first field -> (journalName, venue) -> count
    all_fieldsOfStudy_tuples_set = set()
    all_fieldsOfStudy_couples_set = set()
    all_fieldsOfStudy_singles_set = set()
    all_journalName_set = set()
    # Full name of the only language kept; it does not depend on the paper, so look it up once.
    english_name = languages.get(alpha2='en').name
    for ID in tqdm(range(num_corpus_files)):
        filename = os.path.join(corpus_folder, 's2-corpus-%.3d.gz' % ID)
        with gzip.open(filename, 'rb') as f:
            for paper in jsonlines.Reader(f):
                # each line of the file is a dictionary with the paper's info
                paper_id = paper['id']
                title = paper['title'].lower()
                # ONLY CONSIDER THE ENGLISH ONES
                # Guard against the detector returning no prediction at all
                # (e.g. for an empty title) before reading its language.
                language_prediction = cld3.get_language(title)
                if language_prediction is None or language_prediction.language != 'en':
                    continue
                # Kept bound at module level, as before, because later cells may read it.
                language = english_name
                all_titles_dict[paper_id] = title
                journalName = paper['journalName'].lower()
                venue = paper['venue'].lower()
                fieldsOfStudy = tuple(paper['fieldsOfStudy'])
                has_fieldsOfStudy = len(fieldsOfStudy) > 0
                if has_fieldsOfStudy:
                    first_fieldsOfStudy = fieldsOfStudy[0]

                for author_dict in paper['authors']:
                    # authors without a usable id are skipped
                    try:
                        author_id = author_dict['ids'][0]
                    except (KeyError, IndexError, TypeError):
                        continue
                    all_paperId_by_author.setdefault(author_id, set()).add(paper_id)
                    count_papers_by_author[author_id] = count_papers_by_author.get(author_id, 0) + 1
                    if has_fieldsOfStudy:
                        count_by_author = count_papers_by_author_by_first_fieldsOfStudy.setdefault(first_fieldsOfStudy, {})
                        count_by_author[author_id] = count_by_author.get(author_id, 0) + 1

                all_year_volume_issue_firstPage_dict[paper_id] = get_year_volume_issue_firstPage(paper)

                if not has_fieldsOfStudy:
                    continue
                all_fieldsOfStudy_dict[paper_id] = fieldsOfStudy

                # Journal statistics only make sense when the paper names a journal or a venue.
                if len(venue) == 0 and len(journalName) == 0:
                    continue
                journal_key = (journalName, venue)
                if journal_key not in sample_all_journalName_dict:
                    sample_all_journalName_dict[journal_key] = paper.copy()
                    count_all_journalName_dict[journal_key] = 1
                    all_journalName_set.add(journal_key)
                else:
                    count_all_journalName_dict[journal_key] += 1
                all_journal_dict[paper_id] = journal_key

                # TUPLE: all the fields of study of the paper
                all_fieldsOfStudy_tuples_set.add(fieldsOfStudy)
                count_by_journal = count_by_fieldsOfStudy_tuple_all_journalName_dict.setdefault(fieldsOfStudy, {})
                count_by_journal[journal_key] = count_by_journal.get(journal_key, 0) + 1

                # COUPLE: the first two fields of study (only one if the paper has a single field)
                fieldsOfStudy_couple = fieldsOfStudy[:2]
                all_fieldsOfStudy_couples_set.add(fieldsOfStudy_couple)
                count_by_journal = count_by_fieldsOfStudy_couple_all_journalName_dict.setdefault(fieldsOfStudy_couple, {})
                count_by_journal[journal_key] = count_by_journal.get(journal_key, 0) + 1

                # SINGLE: the first field of study only
                all_fieldsOfStudy_singles_set.add(first_fieldsOfStudy)
                count_by_journal = count_by_fieldsOfStudy_single_all_journalName_dict.setdefault(first_fieldsOfStudy, {})
                count_by_journal[journal_key] = count_by_journal.get(journal_key, 0) + 1
                paperId_by_journal = all_paperId_by_journal_by_first_fieldsOfStudy.setdefault(first_fieldsOfStudy, {})
                paperId_by_journal.setdefault(journal_key, set()).add(paper_id)

        if (ID+1) % 1000 == 0:
            end = datetime.now()
            print(f'Read {ID+1} files after {end-start}.',flush=True)

    # Dump everything, one file per object, named after the variable.
    collected_data = [
        ('all_fieldsOfStudy_singles_set', all_fieldsOfStudy_singles_set),
        ('all_fieldsOfStudy_couples_set', all_fieldsOfStudy_couples_set),
        ('all_fieldsOfStudy_tuples_set', all_fieldsOfStudy_tuples_set),
        ('count_by_fieldsOfStudy_tuple_all_journalName_dict', count_by_fieldsOfStudy_tuple_all_journalName_dict),
        ('count_by_fieldsOfStudy_couple_all_journalName_dict', count_by_fieldsOfStudy_couple_all_journalName_dict),
        ('count_by_fieldsOfStudy_single_all_journalName_dict', count_by_fieldsOfStudy_single_all_journalName_dict),
        ('count_all_journalName_dict', count_all_journalName_dict),
        ('sample_all_journalName_dict', sample_all_journalName_dict),
        ('all_journalName_set', all_journalName_set),
        ('all_titles_dict', all_titles_dict),
        ('all_journal_dict', all_journal_dict),
        ('all_fieldsOfStudy_dict', all_fieldsOfStudy_dict),
        ('all_year_volume_issue_firstPage_dict', all_year_volume_issue_firstPage_dict),
        ('all_paperId_by_journal_by_first_fieldsOfStudy', all_paperId_by_journal_by_first_fieldsOfStudy),
        ('all_paperId_by_author', all_paperId_by_author),
        ('count_papers_by_author', count_papers_by_author),
        ('count_papers_by_author_by_first_fieldsOfStudy', count_papers_by_author_by_first_fieldsOfStudy),
    ]
    for collected_name, collected_object in collected_data:
        _dump_collected(collected_name, collected_object)
    print('Dumped all collected data')

Remap each word in the dataset into increasing integer indexes

In [6]:
# Read the paper_id -> title dictionary back from its dump in data_folder.
titles_path = os.path.join(data_folder, 'all_titles_dict.pkl.gz')
with gzip.open(titles_path, 'rb') as fp:
    all_titles_dict = joblib.load(fp)
    

In [7]:
# Set to False to reuse the mappings already dumped in data_folder. The default (True)
# always recomputes them, which is what the previous unconditional `raise KeyError` did.
force_recompute_mappings = True

mappings_loaded = False
if not force_recompute_mappings:
    try:
        print('Trying to load word2index and word2stem mapping...')
        with gzip.open(os.path.join(data_folder, 'word2index.pkl.gz'), 'rb') as fp:
            word2index = joblib.load(fp)
        with gzip.open(os.path.join(data_folder, 'word2stem.pkl.gz'), 'rb') as fp:
            word2stem = joblib.load(fp)
        # stem2index is needed by the export cells too, so it must be loaded as well.
        with gzip.open(os.path.join(data_folder, 'stem2index.pkl.gz'), 'rb') as fp:
            stem2index = joblib.load(fp)
        print('Loaded word2index and word2stem mapping')
        mappings_loaded = True
    except Exception as load_error:
        # A missing or unreadable dump is not fatal: fall back to recomputing.
        print(f'Could not load the mappings ({load_error!r})')

if not mappings_loaded:
    print('Computing word2index and word2stem mapping...')
    word2index = {}   # word -> index, in order of first appearance
    max_index = 0
    word2stem = {}    # word -> its stem
    stem2index = {}   # stem -> index, in order of first appearance
    max_stem_index = 0
    # Only English titles were kept when collecting the data, so a single English
    # stemmer is enough. It is built once here, instead of once per title from a
    # `language` variable left over by the collection loop.
    snow_stemmer = SnowballStemmer(language='english')
    for title in tqdm(all_titles_dict.values()):
        for word in lemmatize(title):
            if word in word2index:
                continue
            word2index[word] = max_index
            max_index += 1
            stem = snow_stemmer.stem(word)
            word2stem[word] = stem
            if stem not in stem2index:
                stem2index[stem] = max_stem_index
                max_stem_index += 1
    # Dump
    with gzip.open(os.path.join(data_folder, 'word2index.pkl.gz'), 'wb') as fp:
        joblib.dump(word2index,fp)
    with gzip.open(os.path.join(data_folder, 'word2stem.pkl.gz'), 'wb') as fp:
        joblib.dump(word2stem,fp)
    with gzip.open(os.path.join(data_folder, 'stem2index.pkl.gz'), 'wb') as fp:
        joblib.dump(stem2index,fp)
    print('Dumped mappings')

Computing word2index and word2stem mapping...


  0%|          | 0/130499129 [00:00<?, ?it/s]

Dumped mappings


# Find most important journals

In [None]:
# For every tuple of fields of study, sort the journals by decreasing number of papers.
all_fieldsOfStudy_tuples_df = {}
for fieldsOfStudy in all_fieldsOfStudy_tuples_set:
    journal_counts = count_by_fieldsOfStudy_tuple_all_journalName_dict[fieldsOfStudy]
    tmp_df = pd.DataFrame.from_dict(journal_counts, orient="index", columns=[fieldsOfStudy])
    not_missing = pd.notna(tmp_df)
    ranked_counts = tmp_df[not_missing][fieldsOfStudy].sort_values(ascending=False)
    all_fieldsOfStudy_tuples_df[fieldsOfStudy] = ranked_counts

In [None]:
# Preview the ranking of one tuple: `fieldsOfStudy` still holds the last value
# taken in the previous loop over all_fieldsOfStudy_tuples_set.
all_fieldsOfStudy_tuples_df[fieldsOfStudy].head()

In [None]:
# For every couple of fields of study, sort the journals by decreasing number of papers.
all_fieldsOfStudy_couples_df = {}
for fieldsOfStudy in all_fieldsOfStudy_couples_set:
    journal_counts = count_by_fieldsOfStudy_couple_all_journalName_dict[fieldsOfStudy]
    tmp_df = pd.DataFrame.from_dict(journal_counts, orient="index", columns=[fieldsOfStudy])
    not_missing = pd.notna(tmp_df)
    ranked_counts = tmp_df[not_missing][fieldsOfStudy].sort_values(ascending=False)
    all_fieldsOfStudy_couples_df[fieldsOfStudy] = ranked_counts

In [None]:
# Preview the ranking of one couple: `fieldsOfStudy` still holds the last value
# taken in the previous loop over all_fieldsOfStudy_couples_set.
all_fieldsOfStudy_couples_df[fieldsOfStudy].head()

In [None]:
# For every single (first) field of study, sort the journals by decreasing number of papers.
all_fieldsOfStudy_singles_df = {}
for fieldsOfStudy in all_fieldsOfStudy_singles_set:
    journal_counts = count_by_fieldsOfStudy_single_all_journalName_dict[fieldsOfStudy]
    tmp_df = pd.DataFrame.from_dict(journal_counts, orient="index", columns=[fieldsOfStudy])
    not_missing = pd.notna(tmp_df)
    ranked_counts = tmp_df[not_missing][fieldsOfStudy].sort_values(ascending=False)
    all_fieldsOfStudy_singles_df[fieldsOfStudy] = ranked_counts

In [None]:
# Preview the ranking of one field: `fieldsOfStudy` still holds the last value
# taken in the previous loop over all_fieldsOfStudy_singles_set.
all_fieldsOfStudy_singles_df[fieldsOfStudy].head()

In [None]:
# Stack the per-field journal counts (one column per field, fields in alphabetical
# order) and rank the resulting rows by their count.
per_field_frames = []
for fieldsOfStudy in sorted(all_fieldsOfStudy_singles_set):
    tmp_df = pd.DataFrame.from_dict(count_by_fieldsOfStudy_single_all_journalName_dict[fieldsOfStudy],orient="index",columns=[fieldsOfStudy])
    per_field_frames.append(tmp_df)
tmp_df2 = per_field_frames[0].copy()
# DataFrame.append is deprecated (and removed in pandas 2.0); a single pd.concat
# gives the same frame without copying the accumulated rows at every iteration.
df = pd.concat(per_field_frames, sort=True)
# Every row comes from exactly one field, so the row sum is the count of that journal
# in that field: a journal present in several fields appears on several rows.
all_fields_df = df.sum(axis=1,numeric_only=True).sort_values(ascending=False)
all_fields_df.head(50)

In [None]:
# Rank every (journalName, venue) pair by its overall number of papers.
tmp_df = pd.DataFrame.from_dict(count_all_journalName_dict, orient="index", columns=["All fields"])
valid_counts = tmp_df[pd.notna(tmp_df)]
all_fields_df = valid_counts["All fields"].sort_values(ascending=False)

In [None]:
# Preview the journals with the most papers overall.
all_fields_df.head()

In [None]:
# Print the sample paper stored for each of the first ten journals of the ranking.
sample_fields = (
    ('ID:\t', 'id'),
    ('TITLE:\t', 'title'),
    ('YEAR:\t', 'year'),
    ('journalVolume:\t', 'journalVolume'),
    ('journalPages:\t', 'journalPages'),
)
for (journalName, venue) in all_fields_df.head(10).index:
    sample_paper = sample_all_journalName_dict[(journalName, venue)]
    print(f'---------- {journalName} ----------')
    print('\n')
    for label, key in sample_fields:
        print(label, sample_paper[key])
    print('\n')

# Preparing list of titles

In [None]:
# Sorted list of the (first) fields of study, one per line in all_fieldsOfStudy.tsv.
# The file is reused when it can be read, otherwise it is (re)written from
# all_fieldsOfStudy_singles_set.
all_fieldsOfStudy_path = os.path.join(data_folder,'all_fieldsOfStudy.tsv')
try:
    with open(all_fieldsOfStudy_path, 'r', newline='\n') as fp:
        # every line is read as a row with a single field
        all_fieldsOfStudy = [row[0] for row in csv.reader(fp, delimiter='\n')]
except (OSError, IndexError, csv.Error):
    # OSError: the file is missing or unreadable; IndexError: it contains an empty row;
    # csv.Error: it is malformed. Any other error is a real problem and is not hidden.
    all_fieldsOfStudy = sorted(all_fieldsOfStudy_singles_set)
    with open(all_fieldsOfStudy_path, 'w', newline='\n') as fp:
        # a single row delimited by newlines, i.e. one field of study per line
        tsv_output = csv.writer(fp, delimiter='\n')
        tsv_output.writerow(all_fieldsOfStudy)
print(all_fieldsOfStudy)

## By field of study and journal

For each field of study, take the first 1000 journals (`num_journals`), ranked by the number of papers in that field and journal.

Then, for each field and journal, create a list of tokenized words from the time-ordered list of titles, and save each list, where each word is remapped into an index.

In [71]:
# os.makedirs(os.path.join(data_folder,'fieldsOfStudy_original'), exist_ok = True)

In [73]:
# For each field of study, export the time-ordered title words of its top journals:
# one file per journal, named after the rank of the journal in that field, in three
# versions (original words, word indexes, stem indexes).
journals_fieldsOfStudy_name_dict = {}   # field of study -> ranked list of (journalName, venue)
num_journals = 1000
for fieldOfStudy in tqdm(all_fieldsOfStudy):
    print('Starting', fieldOfStudy)
    journals = list(all_fieldsOfStudy_singles_df[fieldOfStudy].index[:num_journals])
    fieldsOfStudy_original_folder = os.path.join(data_folder,'journals_fieldsOfStudy_original',fieldOfStudy)
    os.makedirs(fieldsOfStudy_original_folder, exist_ok = True)
    fieldsOfStudy_folder = os.path.join(data_folder,'journals_fieldsOfStudy',fieldOfStudy)
    os.makedirs(fieldsOfStudy_folder, exist_ok = True)
    fieldsOfStudy_stems_folder = os.path.join(data_folder,'journals_fieldsOfStudy_stems',fieldOfStudy)
    os.makedirs(fieldsOfStudy_stems_folder, exist_ok = True)
    paperId_by_journal = all_paperId_by_journal_by_first_fieldsOfStudy[fieldOfStudy]
    for ordered_journal_index, (journal,venue) in enumerate(tqdm(journals)):
        # find all papers of the journal in the considered fieldOfStudy;
        # papers whose year is None are left out
        list_to_sort = []
        for paper_id in paperId_by_journal[(journal,venue)]:
            info = all_year_volume_issue_firstPage_dict[paper_id]
            if info[0] is not None:
                list_to_sort.append(info + tuple([paper_id]))
        # sort it using year,volume,issue,firstPage,paper_id
        sorted_list = sorted(list_to_sort)
        # create list of titles
        ordered_list_of_words = []
        for tmp_tuple in sorted_list:
            paper_id = tmp_tuple[-1]
            ordered_list_of_words += lemmatize(all_titles_dict[paper_id])
        indexed_ordered_list_of_words = [word2index[key] for key in ordered_list_of_words]
        indexed_ordered_list_of_stems = [stem2index[word2stem[key]] for key in ordered_list_of_words]
        # same file name and format in the three folders: a single row delimited by
        # newlines, i.e. one value per line
        outputs = (
            (fieldsOfStudy_original_folder, ordered_list_of_words),
            (fieldsOfStudy_folder, indexed_ordered_list_of_words),
            (fieldsOfStudy_stems_folder, indexed_ordered_list_of_stems),
        )
        for output_folder, values in outputs:
            with open(os.path.join(output_folder,f'{ordered_journal_index}.tsv'), 'w', newline='\n') as fp:
                tsv_output = csv.writer(fp, delimiter='\n')
                tsv_output.writerow(values)
    journals_fieldsOfStudy_name_dict[fieldOfStudy] = journals.copy()
    # rewritten after every field, so that the names of the completed fields are
    # already on disk if the loop is interrupted
    with open(os.path.join(data_folder,'journals_fieldsOfStudy_name.pkl'), 'wb') as fp:
        joblib.dump(journals_fieldsOfStudy_name_dict, fp)

  0%|          | 0/19 [00:00<?, ?it/s]

Starting Art


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Biology


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Business


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Chemistry


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Computer Science


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Economics


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Engineering


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Environmental Science


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Geography


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Geology


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting History


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Materials Science


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Mathematics


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Medicine


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Philosophy


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Physics


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Political Science


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Psychology


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting Sociology


  0%|          | 0/1000 [00:00<?, ?it/s]

## By authors

Take the 10000 first authors based on their number of papers. 

Then, for each of them, create a list of tokenized words from the time-ordered list of titles, and save each list, where each word is remapped into an index.

In [74]:
# Rank the authors by decreasing total number of papers.
count_papers_by_author_df = pd.DataFrame.from_dict(count_papers_by_author, orient="index", columns=['no_papers'])
counted_authors = count_papers_by_author_df[pd.notna(count_papers_by_author_df)]
ordered_count_papers_by_author_df = counted_authors['no_papers'].sort_values(ascending=False)

In [75]:
# Keep the authors with the most papers and prepare the output folders of their word lists.
num_authors = 10000
authors = list(ordered_count_papers_by_author_df.index[:num_authors])
authors_original_folder = os.path.join(data_folder, 'authors_original')
authors_folder = os.path.join(data_folder, 'authors')
authors_stems_folder = os.path.join(data_folder, 'authors_stems')
for author_output_folder in (authors_original_folder, authors_folder, authors_stems_folder):
    os.makedirs(author_output_folder, exist_ok=True)
# The position of an author in this list is the name of the files exported for that author.
authors_name_path = os.path.join(data_folder, 'authors_name.pkl')
with open(authors_name_path, 'wb') as fp:
    joblib.dump(authors, fp)

In [77]:
# Export the time-ordered title words of every selected author: one file per author,
# named after the rank of the author, in three versions (original words, word
# indexes, stem indexes).
# tqdm wraps the list, not the enumerate object, so that the bar knows the total.
for ordered_author_index,author in enumerate(tqdm(authors)):
    # find all papers of the author; papers whose year is None are left out
    list_to_sort = []
    for paper_id in all_paperId_by_author[author]:
        info = all_year_volume_issue_firstPage_dict[paper_id]
        if info[0] is not None:
            list_to_sort.append(info + tuple([paper_id]))
    # sort it using year,volume,issue,firstPage,paper_id
    sorted_list = sorted(list_to_sort)
    # create list of titles
    ordered_list_of_words = []
    for tmp_tuple in sorted_list:
        paper_id = tmp_tuple[-1]
        ordered_list_of_words += lemmatize(all_titles_dict[paper_id])
    indexed_ordered_list_of_words = [word2index[key] for key in ordered_list_of_words]
    indexed_ordered_list_of_stems = [stem2index[word2stem[key]] for key in ordered_list_of_words]
    # same file name and format in the three folders: a single row delimited by
    # newlines, i.e. one value per line
    outputs = (
        (authors_original_folder, ordered_list_of_words),
        (authors_folder, indexed_ordered_list_of_words),
        (authors_stems_folder, indexed_ordered_list_of_stems),
    )
    for output_folder, values in outputs:
        with open(os.path.join(output_folder,f'{ordered_author_index}.tsv'), 'w', newline='\n') as fp:
            tsv_output = csv.writer(fp, delimiter='\n')
            tsv_output.writerow(values)

0it [00:00, ?it/s]

## By field of study and author

For each field of study, take the first 1000 authors, ranked by their number of papers in that field.

Then, for each of these authors, create a list of tokenized words from the time-ordered list of titles they have written (considering all papers, including other fields), and save each list, where each word is remapped into an index.

In [80]:
# For each field of study, export the time-ordered title words of its top authors:
# one file per author, named after the rank of the author in that field, in three
# versions (original words, word indexes, stem indexes).
authors_fieldsOfStudy_name_dict = {}   # field of study -> ranked list of author ids
num_authors = 1000
for fieldOfStudy in tqdm(all_fieldsOfStudy):
    print('Starting', fieldOfStudy)

    # Rank the authors by their number of papers IN THIS FIELD. The mask must be applied
    # to the per-field frame: applying it to the global count_papers_by_author_df ranked
    # the authors by their total number of papers and kept every other author as NaN.
    tmp_count_papers_by_author_df = pd.DataFrame.from_dict(count_papers_by_author_by_first_fieldsOfStudy[fieldOfStudy],orient="index",columns=['no_papers'])
    ordered_tmp_count_papers_by_author_df = tmp_count_papers_by_author_df[pd.notna(tmp_count_papers_by_author_df)].no_papers.sort_values(ascending=False)

    authors = list(ordered_tmp_count_papers_by_author_df.index[:num_authors])
    authors_original_folder = os.path.join(data_folder,'authors_fieldsOfStudy_original',fieldOfStudy)
    os.makedirs(authors_original_folder, exist_ok = True)
    authors_folder = os.path.join(data_folder,'authors_fieldsOfStudy',fieldOfStudy)
    os.makedirs(authors_folder, exist_ok = True)
    authors_stems_folder = os.path.join(data_folder,'authors_fieldsOfStudy_stems',fieldOfStudy)
    os.makedirs(authors_stems_folder, exist_ok = True)
    # tqdm wraps the list, not the enumerate object, so that the bar knows the total.
    for ordered_author_index,author in enumerate(tqdm(authors)):
        # find all papers of the author (in any field); papers whose year is None are left out
        list_to_sort = []
        for paper_id in all_paperId_by_author[author]:
            info = all_year_volume_issue_firstPage_dict[paper_id]
            if info[0] is not None:
                list_to_sort.append(info + tuple([paper_id]))
        # sort it using year,volume,issue,firstPage,paper_id
        sorted_list = sorted(list_to_sort)
        # create list of titles
        ordered_list_of_words = []
        for tmp_tuple in sorted_list:
            paper_id = tmp_tuple[-1]
            ordered_list_of_words += lemmatize(all_titles_dict[paper_id])
        indexed_ordered_list_of_words = [word2index[key] for key in ordered_list_of_words]
        indexed_ordered_list_of_stems = [stem2index[word2stem[key]] for key in ordered_list_of_words]
        # same file name and format in the three folders: a single row delimited by
        # newlines, i.e. one value per line
        outputs = (
            (authors_original_folder, ordered_list_of_words),
            (authors_folder, indexed_ordered_list_of_words),
            (authors_stems_folder, indexed_ordered_list_of_stems),
        )
        for output_folder, values in outputs:
            with open(os.path.join(output_folder,f'{ordered_author_index}.tsv'), 'w', newline='\n') as fp:
                tsv_output = csv.writer(fp, delimiter='\n')
                tsv_output.writerow(values)
    authors_fieldsOfStudy_name_dict[fieldOfStudy] = authors.copy()
    # rewritten after every field, so that the names of the completed fields are
    # already on disk if the loop is interrupted
    with open(os.path.join(data_folder,'authors_fieldsOfStudy_name.pkl'), 'wb') as fp:
        joblib.dump(authors_fieldsOfStudy_name_dict, fp)

  0%|          | 0/19 [00:00<?, ?it/s]

Starting Art


0it [00:00, ?it/s]

Starting Biology


0it [00:00, ?it/s]

Starting Business


0it [00:00, ?it/s]

Starting Chemistry


0it [00:00, ?it/s]

Starting Computer Science


0it [00:00, ?it/s]

Starting Economics


0it [00:00, ?it/s]

Starting Engineering


0it [00:00, ?it/s]

Starting Environmental Science


0it [00:00, ?it/s]

Starting Geography


0it [00:00, ?it/s]

Starting Geology


0it [00:00, ?it/s]

Starting History


0it [00:00, ?it/s]

Starting Materials Science


0it [00:00, ?it/s]

Starting Mathematics


0it [00:00, ?it/s]

Starting Medicine


0it [00:00, ?it/s]

Starting Philosophy


0it [00:00, ?it/s]

Starting Physics


0it [00:00, ?it/s]

Starting Political Science


0it [00:00, ?it/s]

Starting Psychology


0it [00:00, ?it/s]

Starting Sociology


0it [00:00, ?it/s]