# Generate topics using abstracts and keywords extracted from papers


Once keywords are extracted, this is an example of how to extract targeted topics (from these keywords) and associate the different abstracts with their closest topics. In this example, I focus only on 3-gram keywords.


In [1]:
import os 
import json
from collections import Counter


# Root directory holding the files this notebook reads.
NEURIPS_ANALYSIS_DATA_PATH = '/var/data/neurips_analysis'

# Mapping loaded from JSON; presumably {year: {3-gram keyword: count}} -- TODO confirm.
keywords_n3_file = os.path.join(NEURIPS_ANALYSIS_DATA_PATH, 'year_keywords_counter_n3.json')
with open(keywords_n3_file, 'r') as keywords_file:
    keywords_n3 = json.load(keywords_file)

total_n3_keywords = sum(len(year_keywords) for year_keywords in keywords_n3.values())
print(f'keywords loaded -- 3-grams: {total_n3_keywords}')


# Each top-level entry contributes one count per keyword it contains, so the
# tally is the number of entries (years) in which a keyword appears.
all_n3_keywords = Counter()
for year_keywords in keywords_n3.values():
    all_n3_keywords.update(year_keywords.keys())

# Keep only the keywords that appear in more than one entry.
selected_n3_keywords = [keyword for keyword, n_entries in all_n3_keywords.items() if n_entries > 1]
len(selected_n3_keywords)



keywords loaded -- 3-grams: 120341


2116

In [2]:
from collections import defaultdict

from tqdm.notebook import tqdm 

from papeles.paper.neurips import get_key
from papeles.utils import text as text_utils
from papeles.utils.topics import Topics

In [3]:
# Directory of metadata files; every line of every file is parsed as one JSON record.
metadata_path = os.path.join(NEURIPS_ANALYSIS_DATA_PATH, 'files_metadata/')

# Index each record by the key that `get_key` derives from its 'pdf_name' field.
metadata = {}
for metadata_filename in tqdm(os.listdir(metadata_path), desc='loading metadata'):
    with open(os.path.join(metadata_path, metadata_filename), 'r') as metadata_file:  # read-only
        for raw_record in metadata_file:
            record = json.loads(raw_record)
            metadata[get_key(record['pdf_name'])] = record


HBox(children=(FloatProgress(value=0.0, description='loading metadata', max=6083.0, style=ProgressStyle(descri…




## Topics extraction and topics per year


Topics are generated using the whole corpus (all abstracts from all years). Then, all topics are counted per year to produce a final analysis of which topics are the most common each year.

In [4]:
# Build the Topics object from every abstract in `metadata` together with the
# selected 3-gram keywords, then display the resulting topics.
topics = Topics(
    [paper['abstract'] for paper in metadata.values()],
    selected_n3_keywords,
)
topics.topics

{'topic_0': ['upper_lower_bounds',
  'generalization_error_bound',
  'restricted_strong_convexity',
  'optimizes_objective_function',
  'satisfies_restricted_strong',
  'function_satisfies_restricted',
  'orthogonal_matching_pursuit',
  'problem_learning_control',
  'superposition-structured_dirty_statistical',
  'contextual_bandits_learner'],
 'topic_1': ['machine_learning_models',
  'online_learning_algorithm',
  'stochastic_convex_optimization',
  'learning_conditional_random',
  'optimal_convergence_rates',
  'standing_pursuit_machine',
  'pursuit_machine_learning',
  'long_standing_pursuit',
  'optimization_algorithms_popular',
  'learning_structured_predictors'],
 'topic_2': ['high_dimensional_datasets',
  'low_dimensional_structures',
  'paper_concerns_problem',
  'problem_learning_control',
  'superposition-structured_dirty_statistical',
  'contextual_bandits_learner',
  'concerns_problem_learning',
  'low-rank_tensor_decomposition',
  'dynamic_assortment_selection',
  'posteri

In [5]:
# For each year, count how often each topic appears among a paper's top predictions.
topics_per_year = {}
for paper in metadata.values():
    year = paper['year']
    # Register the year even when this paper ends up contributing no topic.
    year_counter = topics_per_year.setdefault(year, Counter())

    topic_prediction = topics.predict_topics(paper['abstract'])
    if sum(topic_prediction.values()) > 0:
        # Rank by score (highest first), drop non-positive scores, keep at most five.
        ranked_predictions = sorted(
            topic_prediction.items(),
            key=lambda item: item[1],
            reverse=True,
        )
        top_prediction = [topic for topic, score in ranked_predictions if score > 0][:5]
        year_counter.update(top_prediction)

topics_per_year

{2019: Counter({'topic_40': 24,
          'topic_47': 4,
          'topic_11': 13,
          'topic_31': 4,
          'topic_42': 8,
          'topic_9': 8,
          'topic_17': 13,
          'topic_25': 19,
          'topic_18': 5,
          'topic_0': 13,
          'topic_55': 98,
          'topic_13': 102,
          'topic_43': 114,
          'topic_49': 85,
          'topic_26': 46,
          'topic_28': 2,
          'topic_27': 10,
          'topic_5': 7,
          'topic_52': 30,
          'topic_1': 29,
          'topic_6': 44,
          'topic_30': 63,
          'topic_36': 6,
          'topic_14': 14,
          'topic_32': 18,
          'topic_3': 22,
          'topic_12': 26,
          'topic_37': 28,
          'topic_46': 25,
          'topic_44': 37,
          'topic_34': 39,
          'topic_29': 5,
          'topic_38': 5,
          'topic_51': 45,
          'topic_19': 6,
          'topic_7': 5,
          'topic_4': 5,
          'topic_22': 7,
          'topic_33': 7,
 

In [6]:
# Print the five most frequent topics for each year from 2009 to 2019.
for year in range(2009, 2020):
    print(f'===============\n{year}')
    # Use a dedicated name: rebinding `topics` here would replace the Topics
    # object built earlier with a Counter, so re-running the per-year counting
    # cell would fail on `topics.predict_topics`.
    # `.get` keeps the loop going (printing an empty list) for a year that has
    # no papers in the metadata instead of raising KeyError.
    year_topic_counts = topics_per_year.get(year, Counter())
    print(sorted(year_topic_counts.items(), key=lambda x: x[1], reverse=True)[:5])
    

2009
[('topic_16', 9), ('topic_53', 6), ('topic_25', 5), ('topic_1', 5), ('topic_12', 5)]
2010
[('topic_30', 8), ('topic_46', 6), ('topic_16', 6), ('topic_42', 6), ('topic_9', 5)]
2011
[('topic_30', 9), ('topic_16', 7), ('topic_0', 6), ('topic_6', 6), ('topic_44', 5)]
2012
[('topic_30', 10), ('topic_19', 9), ('topic_29', 7), ('topic_45', 7), ('topic_0', 7)]
2013
[('topic_30', 22), ('topic_12', 14), ('topic_6', 13), ('topic_32', 11), ('topic_37', 11)]
2014
[('topic_51', 18), ('topic_43', 15), ('topic_55', 15), ('topic_32', 13), ('topic_13', 13)]
2015
[('topic_43', 24), ('topic_13', 23), ('topic_55', 19), ('topic_30', 18), ('topic_51', 15)]
2016
[('topic_43', 43), ('topic_13', 39), ('topic_49', 29), ('topic_30', 26), ('topic_51', 21)]
2017
[('topic_13', 48), ('topic_43', 47), ('topic_55', 43), ('topic_49', 38), ('topic_30', 33)]
2018
[('topic_43', 88), ('topic_13', 81), ('topic_49', 68), ('topic_55', 66), ('topic_30', 43)]
2019
[('topic_43', 114), ('topic_13', 102), ('topic_55', 98), ('t