In [103]:
import csv
import tqdm
import copy
import nltk
nltk.download('stopwords')
import collections
from operator import itemgetter

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/grzegorz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
import csv


def read_articles():
    articles = []
    with open('DBLP-citation-Jan8.txt') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='#')
        for row in csv_reader:
            if len(row) >= 2:
                data = row[1]
                if len(data)<2:
                    continue
                elif data[:1] == '*':
                    article = {}
                    article['quoted'] = []
                    article['title'] = data[1:]
                elif data[:1] == '@':
                    article['authors'] = data[1:]
                elif data[:1] == 't':
                    article['date'] = data[1:]
                elif data[:1] == 'c':
                    article['source'] = data[1:]
                elif data[:5] == 'index':
                    article['index'] = data[5:]
                    articles.append(article)
                elif data[:1] == '%':
                    if 'quoted' in article:
                        article['quoted'].append(data[1:])
                    else:
                        article['quoted'].append(data[1:])
                elif data[:1] == '!':
                    article['abstract'] = data[1:]
                else:
                    continue
    return articles

In [89]:
articles = read_articles()

In [90]:
def filter_articles_without_abstract(articles):
    return [article for article in articles if 'abstract' in article and 'authors' in article]

In [91]:
from nltk.corpus import stopwords
def tokenize_and_remove_step_words_from_abstract(articles):
    tokenizer = nltk.tokenize.SpaceTokenizer()
    stop_words_set = set(stopwords.words('english'))
    articles_copy = copy.deepcopy(articles)
    for article in tqdm.tqdm(articles_copy):
        try:
            article['abstract'] = tokenizer.tokenize(article['abstract'])
            article['abstract'] = [word for word in article['abstract'] if word not in stop_words_set]
        except AttributeError:
            pass
    return articles_copy

In [92]:
articles_with_abstract = filter_articles_without_abstract(articles)

In [93]:
articles_tokenized = tokenize_and_remove_step_words_from_abstract(articles_with_abstract)

100%|██████████| 529247/529247 [00:11<00:00, 46957.77it/s]


In [94]:
articles_tokenized[:10]

[{'quoted': ['165'],
  'title': 'Spatial Data Structures.',
  'authors': 'Hanan Samet',
  'date': '1995',
  'source': 'Modern Database Systems',
  'index': '25',
  'abstract': ['An',
   'overview',
   'presented',
   'use',
   'spatial',
   'data',
   'structures',
   'spatial',
   'databases.',
   'The',
   'focus',
   'hierarchical',
   'data',
   'structures,',
   'including',
   'number',
   'variants',
   'quadtrees,',
   'sort',
   'data',
   'respect',
   'space',
   'occupied',
   'it.',
   'Such',
   'techniques',
   'known',
   'spatial',
   'indexing',
   'methods.',
   'Hierarchical',
   'data',
   'structures',
   'based',
   'principle',
   'recursive',
   'decomposition.',
   'They',
   'attractive',
   'compact',
   'depending',
   'nature',
   'data',
   'save',
   'space',
   'well',
   'time',
   'also',
   'facilitate',
   'operations',
   'search.',
   'Examples',
   'given',
   'use',
   'data',
   'structures',
   'representation',
   'different',
   'data',
   '

In [95]:
def filter_least_frequent_authors(articles, top=1000):
    authors_frequency = collections.defaultdict(int)
    for article in tqdm.tqdm(articles):
        authors = article['authors'].split(",")
        for author in authors:
            authors_frequency[author]+=1
    top_authors = list(sorted(authors_frequency.items(), key=itemgetter(1),reverse=True))
    top_authors = top_authors[:top]
    top_authors_without_occurences = [el[0] for el in top_authors]
    return top_authors_without_occurences

top_authors = filter_least_frequent_authors(articles_tokenized)

100%|██████████| 529247/529247 [00:01<00:00, 497984.31it/s]


In [96]:
def filter_articles_containing_only_top_authors(articles, top_authors):
    
    def all_authors_in_top(article, top_authors):
        for author in article['authors'].split(","):
            if author not in top_authors:
                return False
        return True
    
    
    return [article for article in articles if all_authors_in_top(article, top_authors)]

In [97]:
filtered_articles = filter_articles_containing_only_top_authors(articles_tokenized, top_authors)

In [98]:
print (len(articles_tokenized))
print (len(filtered_articles))

529247
5199


In [99]:
def calculate_frequency_of_words(articles, frequency_cap = 50):
    frequency = collections.defaultdict(int)
    for article in articles:
        for word in article['abstract']:
            frequency[word] += 1
            
    top_words = list(sorted(frequency.items(), key=itemgetter(1),reverse=True))
    top_words = [word[0] for word in top_words if word[1] > frequency_cap]
    return top_words

def filter_unfrequent_words_from_abstract(articles, top_words):
    articles_copy = copy.deepcopy(articles)
    for article in articles_copy:
        article['abstract'] = [word for word in article['abstract'] if word in top_words]
    return articles_copy
        
top_words = calculate_frequency_of_words(filtered_articles)
filtered_articles_with_frequent_words = filter_unfrequent_words_from_abstract(filtered_articles, top_words)

In [100]:
filtered_articles_with_frequent_words

[{'quoted': ['165'],
  'title': 'Spatial Data Structures.',
  'authors': 'Hanan Samet',
  'date': '1995',
  'source': 'Modern Database Systems',
  'index': '25',
  'abstract': ['An',
   'overview',
   'presented',
   'use',
   'spatial',
   'data',
   'structures',
   'spatial',
   'databases.',
   'The',
   'focus',
   'hierarchical',
   'data',
   'including',
   'number',
   'data',
   'respect',
   'space',
   'it.',
   'Such',
   'techniques',
   'known',
   'spatial',
   'indexing',
   'methods.',
   'data',
   'structures',
   'based',
   'recursive',
   'They',
   'nature',
   'data',
   'space',
   'well',
   'time',
   'also',
   'facilitate',
   'operations',
   'given',
   'use',
   'data',
   'structures',
   'representation',
   'different',
   'data',
   'types']},
 {'quoted': [],
  'title': 'ACTA: The SAGA Continues',
  'authors': 'Panos K. Chrysanthis,Krithi Ramamritham',
  'date': '1992',
  'source': 'Database Transaction Models for Advanced Applications',
  'index': 

In [102]:
def update_quoted_by(articles):
    all_indexes = set()
    for article in articles:
        all_indexes.add(article['index'])
    articles_copy = copy.deepcopy(articles)
    for article in articles_copy:
        updated_quoted = [quoted for quoted in article['quoted'] if quoted in all_indexes]
        article['quoted'] = updated_quoted
    return articles_copy
        
    
update_quoted_by(filtered_articles_with_frequent_words)
    

[{'quoted': [],
  'title': 'Spatial Data Structures.',
  'authors': 'Hanan Samet',
  'date': '1995',
  'source': 'Modern Database Systems',
  'index': '25',
  'abstract': ['An',
   'overview',
   'presented',
   'use',
   'spatial',
   'data',
   'structures',
   'spatial',
   'databases.',
   'The',
   'focus',
   'hierarchical',
   'data',
   'including',
   'number',
   'data',
   'respect',
   'space',
   'it.',
   'Such',
   'techniques',
   'known',
   'spatial',
   'indexing',
   'methods.',
   'data',
   'structures',
   'based',
   'recursive',
   'They',
   'nature',
   'data',
   'space',
   'well',
   'time',
   'also',
   'facilitate',
   'operations',
   'given',
   'use',
   'data',
   'structures',
   'representation',
   'different',
   'data',
   'types']},
 {'quoted': [],
  'title': 'ACTA: The SAGA Continues',
  'authors': 'Panos K. Chrysanthis,Krithi Ramamritham',
  'date': '1992',
  'source': 'Database Transaction Models for Advanced Applications',
  'index': '2194