# Generate Collocates and Keywords on Citation Context Corpus

This notebook illustrates how the scite citation-context corpus can be used to generate collocates and keywords for each citation function, along with basic corpus statistics.

This notebook uses a sample citation-context set, but the same process is used on the larger corpus.

In [36]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.collocations import *
import pandas as pd
import csv

In [37]:
# Uncomment and run these once if the NLTK resources are not already downloaded
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('universal_tagset')

In [38]:
# English stopword list from NLTK; keywords() skips any token found in it.
stopword_list = stopwords.words('english')

def clean_tokenize_text(text):
    """Tokenize a citation context and keep only purely alphabetic tokens.

    Parameters
    ----------
    text : str
        Raw citation-context text.

    Returns
    -------
    list of str
        Tokens produced by ``nltk.word_tokenize`` for which ``str.isalpha()``
        is true and which are not citation-extraction remnants
        ('cite', 'sici', 'lt', 'gt'). Case and order are preserved.
    """
    # Markup residue left behind by citation extraction.
    extraction_remnants = {'cite', 'sici', 'lt', 'gt'}
    # Tokenize the argument itself rather than any notebook-level variable,
    # so the function gives the right answer for whatever text it is passed.
    tokenized = nltk.word_tokenize(text)
    return [
        word for word in tokenized
        if word.isalpha() and word not in extraction_remnants
    ]

def keywords(tokens):
    """Count verb and noun frequencies among the non-stopword tokens.

    Tokens are tagged with ``nltk.pos_tag`` using the universal tagset.
    Tokens present in the module-level ``stopword_list`` and tokens whose tag
    is neither 'VERB' nor 'NOUN' are ignored.

    Parameters
    ----------
    tokens : list of str
        Tokens to tag and count.

    Returns
    -------
    dict
        ``{'VERB': {word: count, ...}, 'NOUN': {word: count, ...}}``; words
        appear in order of first occurrence.
    """
    keywords_count = {'VERB': {}, 'NOUN': {}}
    # Build the lookup set once per call: testing membership in the stopword
    # list is a linear scan, and it would otherwise be repeated for every token.
    stopword_set = set(stopword_list)
    # NOTE(review): matching is case-sensitive, so if the stopword list is
    # lowercase and tokens are not lowercased upstream, capitalised stopwords
    # (e.g. "The") are counted -- confirm this is intended.
    for word, pos in nltk.pos_tag(tokens, tagset='universal'):
        if word in stopword_set or pos not in keywords_count:
            continue
        counts = keywords_count[pos]
        counts[word] = counts.get(word, 0) + 1
    return keywords_count

def total_tokens_len(tokens):
    """Return the number of tokens, counting repeated tokens each time."""
    token_count = len(tokens)
    return token_count
    
def total_unique_tokens_len(tokens):
    """Return the number of distinct tokens (exact, case-sensitive match)."""
    vocabulary = set(tokens)
    return len(vocabulary)

def bigrams(tokens, contain_verb=False):
    """Return (bigram, frequency) pairs for ``tokens``.

    Builds an NLTK ``BigramCollocationFinder`` over ``tokens`` with
    ``window_size=5`` and returns the items view of its ``ngram_fd``
    frequency distribution.

    ``contain_verb`` is accepted but not used by this function.
    """
    # NOTE(review): the window is wider than the n-gram; presumably NLTK then
    # also counts non-adjacent pairs inside each window -- confirm in the docs.
    collocation_finder = BigramCollocationFinder.from_words(tokens, window_size=5)
    frequency_dist = collocation_finder.ngram_fd
    return frequency_dist.items()

def trigrams(tokens, contain_verb=False):
    """Return (trigram, frequency) pairs for ``tokens``.

    Builds an NLTK ``TrigramCollocationFinder`` over ``tokens`` with
    ``window_size=5`` and returns the items view of its ``ngram_fd``
    frequency distribution.

    ``contain_verb`` is accepted but not used by this function.
    """
    # NOTE(review): the window is wider than the n-gram; presumably NLTK then
    # also counts non-adjacent triples inside each window -- confirm in the docs.
    collocation_finder = TrigramCollocationFinder.from_words(tokens, window_size=5)
    frequency_dist = collocation_finder.ngram_fd
    return frequency_dist.items()

def quadgrams(tokens, contain_verb=False):
    """Return (quadgram, frequency) pairs for ``tokens``.

    Builds an NLTK ``QuadgramCollocationFinder`` over ``tokens`` with
    ``window_size=5`` and returns the items view of its ``ngram_fd``
    frequency distribution.

    ``contain_verb`` is accepted but not used by this function.
    """
    # NOTE(review): the window is wider than the n-gram; presumably NLTK then
    # also counts non-adjacent 4-tuples inside each window -- confirm in the docs.
    collocation_finder = QuadgramCollocationFinder.from_words(tokens, window_size=5)
    frequency_dist = collocation_finder.ngram_fd
    return frequency_dist.items()

In [39]:
# Load the sample citation contexts; the next cell reads the 'text' and 'type' columns.
df = pd.read_csv('./citations_sample.csv')

In [40]:
# Token accumulators: one for the whole sample and one per citation function.
total_tokens = []
supporting_tokens = []
disputing_tokens = []
mentioning_tokens = []

# Route each citation type to its accumulator. The data labels the disputing
# class 'contradicting'. Rows with any other type only count toward the total.
tokens_by_type = {
    'supporting': supporting_tokens,
    'contradicting': disputing_tokens,
    'mentioning': mentioning_tokens,
}

# Walk the two needed columns directly instead of df.iterrows(), which builds
# a full Series object for every row.
for sentence, citation_type in zip(df['text'], df['type']):
    words = clean_tokenize_text(sentence)
    total_tokens.extend(words)
    bucket = tokens_by_type.get(citation_type)
    if bucket is not None:
        bucket.extend(words)

In [None]:
# Token lists to export, keyed by the prefix used in the output file names.
citation_functions = {
#     'all': total_tokens,
    'supporting': supporting_tokens,
    'disputing': disputing_tokens,
    'mentioning': mentioning_tokens
}

# (POS tag, CSV column name / file suffix) for the keyword frequency tables.
pos_exports = (
    ('VERB', 'verbs'),
    ('NOUN', 'nouns'),
)

# (progress message, CSV column name / file suffix, extractor) per n-gram size.
ngram_exports = (
    ('Saving Bigrams', 'bigrams', bigrams),
    ('Saving Trigrams', 'trigrams', trigrams),
    ('Saving Quadgrams', 'quadgrams', quadgrams),
)

print('Compiling citation data')
for function_name, function_tokens in citation_functions.items():
    n_tokens = total_tokens_len(function_tokens)
    n_unique = total_unique_tokens_len(function_tokens)

    print(f'For {function_name} citations')
    print(f'Total tokens are {n_tokens}')
    print(f'Total unique tokens are {n_unique}')
    print(f'Saving metadata for {function_name}')
    metadata = pd.DataFrame(
        [[n_tokens, n_unique]],
        columns=['total_tokens', 'total_unique_tokens'],
    )
    metadata.to_csv(f'{function_name}_metadata.csv')

    print('Saving Verb and Noun Freq')
    pos_counts = keywords(function_tokens)
    for pos, column in pos_exports:
        pos_frame = pd.DataFrame(pos_counts[pos].items(), columns=[column, 'freq'])
        pos_frame.to_csv(f'{function_name}_{column}.csv')

    # One CSV per n-gram size, written in the order bigrams, trigrams, quadgrams.
    for message, column, extract in ngram_exports:
        print(message)
        ngram_frame = pd.DataFrame(extract(function_tokens), columns=[column, 'freq'])
        ngram_frame.to_csv(f'{function_name}_{column}.csv')
    

Compiling citation data
For supporting citations
Total tokens are 257221
Total unique tokens are 16462
Saving metadata for supporting
Saving Verb and Noun Freq
Saving Bigrams
Saving Trigrams
Saving Quadgrams
For disputing citations
Total tokens are 34640
Total unique tokens are 4489
Saving metadata for disputing
Saving Verb and Noun Freq
Saving Bigrams
Saving Trigrams
Saving Quadgrams
For mentioning citations
Total tokens are 6326855
Total unique tokens are 108675
Saving metadata for mentioning
Saving Verb and Noun Freq
Saving Bigrams
Saving Trigrams
