# Extracting collocations in Python


In this notebook, I will demonstrate how to extract collocations from a corpus.

## Preparation

### Let's load necessary package.

In [None]:
# Load packages

import spacy
import glob
import pandas as pd

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

### Let's specify the corpus path

In [21]:
CORPUS_FILES = glob.glob("../../../corpus_data/brown_single/*.txt")


# Define extraction codes

In [None]:

# Define dependency relations for collocation extraction
# Common patterns: adj-noun, verb-object, verb-adverb, noun-prep-noun
COLLOCATION_PATTERNS = [
    ("amod", "ADJ", "NOUN"),     # adjective modifier (e.g., "big house")
    ("dobj", "VERB", "NOUN"),    # direct object (e.g., "eat food")
    ("advmod", "VERB", "ADV"),   # adverb modifier (e.g., "run quickly")
    ("nmod", "NOUN", "NOUN"),    # noun modifier (e.g., "cup of tea")
    ("compound", "NOUN", "NOUN"), # compound nouns (e.g., "computer science")
]


In [23]:

def load_file(filepath):
    """Load text from file"""
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()

def extract_collocations(doc, patterns=COLLOCATION_PATTERNS):
    """Extract collocations based on dependency patterns"""
    collocations = []
    
    for token in doc:
        for dep_rel, pos1, pos2 in patterns:
            # Check if token matches the dependency pattern
            if token.dep_ == dep_rel:
                if dep_rel in ["amod", "advmod", "compound"]:
                    # modifier comes before head
                    if token.pos_ == pos1 and token.head.pos_ == pos2:
                        collocations.append((token.lemma_.lower(), token.head.lemma_.lower(), dep_rel))
                else:  # dobj, nmod
                    # head comes before dependent
                    if token.head.pos_ == pos1 and token.pos_ == pos2:
                        collocations.append((token.head.lemma_.lower(), token.lemma_.lower(), dep_rel))
    
    return collocations

def update_results(results, doc, collocations):
    """Update results dictionary with frequency information"""
    # Update corpus size
    token_size = len(doc)
    results["corpus_size"] += token_size
    
    # Update unigram frequencies
    for token in doc:
        token_lower = token.lemma_.lower()
        results["unigram"][token_lower] = results["unigram"].get(token_lower, 0) + 1
    
    # Update bigram frequencies
    for word1, word2, dep_rel in collocations:
        bigram_key = f"{word1}_{word2}_{dep_rel}"
        if bigram_key not in results["bigram"]:
            results["bigram"][bigram_key] = {
                "word1": word1,
                "word2": word2,
                "dep_rel": dep_rel,
                "freq": 0
            }
        results["bigram"][bigram_key]["freq"] += 1
    
    return results



# Now it's time to run the code to extract collocations

In [37]:
# Main processing loop
results = {"corpus_size": 0,
           "unigram": {},
           "bigram": {}}

for file in CORPUS_FILES:  # Process first 5 files for testing
    # 1. Load the corpus file
    text = load_file(file)
    
    # 2. Parse and identify collocations
    doc = nlp(text)
    collocations = extract_collocations(doc)
    
    # 3. Update results
    results = update_results(results, doc, collocations)
    
    print(f"Processed: {file}")

print(f"\nCorpus size: {results['corpus_size']} tokens")
print(f"Unique words: {len(results['unigram'])}")
print(f"Unique collocations: {len(results['bigram'])}")

Processed: ../../../corpus_data/brown_single/cf_cf08.txt
Processed: ../../../corpus_data/brown_single/ck_ck17.txt
Processed: ../../../corpus_data/brown_single/cf_cf20.txt
Processed: ../../../corpus_data/brown_single/cf_cf34.txt
Processed: ../../../corpus_data/brown_single/ck_ck03.txt
Processed: ../../../corpus_data/brown_single/ca_ca37.txt
Processed: ../../../corpus_data/brown_single/cl_cl14.txt
Processed: ../../../corpus_data/brown_single/ca_ca23.txt
Processed: ../../../corpus_data/brown_single/ch_ch26.txt
Processed: ../../../corpus_data/brown_single/ce_ce11.txt
Processed: ../../../corpus_data/brown_single/ce_ce05.txt
Processed: ../../../corpus_data/brown_single/cb_cb06.txt
Processed: ../../../corpus_data/brown_single/cb_cb12.txt
Processed: ../../../corpus_data/brown_single/cp_cp25.txt
Processed: ../../../corpus_data/brown_single/cg_cg68.txt
Processed: ../../../corpus_data/brown_single/cg_cg40.txt
Processed: ../../../corpus_data/brown_single/cj_cj77.txt
Processed: ../../../corpus_data

# Transform the data into dataset format

In the code above, the collocation has been extracted, but the data format is not human readable. In the following I will transform the data to other format.

In [38]:
def results_to_dataframe(results, min_freq=1):
    """
    Convert results dictionary to pandas DataFrame with additional options
    
    Parameters:
    - results: dictionary with corpus_size, unigram, and bigram data
    - min_freq: minimum collocation frequency to include (default: 1)
    - include_dep_rel: whether to include dependency relation in output (default: True)
    """
    rows = []
    
    for bigram_key, bigram_info in results["bigram"].items():
        # Skip if below minimum frequency
        if bigram_info['freq'] < min_freq:
            continue
            
        row = {
            "collocation": f"{bigram_info['word1']}_{bigram_info['word2']}",
            "word1": bigram_info['word1'],
            "word2": bigram_info['word2'],
            "dep_relation": bigram_info['dep_rel'],
            "corpus_size": results['corpus_size'],
            "collocation_frequency": bigram_info['freq'],
            "word1_freq": results["unigram"].get(bigram_info['word1'], 0),
            "word2_freq": results["unigram"].get(bigram_info['word2'], 0)
        }
        
        
        rows.append(row)
    
    # Create DataFrame and sort by collocation frequency
    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.sort_values('collocation_frequency', ascending=False)
        df = df.reset_index(drop=True)
    
    return df

In [39]:



# Example with filtering
df_collocations = results_to_dataframe(results, min_freq=5)

# Display first 20 rows
print("Top 20 collocations:")
print(df_collocations.head(20))

# Show DataFrame info
print(f"\nTotal collocations found: {len(df_collocations)}")
print(f"\nDataFrame columns: {list(df_collocations.columns)}")

Top 20 collocations:
           collocation       word1  ... word1_freq word2_freq
0            last_year        last  ...        705       1789
1            same_time        same  ...        686       1970
2            young_man       young  ...        442       2149
3           take_place        take  ...       1585        836
4          fiscal_year      fiscal  ...        120       1789
5          high_school        high  ...        801        711
6              old_man         old  ...        857       2149
7           first_time       first  ...       1388       1970
8           last_night        last  ...        705        468
9           other_hand       other  ...       2032        801
10            per_cent         per  ...        380        194
11           last_week        last  ...        705        450
12        middle_class      middle  ...        178        352
13  nineteenth_century  nineteenth  ...         58        307
14           have_time        have  ...      1190

# Saving the data into tsv file

In [40]:
# Index = row number usually starts with zero. Adding one may help for human reader
df_collocations.index += 1

In [41]:
df_collocations.to_csv("../../../corpus_data/brown_collocations.tsv", sep = "\t",
                       index=True,
                       index_label="rank")

# Saving randomly sampled data

Now because the data is still big, let's filter them down to manageable size. 

First, we should filter it with frequency. Let's retain collocation that occur more than 10 times.

Second, let's randomly sample 50 from this pool.

In [44]:
df_collocations.query('collocation_frequency > 10')

Unnamed: 0,collocation,word1,word2,dep_relation,corpus_size,collocation_frequency,word1_freq,word2_freq
1,last_year,last,year,amod,1242331,123,705,1789
2,same_time,same,time,amod,1242331,94,686,1970
3,young_man,young,man,amod,1242331,89,442,2149
4,take_place,take,place,dobj,1242331,84,1585,836
5,fiscal_year,fiscal,year,amod,1242331,76,120,1789
...,...,...,...,...,...,...,...,...
375,mystery_story,mystery,story,compound,1242331,11,47,234
376,get_job,get,job,dobj,1242331,11,1460,308
377,high_cost,high,cost,amod,1242331,11,801,431
378,onset_age,onset,age,compound,1242331,11,43,284


In [46]:
df_collocations.query('collocation_frequency > 10').sample(n = 50, random_state=42).to_csv("../../../corpus_data/brown_collocations_random_50.tsv", sep = "\t",
                       index=True,
                       index_label="rank")