## PCFG Parsing

In [2]:
import sys
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.parse import ChartParser
from nltk import PCFG, ViterbiParser
from collections import Counter
nltk.download(['punkt', 'averaged_perceptron_tagger'])

df = pd.read_csv("NLPCleanData.csv")
print(f"CSV shape: {df.shape}")
print(f"Number of unique authors: {df['Author'].nunique()}")
print(f"Total samples: {len(df)}")

CSV shape: (12, 3)
Number of unique authors: 12
Total samples: 12


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/emmavirnelli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emmavirnelli/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Label encode thhe gender column
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])  # Now Male=0, Female=1

# Seperate texts by gender
male_texts = df[df['Gender'] == 0]['Sample'].tolist()
female_texts = df[df['Gender'] == 1]['Sample'].tolist()

all_text = male_texts + female_texts

This step extracts the grammar rules directly from our dataset so that the rules reflect the actual patterns used by male and female authors in nature writing. Instead of using generic grammar rules, we're building custom rules based on how these specific authors structure their sentences.



In [5]:
# Create grammar from you data in order to find what patterns exists
# Look for what words appear (according to POS tagging and Penn Treebank)

def find_grammar_rules(texts, max_words=20):
    # Extract all words with POS tags
    all_tagged = []
    for text in texts:
        sentences = nltk.sent_tokenize(str(text))
        for sent in sentences:
            words = nltk.word_tokenize(sent)
            tagged = nltk.pos_tag(words)
            all_tagged.extend(tagged)
    
    # Group words by POS category
    categories = {
        'N': [],  # Nouns
        'V': [],  # Verbs
        'Adj': [],  # Adjectives
        'Adv': [],  # Adverbs
        'Det': [],  # Determiners
    }
    
    # POS to category mapping, reference Penn Treebank to corrrectly identify each category
    # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html 
    pos_mapping = {
        'NN': 'N', 'NNS': 'N', 'NNP': 'N', 'NNPS': 'N', # Noun (singular or mass), plural noun, proper nouns (both singular and  plural)
        'VB': 'V', 'VBD': 'V', 'VBG': 'V', 'VBN': 'V', 'VBP': 'V', 'VBZ': 'V', # Verb (base and past form), gerund/present participle, past participle, non-, 3rd person singualr person
        'JJ': 'Adj', 'JJR': 'Adj', 'JJS': 'Adj', # Adjectives,  comparative and superlative adjectives 
        'RB': 'Adv', 'RBR': 'Adv', 'RBS': 'Adv', # Adverb, comparative and superlative adverbs
        'DT': 'Det'} # Determiner 
    
    # Define stopwords
    stopwords = {'to', 'from', 'not', 'my', 'so', 'do', 'one', 
                 'the', 'of', 'and', 'in', 'a', 'on', 'or',
                 'as', 'that', 'there', 'about', 'up', 'no',
                 'was', 'is', 'are', 'were', 'had', 'have', 'has',
                 'be', 'been', 'this', 'all', 'an', 's', 't',
                 'more', 'other', 'only', 'very', 'most', 'than',
                 'also', 'any', 'can', 'could', 'would', 'should',
                 'may', 'might', 'must', 'will', 'shall'}
    
    # Collect words for each category with filtering 
    word_counts_by_category = {cat: Counter() for cat in categories}
    
    for word, pos in all_tagged:
        word_lower = word.lower()
        
        if (len(word_lower) <= 2 or 
            word_lower in stopwords or
            not word_lower.isalpha()):
            continue
            
        if pos in pos_mapping:
            category = pos_mapping[pos]
            word_counts_by_category[category][word_lower] += 1
            if word_lower not in categories[category]:
                categories[category].append(word_lower)
    
    print("Top Words:")
    for category in ['N', 'V', 'Adj', 'Adv', 'Det']:
        counter = word_counts_by_category[category]
        if counter:
            top_words = counter.most_common(max_words)
            print(f"\n{category} (Top {len(top_words)}):")
            for i, (word, count) in enumerate(top_words, 1):
                print(f"  {i:2d}. {word:<15} ({count})")
    
    print("Counts of Each Category:")
    for cat, counter in word_counts_by_category.items():
        total_words = sum(counter.values())
        unique_words = len(counter)
        print(f"  {cat}: {total_words} total, {unique_words} unique")
    
    # Get most frequent words 
    word_counts = Counter([word.lower() for word, _ in all_tagged 
                          if word.lower().isalpha() and len(word.lower()) > 2])
    
    # Build PCFG with probabilities
    grammar_rules = []

    # Update the already made categories with the referenced probabilities
    # https://gawron.sdsu.edu/compling/course_core/lectures/pcfg/prob_parse.htm
    
    # NP + VP (most common pattern) and probability must add up to 1
    grammar_rules.append("S -> NP VP [1.0]")
    grammar_rules.append("NP -> Det N [0.4] | N [0.4] | Det Adj N [0.2]")
    grammar_rules.append("VP -> V [0.3] | V NP [0.4] | V NP PP [0.3]")
    
    # Prepositional phrase always contains P + NP
    grammar_rules.append("PP -> P NP [1.0]")
    
    
    # Calculate total counts for each category to normalize probabilities
    # This sums up all word occurrences within each POS category
    category_totals = {}
    for cat in ['N', 'V', 'Adj', 'Adv', 'Det']:
        category_totals[cat] = sum(word_counts_by_category[cat].values())
    
    # Add words for each category 
    for category in ['N', 'V', 'Adj', 'Adv', 'Det']:
        if categories[category]:
            # Get top words by frequency
            cat_words = [w for w in categories[category] 
                       if w in word_counts and w not in stopwords]
            cat_words.sort(key=lambda w: word_counts[w], reverse=True)
            top_words = cat_words[:max_words]
            
            if top_words:
                # Calculate probabilities based on word frequencies in this category
                rule_parts = []
                total_in_category = category_totals.get(category, 1)  # Avoid division by zero for no erros
                
                for word in top_words:
                    word_count = word_counts_by_category[category].get(word, 0)
                    if total_in_category > 0:
                        # Raw probability = frequency in category
                        raw_prob = word_count / total_in_category
                        rule_parts.append((word, raw_prob))
                
                # Normalize so all probabilities for this rule sum to 1, all expansions of a non-terminal must sum to 1
                total_prob = sum(prob for _, prob in rule_parts)
                
                if total_prob > 0:
                    normalized_parts = []
                    for word, prob in rule_parts:
                        normalized_prob = prob / total_prob
                        # Round to 4 decimal places for readability
                        normalized_parts.append(f"'{word}' [{normalized_prob:.4f}]")
                    
                    rule_str = " | ".join(normalized_parts)
                    grammar_rules.append(f"{category} -> {rule_str}")
                else:
                    # If we don't have frequency data, just split the probability evenly
                    equal_prob = 1.0 / len(top_words)
                    probs = [f"'{w}' [{equal_prob:.4f}]" for w in top_words]
                    rule_str = " | ".join(probs)
                    grammar_rules.append(f"{category} -> {rule_str}")

    
    # Prepositions and conjuncstions based on how often they show up in English, based on my own bias and reference
    grammar_rules.append(
        "P -> 'in' [0.15] | 'on' [0.10] | 'at' [0.08] | 'with' [0.08] | "
        "'for' [0.10] | 'to' [0.12] | 'from' [0.07] | 'of' [0.20] | 'by' [0.05] | 'about' [0.05]"
    )
    
    # Conjunctions with approximate frequencies, 'and' dominates due to usage
    grammar_rules.append("Conj -> 'and' [0.7] | 'or' [0.2] | 'but' [0.1]")
    
    grammar_str = "\n".join(grammar_rules)
    
    print(f"({len(grammar_rules)} grammar rules total):")
    print(grammar_str)
    
    # Return as PCFG 
    from nltk import PCFG
    return PCFG.fromstring(grammar_str)

In [6]:
grammar = find_grammar_rules(all_text)

Top Words:

N (Top 20):
   1. time            (112)
   2. years           (98)
   3. world           (73)
   4. people          (72)
   5. black           (69)
   6. water           (68)
   7. life            (64)
   8. day             (63)
   9. buffaloes       (62)
  10. insects         (61)
  11. animals         (60)
  12. way             (58)
  13. colours         (57)
  14. country         (56)
  15. herd            (55)
  16. flies           (55)
  17. house           (51)
  18. river           (51)
  19. light           (49)
  20. year            (47)

V (Top 20):
   1. being           (68)
   2. found           (61)
   3. made            (57)
   4. get             (48)
   5. did             (45)
   6. make            (45)
   7. know            (43)
   8. came            (41)
   9. seen            (41)
  10. got             (38)
  11. see             (38)
  12. come            (36)
  13. used            (36)
  14. killed          (34)
  15. think           (34)
  16. take       

The find_grammar_rules(all_text) function identifies the top 20 most frequent words in each of five POS categories: nouns, verbs, adjectives, adverbs, and determiners (top 9 for determiners due to their limited variety). The high-frequency words form the grammar's vocabulary, which increases the likelihood of generating coherent, corpus-representative sentences.
The grammar consists of 11 rules total:

5 lexical rules (N, V, Adj, Adv, Det): automatically populated with the extracted high-frequency words
6 structural rules (S, NP, VP, PP, P, Conj): manually defined to specify sentence structure and phrase composition.

In [8]:
female_grammer = find_grammar_rules(female_texts)

Top Words:

N (Top 20):
   1. time            (67)
   2. buffaloes       (62)
   3. years           (59)
   4. insects         (56)
   5. water           (56)
   6. colours         (55)
   7. flies           (55)
   8. herd            (54)
   9. country         (45)
  10. light           (43)
  11. colour          (43)
  12. fly             (42)
  13. animals         (41)
  14. elephant        (41)
  15. buffalo         (38)
  16. head            (35)
  17. pond            (35)
  18. trees           (34)
  19. day             (30)
  20. elephants       (30)

V (Top 20):
   1. being           (53)
   2. found           (45)
   3. made            (31)
   4. seen            (30)
   5. known           (23)
   6. killed          (23)
   7. did             (20)
   8. used            (18)
   9. get             (17)
  10. saw             (17)
  11. make            (16)
  12. see             (16)
  13. came            (16)
  14. think           (16)
  15. produced        (15)
  16. went        

In [9]:
male_grammer = find_grammar_rules(male_texts)

Top Words:

N (Top 20):
   1. black           (67)
   2. world           (61)
   3. people          (59)
   4. life            (47)
   5. time            (45)
   6. house           (43)
   7. years           (39)
   8. way             (37)
   9. birubi          (37)
  10. food            (34)
  11. day             (33)
  12. climate         (32)
  13. fish            (29)
  14. woods           (28)
  15. crisis          (26)
  16. men             (25)
  17. land            (25)
  18. new             (24)
  19. war             (24)
  20. death           (24)

V (Top 20):
   1. know            (32)
   2. want            (32)
   3. get             (31)
   4. got             (29)
   5. make            (29)
   6. don             (27)
   7. made            (26)
   8. did             (25)
   9. came            (25)
  10. take            (24)
  11. need            (23)
  12. see             (22)
  13. come            (21)
  14. say             (19)
  15. live            (19)
  16. used        

In [10]:
def parse_text_to_tree(text, grammar):
   
    if isinstance(grammar, PCFG):
        # PCFG needs Viterbi to find the best parse tree, finds the most probable verse
        parser = ViterbiParser(grammar)
    
    sentences = nltk.sent_tokenize(str(text))
    all_trees = []
    
    for sent in sentences:
        words = nltk.word_tokenize(sent.lower())
        
        try:
            # See if our grammar can parse this
            trees = list(parser.parse(words))
            
            if trees:
                # If can, sue the parsed tree
                tree = trees[0]
                all_trees.append(tree)
            else:
                # If cannot, use create simple POS tree
                # Parser ran fine but found zero valid parses for this sentence
                tagged = nltk.pos_tag(words)
                pos_tree = nltk.Tree('S', [nltk.Tree(pos, [word]) for word, pos in tagged])
                all_trees.append(pos_tree)

        # In case parser actually crashes (bad tokens, grammar issues, and so on)
        except Exception as e:
            # Create simple POS tree to fall back on
            tagged = nltk.pos_tag(words)
            pos_tree = nltk.Tree('S', [nltk.Tree(pos, [word]) for word, pos in tagged])
            all_trees.append(pos_tree)
    
    return all_trees

The parse_text_to_tree(text, grammar) takes each sentence in the text and tries to turn it into a full parse tree using your custom grammar. If the grammar can parse it, the sentence is broken into hierarchical branches like Sentence to Noun Phrase to Verb Phrase to individual words. If the grammar canâ€™t parse the sentence, the function falls back to a simpler tree made from POS tags, where each word becomes a leaf labeled with its part of speech. In the end, every sentence becomes a tree showing its grammatical structure.

In [12]:
# Store results
male_results = {
    'trees': [], # actual Tree objects
    'productions': [], # all grammar rules used (flattened across trees)
    'depths': [], # max depth of each tree
    'num_leaves': [] # terminal nodes (words) per tree
}

female_results = {
    'trees': [],
    'productions': [],
    'depths': [],
    'num_leaves': []
}

print("Process samples")

# Need to loop through every sample in the dataset in order to run through all the rules through each of the texts
for idx, row in df.iterrows():   
    text = row['Sample']
    trees = parse_text_to_tree(text, grammar) # Might get multiple trees if text has multiple sentences
    
    for tree in trees:
        # Extract production rules
        productions = tree.productions()
        
         #Bin results by gender (0 = male, 1 = female in this dataset)
        if row['Gender'] == 0:  
            
            male_results['trees'].append(tree) #
            male_results['productions'].extend(productions)
            male_results['depths'].append(tree.height())
            male_results['num_leaves'].append(len(tree.leaves()))
        # Female
        else: 
            female_results['trees'].append(tree)
            female_results['productions'].extend(productions)
            female_results['depths'].append(tree.height())
            female_results['num_leaves'].append(len(tree.leaves()))

print(f"  Male: {len(male_results['trees'])} trees, {len(male_results['productions'])} productions")
print(f"  Female: {len(female_results['trees'])} trees, {len(female_results['productions'])} productions")

Process samples
  Male: 1622 trees, 35812 productions
  Female: 1016 trees, 35379 productions


After parsing the trees through each text and storing the results, we then calculate the differences between the structure, the rules, and percentages, and display the results and differences between the rules that show up in male vs female writing. 

Compare production: 
- rule frequencies
- filter out non important differences (< 0.1%)
- compare tree depth and sentence length

In [26]:
# Count production frequencies
male_counter = Counter([str(p) for p in male_results['productions']])
female_counter = Counter([str(p) for p in female_results['productions']])

# Find the differences
# Looking at every rule that is found it either corpus
all_productions = set(male_counter.keys()).union(set(female_counter.keys()))
differences = []
for prod in all_productions:

    # Calculate what probability of the time this rule appears in each corpus
    male_freq = male_counter.get(prod, 0) / len(male_results['productions']) if male_results['productions'] else 0
    female_freq = female_counter.get(prod, 0) / len(female_results['productions']) if female_results['productions'] else 0
    
    # If differences are smaller than .001, filter out
    diff = female_freq - male_freq
    if abs(diff) > 0.001: 
        differences.append({
            'rule': prod,
            'male': male_freq * 100,
            'female': female_freq * 100,
            'diff': diff * 100
        })

# Sort by absolute difference
differences.sort(key=lambda x: abs(x['diff']), reverse=True)

# Display results
print(f"\nFound {len(differences)} differences")
print("\nTop 10 rules with largest gender differences:")
print(f"{'Rule':<50} {'Male %':<8} {'Female %':<8} {'Diff %':<8}")
for diff in differences[:10]:
    print(f"{diff['rule'][:50]:<50} {diff['male']:<8.2f} {diff['female']:<8.2f} {diff['diff']:<8.2f}")

print("\nTree Structure Comparison:")
def calculate_stats(values):
    if not values:
        return 0, 0, 0
    return np.mean(values), np.std(values), max(values)

male_stats = {
    'depth': calculate_stats(male_results['depths']),
    'leaves': calculate_stats(male_results['num_leaves'])
}
female_stats = {
    'depth': calculate_stats(female_results['depths']),
    'leaves': calculate_stats(female_results['num_leaves'])
}

print(f"{'Metric':<25} {'Male':<15} {'Female':<15} {'Difference':<15}")
print(f"{'Avg Tree Depth':<25} {male_stats['depth'][0]:<15.2f} {female_stats['depth'][0]:<15.2f} {female_stats['depth'][0] - male_stats['depth'][0]:<15.2f}")
print(f"{'Avg Leaves/Sentence':<25} {male_stats['leaves'][0]:<15.2f} {female_stats['leaves'][0]:<15.2f} {female_stats['leaves'][0] - male_stats['leaves'][0]:<15.2f}")
print(f"{'Max Tree Depth':<25} {male_stats['depth'][2]:<15} {female_stats['depth'][2]:<15} {female_stats['depth'][2] - male_stats['depth'][2]:<15}")
print(f"{'Max Leaves/Sentence':<25} {male_stats['leaves'][2]:<15} {female_stats['leaves'][2]:<15} {female_stats['leaves'][2] - male_stats['leaves'][2]:<15}")


Found 64 differences

Top 10 rules with largest gender differences:
Rule                                               Male %   Female % Diff %  
, -> ','                                           4.51     6.84     2.33    
. -> '.'                                           4.35     2.82     -1.53   
DT -> 'the'                                        5.16     6.25     1.09    
IN -> 'of'                                         2.55     3.31     0.77    
PRP -> 'we'                                        0.78     0.18     -0.60   
WDT -> 'which'                                     0.11     0.70     0.59    
: -> ';'                                           0.10     0.62     0.52    
PRP -> 'he'                                        0.62     0.24     -0.38   
: -> '--'                                          0.00     0.35     0.35    
IN -> 'in'                                         1.65     2.00     0.35    

Tree Structure Comparison:
Metric                    Male            Fem