# In [2]: (Jupyter notebook cell exported as a script)
# First install python-docx if you haven't already:
# pip install python-docx

from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml import OxmlElement
from docx.oxml.ns import qn

def _add_list(doc, items, style):
    """Append one paragraph per entry of *items* to *doc*, each using *style*."""
    for item in items:
        doc.add_paragraph(item, style=style)


def _add_labeled_paragraph(doc, label, text='', style=None):
    """Append a paragraph that opens with a bold *label* run.

    Args:
        doc: The python-docx document being built.
        label: Text rendered in bold at the start of the paragraph.
        text: Optional regular-weight text that follows the label. When
            empty, no second run is added.
        style: Optional paragraph style name (e.g. ``'List Bullet'``).

    Returns:
        The newly created paragraph.
    """
    paragraph = doc.add_paragraph(style=style)
    paragraph.add_run(label).bold = True
    if text:
        paragraph.add_run(text)
    return paragraph


def _add_results_table(doc, header, rows, bold_row=None):
    """Append a table with one *header* row followed by *rows*.

    Args:
        doc: The python-docx document being built.
        header: Sequence of column titles; its length sets the column count.
        rows: Sequence of rows, each a sequence of cell strings.
        bold_row: Optional zero-based index into *rows* whose cells are
            rendered in bold.

    Returns:
        The newly created table.
    """
    # Size the table from the data so adding a row never raises IndexError.
    table = doc.add_table(rows=len(rows) + 1, cols=len(header))
    table.style = 'Light Grid Accent 1'

    for cell, heading in zip(table.rows[0].cells, header):
        cell.text = heading

    for row_index, row_values in enumerate(rows):
        cells = table.rows[row_index + 1].cells
        for cell, value in zip(cells, row_values):
            cell.text = value
            if row_index == bold_row:
                # Bold must be applied after the text is set, because the
                # runs being formatted only exist once the cell has text.
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        run.bold = True
    return table


def create_ngram_report(output_path='ngram_model_report.docx'):
    """Build the N-gram language model assignment report and save it as .docx.

    The report content (headings, paragraphs, bullet lists, the perplexity
    table, and references) is fixed; only the destination is configurable.

    Args:
        output_path: Where the document is written. Defaults to
            ``'ngram_model_report.docx'`` (relative to the current working
            directory), which matches the previous hard-coded behavior.
    """
    # Create a new Document
    doc = Document()

    # Set 1-inch margins on every section
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Title
    report_title = doc.add_heading('N-Gram Language Model Assignment Report', 0)
    report_title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Executive Summary
    doc.add_heading('Executive Summary', level=1)
    doc.add_paragraph(
        'This report analyzes the implementation and performance of N-gram language models trained on a corpus of 6.2 million tokens from multiple sources. Our experiments demonstrate that trigram models (n=3) achieve optimal performance with a perplexity of 36.20, representing a 57.4% improvement over bigram models. The implementation successfully addresses key challenges including out-of-vocabulary words through smoothing techniques and computational efficiency through optimized data structures.'
    )

    # 1. Introduction
    doc.add_heading('1. Introduction', level=1)
    doc.add_paragraph(
        'N-gram models serve as fundamental building blocks in natural language processing, providing probabilistic predictions of word sequences based on local context. This project implements and evaluates N-gram models of varying orders (2-5) on a diverse corpus combining Reuters, Brown, Gutenberg, and web text sources.'
    )

    # 2. Methodology
    doc.add_heading('2. Methodology', level=1)

    doc.add_heading('2.1 Data Collection and Preprocessing', level=2)
    doc.add_paragraph(
        'The experimental setup utilized a comprehensive corpus drawn from four distinct sources within the NLTK package, ensuring diverse linguistic patterns and domains. The data underwent systematic preprocessing to enhance model quality:'
    )

    _add_labeled_paragraph(doc, 'Corpus Statistics:')
    _add_list(doc, [
        'Total tokens: 6,196,290',
        'Training set: 4,957,032 tokens (80%)',
        'Testing set: 1,239,258 tokens (20%)',
        'Vocabulary size: 106,731 unique tokens',
    ], style='List Bullet')

    doc.add_paragraph(
        'The preprocessing pipeline removed punctuation marks, numerical values, and other non-linguistic noise to create a cleaner token stream. This standardization process proved crucial for reducing sparsity and improving model generalization.'
    )

    doc.add_heading('2.2 Model Architecture', level=2)
    doc.add_paragraph(
        'The N-gram model implementation centers on an efficient dictionary-based architecture for storing context-word frequency pairs. This design choice offers several advantages:'
    )

    _add_list(doc, [
        'Constant-time lookups: O(1) average case complexity for frequency retrieval',
        'Memory efficiency: Direct mapping from (context, word) tuples to frequencies',
        'Scalability: Handles the 106,731-word vocabulary without performance degradation',
    ], style='List Bullet')

    doc.add_paragraph('The probability calculation follows the maximum likelihood estimation approach:')

    # Render the formula centered in a monospace font
    formula = doc.add_paragraph()
    formula.alignment = WD_ALIGN_PARAGRAPH.CENTER
    formula_run = formula.add_run('P(w_i | w_{i-n+1}, ..., w_{i-1}) = count(w_{i-n+1}, ..., w_i) / count(w_{i-n+1}, ..., w_{i-1})')
    formula_run.font.name = 'Courier New'
    formula_run.font.size = Pt(10)

    doc.add_heading('2.3 Smoothing Techniques', level=2)
    doc.add_paragraph(
        'To address the zero-probability problem for unseen N-grams, three smoothing methods were implemented:'
    )

    # NOTE(review): paragraphs using 'List Number' appear to share a single
    # numbering sequence in python-docx's default template, so a later
    # numbered list may continue counting from this one -- confirm in Word.
    _add_list(doc, [
        'Laplace Smoothing: Adds a constant α to all counts, ensuring non-zero probabilities',
        'Stupid Backoff: Recursively backs off to lower-order N-grams with a penalty factor',
        'Kneser-Ney Smoothing: Employs sophisticated discounting based on continuation probabilities',
    ], style='List Number')

    doc.add_paragraph(
        'The adjustable hyperparameter α allows fine-tuning the smoothing strength based on corpus characteristics.'
    )

    # 3. Results and Analysis
    doc.add_heading('3. Results and Analysis', level=1)

    doc.add_heading('3.1 Perplexity Evaluation', level=2)
    doc.add_paragraph(
        'Perplexity measurements across different N-gram orders reveal compelling insights:'
    )

    perplexity_rows = [
        ['2 (Bigram)', '85.09', 'Baseline'],
        ['3 (Trigram)', '36.20', '-57.4%'],
        ['4 (4-gram)', '36.54', '+0.9%'],
        ['5 (5-gram)', '40.55', '+11.0%'],
    ]
    # Row index 1 is the trigram row, highlighted as the best result.
    _add_results_table(
        doc,
        ['N-gram Order', 'Perplexity', 'Relative Change'],
        perplexity_rows,
        bold_row=1,
    )

    doc.add_paragraph(
        'The trigram model emerges as the optimal configuration, achieving the lowest perplexity of 36.20. This represents a dramatic 57.4% improvement over bigram models, indicating that two-word contexts capture significant linguistic dependencies.'
    )

    doc.add_heading('3.2 Performance Analysis', level=2)
    doc.add_paragraph('The results demonstrate a clear pattern:')

    # Bullet points with bold keywords
    performance_points = [
        ('Sharp improvement', ' from bigram to trigram models suggests that two-word contexts provide substantial predictive power'),
        ('Diminishing returns', ' beyond trigrams indicate that longer contexts introduce sparsity without proportional gains'),
        ('Performance degradation', ' at n=5 likely stems from overfitting to training data patterns'),
    ]
    for keyword, explanation in performance_points:
        _add_labeled_paragraph(doc, keyword, explanation, style='List Bullet')

    doc.add_heading('3.3 Text Generation Quality', level=2)
    doc.add_paragraph(
        'The sentence generation module, which iteratively predicts words based on context, produced coherent outputs for trigram models. The generation process maintains linguistic fluency by:'
    )

    _add_list(doc, [
        'Starting with high-probability seed contexts',
        'Applying smoothing to handle novel word combinations',
        'Terminating at specified lengths or natural boundaries',
    ], style='List Bullet')

    # 4. Challenges and Solutions
    doc.add_heading('4. Challenges and Solutions', level=1)

    doc.add_heading('4.1 Computational Complexity', level=2)
    _add_labeled_paragraph(
        doc,
        'Challenge:',
        ' Training time increases exponentially as n decreases, particularly problematic for unigram and bigram models processing 6M+ tokens.',
    )
    _add_labeled_paragraph(doc, 'Solution:', ' Implemented optimization strategies including:')

    _add_list(doc, [
        'Batch processing of N-gram extraction',
        'Efficient string manipulation using optimized Python libraries',
        'Pre-computation of frequently accessed statistics',
    ], style='List Bullet')

    doc.add_heading('4.2 Memory Management', level=2)
    _add_labeled_paragraph(
        doc,
        'Challenge:',
        ' Storing frequency counts for all possible N-grams with a 106,731-word vocabulary threatens memory exhaustion.',
    )
    # The solution here is given entirely by the bullets that follow.
    _add_labeled_paragraph(doc, 'Solution:')

    _add_list(doc, [
        'Sparse representation storing only observed N-grams',
        'Dynamic pruning of low-frequency entries',
        'Efficient dictionary implementation with hash-based lookups',
    ], style='List Bullet')

    doc.add_heading('4.3 Data Sparsity', level=2)
    _add_labeled_paragraph(
        doc,
        'Challenge:',
        ' Higher-order models suffer from severe sparsity, with many valid word sequences appearing zero times in training data.',
    )
    _add_labeled_paragraph(
        doc,
        'Solution:',
        ' The three-pronged smoothing approach ensures robust probability estimates even for unseen sequences, with Kneser-Ney smoothing providing particularly effective handling of rare events.',
    )

    # 5. Conclusions and Future Work
    doc.add_heading('5. Conclusions and Future Work', level=1)
    doc.add_paragraph(
        'This implementation successfully demonstrates the practical application of N-gram language models on a substantial corpus. Key findings include:'
    )

    _add_list(doc, [
        'Optimal Model Order: Trigram models provide the best balance between contextual information and data sparsity',
        'Smoothing Importance: Proper smoothing techniques are essential for handling the long tail of language',
        'Efficiency Considerations: Careful implementation choices enable processing of multi-million token corpora',
        'Dataset Scale: Larger datasets can significantly improve prediction accuracy by providing more comprehensive coverage of language patterns and reducing the impact of data sparsity',
    ], style='List Number')

    doc.add_heading('Future Directions', level=2)
    doc.add_paragraph('Several avenues for improvement merit exploration:')

    future_items = [
        ('Larger Dataset Integration:', ' Expanding the training corpus beyond 6M tokens to capture more diverse linguistic patterns and improve model generalization'),
        ('Adaptive N-gram Selection:', ' Dynamically choosing N based on available context'),
        ('Class-based Models:', ' Reducing sparsity through word clustering'),
        ('Neural Augmentation:', ' Combining N-gram statistics with neural embeddings'),
        ('Domain Adaptation:', ' Investigating performance across specific text genres'),
        ('Parallel Processing:', ' Implementing distributed training for handling massive datasets efficiently'),
    ]

    for label, description in future_items:
        _add_labeled_paragraph(doc, label, description, style='List Bullet')

    doc.add_paragraph(
        'The success of this implementation validates N-gram models as robust baselines for language modeling tasks, while highlighting areas where modern neural approaches might provide advantages. The achieved perplexity of 36.20 for trigram models represents strong performance given the model\'s simplicity and interpretability.'
    )

    # References
    doc.add_heading('References', level=1)

    references = [
        'Chen, S. F., & Goodman, J. (1996). An empirical study of smoothing techniques for language modeling. Proceedings of the 34th Annual Meeting of the ACL.',
        'Kneser, R., & Ney, H. (1995). Improved backing-off for M-gram language modeling. Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing.',
        'Manning, C. D., & Schütze, H. (1999). Foundations of Statistical Natural Language Processing. MIT Press.',
    ]

    # The reference number is written into the text itself, so these are
    # plain paragraphs: combining the explicit "1." prefix with the
    # auto-numbering 'List Number' style would number every entry twice.
    for number, reference in enumerate(references, 1):
        doc.add_paragraph(f'{number}. {reference}')

    # Save the document
    doc.save(output_path)
    print(f"Report saved as '{output_path}'")

# Script entry point: generate the report only when this file is executed
# directly (or run as a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    create_ngram_report()

# Output: Report saved as 'ngram_model_report.docx'
