In [None]:
import regex
from collections import Counter

# Top-level domains accepted as the final label of a URL or e-mail token.
# Listed alphabetically; membership is what matters, not order.
COMMON_TLDS = {
    "ai", "app", "au",
    "biz", "blog", "br",
    "ca", "cc", "cloud", "club", "cn", "co", "com",
    "de", "dev",
    "edu",
    "fr",
    "gov",
    "in", "info", "int", "io",
    "jobs", "jp",
    "live",
    "me", "media", "mil",
    "name", "net", "news",
    "online", "org",
    "pro",
    "ru",
    "services", "site", "solutions", "store",
    "tech", "today", "tools", "tv",
    "uk", "us",
    "xyz",
    "za",
}

def get_url_pattern(tlds):
    """Return regex source matching ``[http(s)://][www.]<label>.<tld>``.

    Args:
        tlds: Iterable of top-level-domain strings (without the leading dot).
            Duplicates are ignored.

    Returns:
        A pattern string. The optional scheme and ``www.`` prefix remain
        capturing groups (1 and 2), as before.
    """
    # A set iterates in an order that varies with string-hash randomization,
    # and regex alternation takes the first alternative that matches. Sorting
    # longest-first (ties alphabetical) makes the pattern reproducible and
    # stops a short TLD such as "co" from shadowing "com".
    ordered = sorted(set(tlds), key=lambda tld: (-len(tld), tld))
    tld_group = '|'.join(ordered)
    # The trailing \b rejects a TLD that is merely the prefix of a longer
    # word ("example.community" must not yield "example.com").
    return rf'(https?:\/\/)?(www\.)?[\w\-]+\.(?:{tld_group})\b'

def get_email_pattern(tlds):
    """Return regex source matching ``<local>@<domain>.<tld>``.

    Args:
        tlds: Iterable of top-level-domain strings (without the leading dot).
            Duplicates are ignored.

    Returns:
        A pattern string anchored by word boundaries on both ends.
    """
    # Sort longest-first (ties alphabetical) so the generated pattern text is
    # identical on every run, whatever order the caller's collection iterates
    # in (a set's order changes with string-hash randomization).
    ordered = sorted(set(tlds), key=lambda tld: (-len(tld), tld))
    tld_group = '|'.join(ordered)
    return rf'\b[\w\.-]+@[\w\.-]+\.(?:{tld_group})\b'
# Regex source for each token class, built once when the cell runs.
URL_PATTERN = get_url_pattern(COMMON_TLDS)
EMAIL_PATTERN = get_email_pattern(COMMON_TLDS)
# 1-4 digits, then 1-2 digits, then 1-4 digits, separated by '-' or '/'.
DATE_PATTERN = r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b'
# Integer with an optional decimal fraction.
NUMBER_PATTERN = r'\b\d+(\.\d+)?\b'
# Run of code points in the range U+0C00-U+0C7F.
TELUGU_PATTERN = r'[\u0C00-\u0C7F]+'
ENGLISH_PATTERN = r'[a-zA-Z]+'
# A single punctuation character.
PUNCTUATION_PATTERN = r'[.,!?;:"()\[\]{}<>]'

# Alternatives are tried left to right at each position, so the more specific
# classes come first. EMAIL must precede URL: otherwise an address whose local
# part looks like a host name ("john.info@example.com") is consumed as the URL
# "john.info" and the rest of the address is split into separate tokens.
# DATE precedes NUMBER so that "12/05/2020" is not broken into three numbers.
# The outer parentheses make the whole match the first capture group, which
# callers using findall() rely on.
TOKEN_PATTERN = (
    f"({EMAIL_PATTERN}|{URL_PATTERN}|{DATE_PATTERN}|{NUMBER_PATTERN}"
    f"|{TELUGU_PATTERN}|{ENGLISH_PATTERN}|{PUNCTUATION_PATTERN})"
)
token_regex = regex.compile(TOKEN_PATTERN)

# Running corpus totals; the processing loop increments these per sentence.
sentence_count = word_count = char_count = 0
# Distinct token strings seen so far (used for the type/token ratio).
unique_tokens = set()

def split_sentences(text):
    """Split *text* into sentences at whitespace following '.', '!' or '?'.

    A three-dot ellipsis is masked before splitting so that it does not act
    as a sentence terminator, and is restored in the returned pieces.
    """
    placeholder = "<ELLIPSIS>"
    masked = text.replace("...", placeholder)

    sentences = []
    for piece in regex.split(r'(?<=[.!?])\s+', masked):
        sentences.append(piece.replace(placeholder, "..."))
    return sentences

def tokenize_sentence(sentence):
    """Return the tokens matched by ``token_regex`` in *sentence*, in order.

    Characters that belong to no token class are skipped.
    """
    # group(0) is the full match, which is what the outermost capture group
    # of the token pattern holds.
    return [match.group(0) for match in token_regex.finditer(sentence)]

def clean_tokens(tokens):
    """Return *tokens* without the entries that are pure punctuation.

    A token is dropped when the whole token matches ``PUNCTUATION_PATTERN``.
    """
    is_punctuation = regex.compile(PUNCTUATION_PATTERN).fullmatch
    kept = []
    for token in tokens:
        if not is_punctuation(token):
            kept.append(token)
    return kept

input_path = "indiccorp_telugu.txt"
output_path = "tokenized_output.txt"

# Stream the corpus line by line (one paragraph per line) so the whole file
# never has to be held in memory. Each input line yields exactly one output
# line.
with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:

    for line in infile:
        paragraph = line.rstrip('\n')

        # Keep blank lines so paragraph boundaries survive in the output.
        if not paragraph.strip():
            outfile.write('\n')
            continue

        sentences = split_sentences(paragraph)

        output_sentences = []

        for sentence in sentences:
            sentence = sentence.strip()
            # Skip empty fragments and fragments that are only an ellipsis.
            if not sentence or sentence == '...':
                continue

            tokens = tokenize_sentence(sentence)
            tokens = clean_tokens(tokens)

            # A sentence made only of punctuation/unmatched text has no words.
            if not tokens:
                continue

            sentence_count += 1
            word_count += len(tokens)
            char_count += sum(len(t) for t in tokens)
            unique_tokens.update(tokens)

            output_sentence = ' '.join(tokens)

            # Punctuation tokens were removed above, so re-add a terminator.
            if not output_sentence.endswith('.'):
                output_sentence += '.'

            output_sentences.append(output_sentence)

        output_line = ' '.join(output_sentences)
        outfile.write(output_line + '\n')

# Guard the ratios: an empty corpus (or one with no recognised tokens) leaves
# the counters at zero, and the unguarded divisions would raise
# ZeroDivisionError before any statistics were printed.
avg_sent_len = word_count / sentence_count if sentence_count else 0.0
avg_word_len = char_count / word_count if word_count else 0.0
ttr = len(unique_tokens) / word_count if word_count else 0.0

print("===== Corpus Statistics =====")
print(f"Total Sentences: {sentence_count}")
print(f"Total Words: {word_count}")
print(f"Total Characters: {char_count}")
print(f"Average Sentence Length (words): {avg_sent_len:.2f}")
print(f"Average Word Length (chars): {avg_word_len:.2f}")
print(f"Type/Token Ratio (TTR): {ttr:.4f}")


===== Corpus Statistics =====
Total Sentences: 76758409
Total Words: 767674423
Total Characters: 4943970359
Average Sentence Length (words): 10.00
Average Word Length (chars): 6.44
Type/Token Ratio (TTR): 0.0103
