In [1]:
import re


# Location of the input text file handed to process_text() further down.
# The path is relative, so it is resolved against the kernel's working directory.
eng_path='./en_eslspok-ud-dev.txt'

In [2]:
def read_file(file_path):
    """Return the entire contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [3]:
def tokenize_rule_based(text, mwes=None):
    """Split text into lower-cased tokens using regex rules.

    URLs, e-mail addresses and hashtags are kept as single tokens; everything
    else is split into runs of word characters and individual punctuation
    marks. Adjacent token pairs listed in ``mwes`` are then merged into one
    space-joined token.

    Args:
        text: The raw text to tokenize. It is lower-cased before matching.
        mwes: Optional iterable of two-word expressions, each a pair of
            tokens such as ``('new', 'york')``. Pairs are compared in lower
            case because the text is lower-cased. When omitted, the built-in
            default list is used; when given, it replaces that list.

    Returns:
        A list of token strings in order of appearance.
    """
    # Regex patterns for URLs, emails, hashtags
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    email_pattern = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'
    hashtag_pattern = r'#\w+'

    # Multi-word expressions (MWEs) to capture as single tokens
    if mwes is None:
        mwes = [('new', 'york'), ('artificial', 'intelligence')]
    # A set gives O(1) pair look-ups; lower-casing keeps caller-supplied
    # pairs comparable with the lower-cased tokens.
    mwe_set = {tuple(word.lower() for word in pair) for pair in mwes}

    # Raw f-string: without the r prefix, \w and \s are invalid string escape
    # sequences (a SyntaxWarning on recent Python versions).
    # The specific patterns come first so they win over the generic fallback.
    combined_pattern = rf'({url_pattern})|({email_pattern})|({hashtag_pattern})|(\w+|[^\w\s])'

    # Step 1: findall yields one tuple per match with a slot per capture
    # group; only the slot of the alternative that matched is non-empty.
    matches = re.findall(combined_pattern, text.lower())
    tokens = [token for groups in matches for token in groups if token]

    # Step 2: merge adjacent pairs that form a known MWE.
    final_tokens = []
    i = 0
    while i < len(tokens):
        if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) in mwe_set:
            final_tokens.append(f'{tokens[i]} {tokens[i + 1]}')
            i += 2  # Skip the second word; it is already part of the MWE
        else:
            final_tokens.append(tokens[i])
            i += 1

    return final_tokens

In [4]:
def remove_stopwords(tokens, stopwords):
    """Return the tokens that are not stopwords, preserving their order.

    Args:
        tokens: Iterable of token strings.
        stopwords: Container of tokens to drop (membership is tested with ``in``).

    Returns:
        A new list holding every token that is not in ``stopwords``.
    """
    kept = []
    for candidate in tokens:
        if candidate in stopwords:
            continue
        kept.append(candidate)
    return kept

In [5]:
def preprocess_abbreviations(text):
    """Replace known abbreviations with dot-free placeholders.

    The placeholders contain no '.', so a later sentence splitter that breaks
    on '.', '!' or '?' will not split in the middle of an abbreviation.

    Matching is case-insensitive, so 'Mr.', 'mr.' and 'MR.' are all protected.
    An abbreviation is only recognised when it is not directly preceded by a
    word character, so the tail of an ordinary word such as 'items.' is left
    untouched rather than being mistaken for 'ms.'.

    Args:
        text: The text to protect.

    Returns:
        The text with every recognised abbreviation replaced by its placeholder.
    """
    # Replace abbreviations with temporary markers to prevent splitting
    abbreviations = {
        'mr.': 'MR_TEMP',
        'mrs.': 'MRS_TEMP',
        'dr.': 'DR_TEMP',
        'ms.': 'MS_TEMP',
        'etc.': 'ETC_TEMP',
        'i.e.': 'IE_TEMP',
        'e.g.': 'EG_TEMP',
        'u.s.': 'US_TEMP',
        'p.m.': 'PM_TEMP',
        'a.m.': 'AM_TEMP'
    }
    for abbr, placeholder in abbreviations.items():
        # (?<!\w) rejects matches that start inside a longer word;
        # re.escape makes the dots in the abbreviation literal.
        pattern = r'(?<!\w)' + re.escape(abbr)
        text = re.sub(pattern, placeholder, text, flags=re.IGNORECASE)
    return text

def postprocess_abbreviations(text):
    """Turn abbreviation placeholders back into readable abbreviations.

    Each placeholder is replaced by one fixed spelling (for example
    'MR_TEMP' becomes 'Mr.' and 'PM_TEMP' becomes 'p.m.').

    Args:
        text: Text that may contain '*_TEMP' placeholders.

    Returns:
        The text with every known placeholder substituted.
    """
    # (placeholder, abbreviation) pairs, applied in this order.
    restorations = (
        ('MR_TEMP', 'Mr.'),
        ('MRS_TEMP', 'Mrs.'),
        ('DR_TEMP', 'Dr.'),
        ('MS_TEMP', 'Ms.'),
        ('ETC_TEMP', 'etc.'),
        ('IE_TEMP', 'i.e.'),
        ('EG_TEMP', 'e.g.'),
        ('US_TEMP', 'U.S.'),
        ('PM_TEMP', 'p.m.'),
        ('AM_TEMP', 'a.m.'),
    )
    restored = text
    for marker, abbreviation in restorations:
        restored = restored.replace(marker, abbreviation)
    return restored

In [6]:
def split_sentences(text):
    """Split text into sentences at whitespace that follows '.', '!' or '?'.

    Abbreviations are swapped for placeholders before splitting and restored
    afterwards. The lookahead in the boundary pattern only accepts a split
    point when an even number of double quotes remains after it.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings with surrounding whitespace stripped.
    """
    boundary = re.compile(r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)')

    # Hide abbreviation dots from the splitter.
    protected = preprocess_abbreviations(text)

    result = []
    for chunk in boundary.split(protected):
        if not chunk:
            # Skip empty pieces produced by the split.
            continue
        # Bring the abbreviations back, then trim the edges.
        result.append(postprocess_abbreviations(chunk).strip())
    return result

In [7]:
def process_text(file_path):
    """Read a text file and return its filtered tokens and its sentences.

    Args:
        file_path: Path of the text file to process.

    Returns:
        A ``(tokens, sentences)`` tuple: the rule-based tokens with the
        example stopwords removed, and the sentences split from the raw text.
    """
    raw_text = read_file(file_path)

    stopwords = {"the", "is", "in", "it", "of", "and"}  # Example stopwords
    filtered_tokens = remove_stopwords(tokenize_rule_based(raw_text), stopwords)

    return filtered_tokens, split_sentences(raw_text)

In [8]:
# Run the pipeline on the input file; process_text reads eng_path from disk,
# so the file must exist relative to the working directory.
tokens, sentences = process_text(eng_path)
# Display the stopword-filtered tokens together with their count.
tokens, len(tokens)

FileNotFoundError: [Errno 2] No such file or directory: './en_eslspok-ud-dev.txt'

In [None]:
# Display the sentences returned by process_text together with their count.
sentences, len(sentences)

(["And so this accident became the man on the motorcycle 's fault .",
  'Yes , please .',
  'And then think about what we really wanna eat .',
  "the roads are n't this wide ,\nfirst of all .",
  "So yeah , I do n't know about others .",
  'he runs every morning\naround here I like Tom Cruise .',
  'And she was looking for a nice gift for him .',
  'One girl is sitting on the chair before the desk .',
  'But very interesting .',
  'in\nhigher level of English or skills in the area .',
  'And ballet is the decide to\nmovement .',
  'because life .',
  'My father , my mother , and two sisters .',
  'yes , yes .',
  "so I 'm very worried about maybe he think to play good person or try to be good\nfor adult .",
  'I hope so .',
  "And I just forgot about , you know , giving you the\nphone that I could n't come .",
  'last weekend my best friend in Tokyo came to Osaka\nwith his girlfriend to go to Universal Studios in Japan .',
  'So I just sleep well .',
  'But in here , like people are ju