In [None]:
import stanza
import pandas as pd

# Build the English Stanza pipeline with the four processors requested below
# (tokenize, pos, lemma, depparse); the rest of this cell reads their output.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,lemma,depparse',
)


# Input sentences to be reconstructed - highly likely to be direct Chinese-to-English translations
sentence1 = "Today is our dragon boat festival, in our Chinese culture, to celebrate it with all safe and great in our lives."
sentence2 = "Anyway, I believe the team, although bit delay and less communication at recent days, they really tried best for paper and cooperation."


# To help us with the grammar substitutions, POS tagging from stanza was used to identify the parts of speech of each word in the sentences
def stanza_pos_parser(sentence):
    """Run the module-level Stanza pipeline on `sentence` and collect its words.

    Returns a `(tokens, pos_tags)` pair: `tokens` is the list of word texts and
    `pos_tags` is the list of `(text, upos)` tuples, both in document order.
    A fixed banner line is printed on every call.
    """
    parsed = nlp(sentence)

    tokens = []
    pos_tags = []
    # One pass over every word of every sentence fills both lists in step
    for parsed_sentence in parsed.sentences:
        for word in parsed_sentence.words:
            tokens.append(word.text)
            pos_tags.append((word.text, word.upos))

    print("Stanza pipeline tokenization And Part Of Speech Tagging")
    return tokens, pos_tags

# Run the parser over both sentences, keeping each sentence's tokens and POS tags
(tokens1, pos_tags1), (tokens2, pos_tags2) = (
    stanza_pos_parser(text) for text in (sentence1, sentence2)
)

# Using pandas to create a DataFrame for better visualization of the POS tags
def print_pos_tags(pos_tags):
    """Print `pos_tags` as a two-column table with headers 'Token' and 'POS'.

    `pos_tags` is a list of `(token, pos)` pairs; nothing is returned.
    """
    token_column = []
    pos_column = []
    for token, pos in pos_tags:
        token_column.append(token)
        pos_column.append(pos)

    table = pd.DataFrame({'Token': token_column, 'POS': pos_column})
    print(table)

# print_pos_tags(pos_tags1)
# print_pos_tags(pos_tags2)


# Mostly phrase substitutions in Chinese to English direct translation, otherwise known as "Chinglish"
# Maps a tuple of consecutive tokens to the list of tokens that replaces it.
# Entries are added in a fixed order so the mapping iterates the same way every run.
phrase_substitutions = {}

# Sentence 2: missing article / awkward concession
phrase_substitutions["although", "bit", "delay"] = ["regardless", "of", "the", "delays"]
# Sentence 2: missing possessive
phrase_substitutions["tried", "best"] = ["tried", "their", "best"]
# Sentence 2: literal rendering of a time expression
phrase_substitutions["at", "recent", "days"] = ["recently"]
# Sentence 2: vague purpose phrase
phrase_substitutions["for", "paper", "and", "cooperation"] = ["on", "the", "paper", "and", "contributed", "to", "our", "collaboration"]
# Sentence 1: proper noun casing and article
phrase_substitutions["our", "dragon", "boat", "festival"] = ["the", "Dragon", "Boat", "Festival"]
# Sentence 1: literal rendering of a well-wishing phrase
phrase_substitutions["with", "all", "safe", "and", "great", "in", "our", "lives"] = ["to", "wish", "for", "safety", "and", "prosperity"]

# Grammar-based custom substitution / modification rules 
# Each rule is a dict of constraints plus a "replacement" token list.
# The matcher in this file compares "word" against the lower-cased token text and
# "lemma" / "upos" / "deprel" against the parsed word; "head_lemma" is compared with
# the lemma of the word's dependency head. A constraint key that is absent is not checked.
# An empty "replacement" list removes the matched word.
grammar_substitutions = [
    {
        "word": "to",               # Target word to replace (hence, we have a replacement)
        "lemma": "to",              # Base form of the word
        "upos": "PART",             # POS tag: particle
        "deprel": "mark",           # Dependency relation: marker (e.g., "to celebrate")
        "head_lemma": "celebrate",  # Ensure "to" is governed by the verb "celebrate"
        "replacement": ["we"],
    },
    {
        "word": "our",              # Target word to remove (hence, replacement is empty)
        # Universal Dependencies analyses possessive "our" as a pronoun attached to its
        # noun with nmod:poss, not as a DET/det determiner, so the rule must use those
        # labels to ever match.
        # NOTE(review): "lemma" is intentionally not constrained; the surface-form check
        # already pins the word, and the lemma a model reports for possessives may differ
        # ("our" vs. "we") - confirm against the installed model if a lemma check is wanted.
        "upos": "PRON",             # POS tag: pronoun (possessive)
        "deprel": "nmod:poss",      # Dependency relation: possessive nominal modifier (e.g., "our culture")
        "head_lemma": "culture",    # Ensure "our" is governed by the word "culture"
        "replacement": [],
    },
    {
        "word": "less",
        "lemma": "less",
        "upos": "ADJ",                  # POS tag: adjective
        "deprel": "amod",               # Dependency relation: adjectival modifier
        "head_lemma": "communication",  # Ensure "less" modifies "communication"
        "replacement": ["limited"],     # Replace "less" with "limited"
    },
    {
        "word": "they",             # Target word to remove (hence, replacement is empty)
        "lemma": "they",
        "upos": "PRON",             # POS tag: pronoun
        "deprel": "nsubj",          # Dependency relation: nominal subject (e.g., "they tried")
        "head_lemma": "try",        # Ensure "they" is governed by the verb "try"
        "replacement": [],
    },
]


# Function for applying phrase substitutions
def apply_phrase_substitutions(tokens, substitutions=None):
    """Replace known multi-token phrases in `tokens` with their corrections.

    Args:
        tokens: Sequence of token strings (must support slicing and `len`).
        substitutions: Optional mapping from a tuple of tokens to the list of
            replacement tokens. Defaults to the module-level
            `phrase_substitutions` table.

    Returns:
        A new list of tokens with every matched phrase replaced. Matching is
        exact (case-sensitive), scans left to right, and never overlaps.
    """
    if substitutions is None:
        substitutions = phrase_substitutions

    # Longest phrases are tried first so a short phrase cannot pre-empt a longer
    # one that contains it. Empty phrases are dropped: they would match at every
    # position without consuming a token and the scan would never advance.
    ordered = sorted(
        ((phrase, replacement) for phrase, replacement in substitutions.items() if phrase),
        key=lambda item: len(item[0]),
        reverse=True,
    )

    output = []
    position = 0
    total = len(tokens)

    while position < total:
        for phrase, replacement in ordered:
            width = len(phrase)
            if tuple(tokens[position:position + width]) == phrase:
                # Emit the correction and jump past the whole matched phrase
                output.extend(replacement)
                position += width
                break
        else:
            # No phrase starts here: keep the token unchanged
            output.append(tokens[position])
            position += 1

    return output
    
    
# Function for applying grammar-based substitutions
def _rule_matches(word, sentence, rule):
    """Return True when `word` satisfies every constraint present in `rule`."""
    # Surface form is compared case-insensitively; the rule stores it lower-cased
    if "word" in rule and word.text.lower() != rule["word"]:
        return False

    # Lemma, universal POS tag and dependency relation are compared verbatim
    for attribute in ("lemma", "upos", "deprel"):
        if attribute in rule and getattr(word, attribute) != rule[attribute]:
            return False

    if "head_lemma" in rule:
        # `head` is a 1-based index into sentence.words; 0 (or a missing value)
        # means the word has no governor, so a head constraint cannot hold.
        head_index = word.head or 0
        if head_index <= 0:
            return False
        if sentence.words[head_index - 1].lemma != rule["head_lemma"]:
            return False

    return True


def apply_grammar_substitutions(sentence, substitutions=None):
    """Apply the grammar-based substitution rules to one parsed sentence.

    Args:
        sentence: Parsed sentence exposing `.words`; each word provides `text`,
            `lemma`, `upos`, `deprel` and `head`.
        substitutions: Optional list of rule dicts. Defaults to the
            module-level `grammar_substitutions` table.

    Returns:
        A list of token strings. For each word the first matching rule's
        "replacement" tokens are emitted (an empty list drops the word);
        words matching no rule are emitted unchanged.
    """
    if substitutions is None:
        substitutions = grammar_substitutions

    output = []
    for word in sentence.words:
        # Only the first rule that matches is applied to a given word
        rule = next(
            (candidate for candidate in substitutions if _rule_matches(word, sentence, candidate)),
            None,
        )
        if rule is None:
            output.append(word.text)
        else:
            output.extend(rule["replacement"])

    return output


# Function for reconstructing the sentence with all the changes applied
def reconstruct(sentence):
    """Parse `sentence` and return it with all substitutions applied.

    Every parsed sentence goes through the grammar rules first and the phrase
    rules second; the resulting tokens of all sentences are joined with single
    spaces into one string.
    """
    parsed = nlp(sentence)

    return " ".join(
        token
        for parsed_sentence in parsed.sentences
        for token in apply_phrase_substitutions(
            apply_grammar_substitutions(parsed_sentence)
        )
    )


# Show each original sentence followed by its reconstruction
print("Original Sentence 1", sentence1, "Reconstructed Sentence 1", sep="\n")
print(reconstruct(sentence1))

# Visual gap between the two sentence reports
print("\n")

print("Original Sentence 2", sentence2, "Reconstructed Sentence 2", sep="\n")
print(reconstruct(sentence2))


['Today', 'is', 'our', 'dragon', 'boat', 'festival', 'in', 'our', 'Chinese', 'culture', 'to', 'celebrate', 'it', 'with', 'all', 'safe', 'and', 'great', 'in', 'our', 'lives']
['Anyway', 'I', 'believe', 'the', 'team', 'although', 'bit', 'delay', 'and', 'less', 'communication', 'at', 'recent', 'days', 'they', 'really', 'tried', 'best', 'for', 'paper', 'and', 'cooperation']


In [1]:
import stanza

# Fetch the English models; per the original note this is only needed once
stanza.download('en')

# Pipeline with tokenization, POS tagging, lemmatization and dependency parsing
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,lemma,depparse',
)

def reconstruct(sentence):
    """Rewrite awkward words in `sentence` using lemma and dependency cues.

    The sentence is parsed with the module-level pipeline; each word is then
    either replaced by a fixed correction (when its text and its dependency
    head or relation match one of the cases below) or copied through.

    Returns:
        The resulting tokens joined with single spaces, with the first
        character of the first token upper-cased.
    """
    doc = nlp(sentence)
    output_tokens = []

    for sent in doc.sentences:
        for word in sent.words:
            surface = word.text.lower()

            # Word.head is a 1-based index into sent.words (0 = root), not a
            # Word object, so the governor has to be looked up before its
            # lemma can be read.
            head = sent.words[word.head - 1] if word.head else None
            head_lemma = head.lemma if head is not None else None

            if surface == "bit" and head_lemma == "delay":
                # The head token "delay" is emitted when the loop reaches it,
                # so only the lead-in is added here to avoid "delay delay".
                output_tokens.extend(["a", "bit", "of", "a"])
            elif surface == "less" and head_lemma == "communication":
                output_tokens.append("reduced")
            elif surface == "best" and head_lemma == "try":
                output_tokens.extend(["their", "best"])
            elif surface == "it" and word.deprel == "obj":
                output_tokens.extend(["the", "festival"])
            else:
                output_tokens.append(word.text)

    # Upper-case only the first character; str.capitalize() would also
    # lower-case the rest of the token (e.g. an acronym).
    if output_tokens:
        first = output_tokens[0]
        output_tokens[0] = first[:1].upper() + first[1:]

    return " ".join(output_tokens)


# Sentences to reconstruct
sentence1 = "Today is our dragon boat festival, in our Chinese culture, to celebrate it with all safe and great in our lives."
sentence2 = "Anyway, I believe the team, although bit delay and less communication at recent days, they really tried best for paper and cooperation."

# Reconstruct both sentences before printing anything
reconstructed1, reconstructed2 = (
    reconstruct(text) for text in (sentence1, sentence2)
)

print("Reconstructed 1:", reconstructed1, sep="\n")
print("\nReconstructed 2:", reconstructed2, sep="\n")



ModuleNotFoundError: No module named 'stanza'