In [26]:
!pip install transformers



In [27]:
!pip install spacy



In [28]:
!pip install pyspellchecker



##Correcting punctuations

In [29]:
import re
def tokenize(text):
    tokens = re.findall(r"[\w']+|[.,!?;:()\"-]", text)
    return tokens

In [30]:
import re
def correct_punctuation(text):
    tokens = re.findall(r"[\w']+|[.,!?;:()\"-]", text)
    corrected_tokens = []
    previous_token = ""
    wh_words = {'who', 'what', 'when', 'where', 'why', 'how'}
    question_mark_needed = False
    for token in tokens:
        if token.lower() in wh_words:
            question_mark_needed = True
        if token == 'i':
            token = 'I'
        if token in [',', '.', ';', ':', '!', '?']:
            if previous_token and previous_token not in [',', '.', ';', ':', '!', '?']:
                corrected_tokens.append(token)
        elif token in ['"', "'", "(", ")", "[", "]"]:
            if token in ['"', "'"]:
                if corrected_tokens.count(token) % 2 == 0:
                    corrected_tokens.append(token)
                else:
                    corrected_tokens[-1] += token
            elif token == "(":
                corrected_tokens.append(token)
            elif token == ")":
                if corrected_tokens.count("(") > corrected_tokens.count(")"):
                    corrected_tokens.append(token)
            elif token == "[":
                corrected_tokens.append(token)
            elif token == "]":
                if corrected_tokens.count("[") > corrected_tokens.count("]"):
                    corrected_tokens.append(token)
        else:
            corrected_tokens.append(token)
        previous_token = token
     # Balance parentheses and brackets
    corrected_tokens.extend(")" * (corrected_tokens.count("(") - corrected_tokens.count(")")))
    corrected_tokens.extend("]" * (corrected_tokens.count("[") - corrected_tokens.count("]")))

    # Add a question mark at the end if needed
    if question_mark_needed and corrected_tokens[-1] not in '.!?':
        corrected_tokens.append('?')

    # Capitalize the first letter after a sentence
    for i in range(len(corrected_tokens) - 1):
        if corrected_tokens[i] in '.!?':
            if i + 1 < len(corrected_tokens) and corrected_tokens[i + 1].isalpha():
                corrected_tokens[i + 1] = corrected_tokens[i + 1].capitalize()

    # Capitalize the first letter of the text
    if corrected_tokens and corrected_tokens[0].isalpha():
        corrected_tokens[0] = corrected_tokens[0].capitalize()

    text = ' '.join(corrected_tokens)
    text = text.replace(" ,", ",")
    text = text.replace(" .", ".")
    text = text.replace(" !", "!")
    text = text.replace(" ?", "?")
    text = text.replace(" ;", ";")
    text = text.replace(" :", ":")
    text = text.replace(" ( ", " (")
    text = text.replace(" ) ", ") ")
    text = text.replace(" ' ", "'")
    text = text.replace(' "', '"')
    return text



##Spell checking

In [31]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [32]:
import spellchecker
import string
from spellchecker import SpellChecker
spell = SpellChecker()

In [33]:
#correct sentence
def spell_checker(sentence):
    corrected_sentence = []
    for word in sentence.split():
       word_part = ''.join([char for char in word if char.isalnum()])
       punctuation_part = ''.join([char for char in word if char in string.punctuation])
       if word_part.lower() not in spell:
           corrected_word = spell.correction(word_part)
           corrected_sentence.append(corrected_word + punctuation_part)
       else:
           corrected_sentence.append(word_part + punctuation_part)
    return ' '.join(corrected_sentence)

##Grammar correction using finetuned T5 transformer

In [74]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [75]:
tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [76]:
import torch
device= torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [77]:
model = T5ForConditionalGeneration.from_pretrained('google-t5/t5-small')

In [78]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [79]:
model.load_state_dict(torch.load("/content/drive/MyDrive/modelT5f4.pt",map_location=device))

<All keys matched successfully>

In [80]:
def corrector2(text):
    input_ids = tokenizer(text, max_length= 64,return_tensors="pt").input_ids
    outputs = model.generate(input_ids, max_length = 64)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

In [81]:
import nltk
nltk.download('punkt')
def correct_grammar2(paragraph):
    paragraph = correct_punctuation(paragraph)
    paragraph = spell_checker(paragraph)

    sentences = nltk.sent_tokenize(paragraph)
    corrected_sentences = []
    for sentence in sentences:
        corrected_sentence = corrector2(sentence)
        corrected_sentences.append(corrected_sentence)
    corrected_paragraph = ' '.join(corrected_sentences)
    return corrected_paragraph

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [82]:
sentence = 'how is you?'

In [83]:
correct_grammar2(sentence)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'How are you?'

##Tone improvements

In [84]:
!pip install gradio



In [85]:
import gradio as gr
from transformers import pipeline
import matplotlib.pyplot as plt
import io

# Load pre-trained emotion detection model
emotion_analyzer = pipeline('text-classification', model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)

def analyze_tone(text):
    results = emotion_analyzer(text)
    emotions = {result['label']: result['score'] for result in results[0]}

    # Plot the emotions as a bar chart
    fig, ax = plt.subplots()
    ax.bar(emotions.keys(), emotions.values())
    ax.set_xlabel('Emotions')
    ax.set_ylabel('Scores')
    ax.set_title('Emotion Analysis')
    return fig





##Vocab Enhancement

In [86]:
import nltk
import torch
from transformers import pipeline, BertTokenizer, BertForMaskedLM
from nltk.corpus import wordnet
from collections import Counter
import re

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize the BERT tokenizer and model
tokens = BertTokenizer.from_pretrained('bert-base-uncased')
models = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Initialize the fill-mask pipeline
fill_mask = pipeline('fill-mask', model=models, tokenizer=tokens)

# Function to map POS tag to WordNet POS tag
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def get_contextual_synonyms(word, sentence):
    # Mask the target word in the sentence
    masked_sentence = sentence.replace(word, tokens.mask_token, 1)
    # Get the top predictions for the masked word
    predictions = fill_mask(masked_sentence)
    # Extract the predicted synonyms
    synonyms = [pred['token_str'] for pred in predictions if pred['token_str'] != word]
    return synonyms

def enhance_sentence(sentence, frequent_words, replacement_fraction):
    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)

    # Number of words to replace based on the replacement fraction
    num_replacements = int(len(frequent_words) * replacement_fraction)
    replacements = 0

    enhanced_words = []
    for word, pos in pos_tags:
        lower_word = word.lower()
        if lower_word in frequent_words and get_wordnet_pos(pos) and replacements < num_replacements:
            synonyms = get_contextual_synonyms(word, sentence)
            if synonyms:
                synonym = synonyms[0]  # Replace with the first synonym
                enhanced_words.append(synonym)
                replacements += 1
            else:
                enhanced_words.append(word)
        else:
            enhanced_words.append(word)

    enhanced_sentence = re.sub(r'(\s)([?.!,"](?:\s|$))', r'\2', ' '.join(enhanced_words))
    return enhanced_sentence

def enhance_vocabulary(paragraph, replacement_fraction=0.3):
    # Split paragraph into sentences
    sentences = nltk.sent_tokenize(paragraph)

    # Combine sentences for frequency analysis
    all_words = nltk.word_tokenize(paragraph)
    all_pos_tags = nltk.pos_tag(all_words)

    words_lower = [word.lower() for word in all_words]
    word_freq = Counter(words_lower)

    # Select frequent words (appearing more than once) for replacement
    frequent_words = {word for word, freq in word_freq.items() if freq > 1}

    # Enhance each sentence
    enhanced_sentences = [enhance_sentence(sentence, frequent_words, replacement_fraction) for sentence in sentences]

    # Reassemble the sentences into a single paragraph
    enhanced_paragraph = ' '.join(enhanced_sentences)

    return enhanced_paragraph

# Example usage
paragraph = "Your long paragraph text goes here. Make sure to include various sentences. Each sentence will be processed individually."
enhanced_paragraph = enhance_vocabulary(paragraph)
print(enhanced_paragraph)
enhance_vocabulary('Outside the museum, a street performer were playing the violen. People stops to listen and throw coins into his hat. He plays beautifully, and the music echos through the streets. A little girl dances to the music, and her parents smiles and take pictures. The performer thank everyone for their generosity and continues to play.')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identi

Your long paragraph text goes here. Make sure to include various sentences. Each sentence will be processed individually.


'Outside the museum, a street performer were playing the violen. People stops to listen and throw coins into his hat. He plays beautifully, and the music echos through the streets. A little girl dances to the music, and her parents smiles and take pictures. The band thank everyone for their generosity and continues to play.'

##Gradio Interface

In [87]:
!pip install gradio



In [88]:
import gradio as gr

In [91]:
def process_text(text, enhance, analyze):
    corrected_text = correct_grammar2(text)
    enhanced_text = enhance_vocabulary(corrected_text) if enhance else ""
    tone_analysis_plot = analyze_tone(corrected_text) if analyze else None

    return corrected_text, enhanced_text, tone_analysis_plot

iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(lines=10, placeholder="Enter text here..."),
        gr.Checkbox(label="Enhance Vocabulary"),
        gr.Checkbox(label="Analyze Tone")
    ],
    outputs=[
        gr.Textbox(label="Corrected Grammar"),
        gr.Textbox(label="Enhanced Vocabulary"),
        gr.Plot( label="Tone Analysis Plot")
    ],
    title="Grammar Autocorrect",
    description="Corrects grammar mistakes. Optionally give tone analysis and tries to enhance vocabulary."
)

iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://85a84a9ded23f6a65d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


