In [175]:
# Sample essay paragraph, used further down as a quick input for the POS-tag frequency check.
# NOTE(review): the misspellings ("hav", "famaus", "rolle") look deliberate test material - confirm before "fixing" them.
text = "Cars. Cars hav been around since they became famaus in the 1900s, when Henry Ford created and built the first ModelT. Cars have played a major rolle in our every day lives since then. But now, people are starting to question if limiting car usage would be a good thing. To me, limiting the use of cars might be a good thing to do."


## I. Text Structure
### 1.1 Spelling Errors


In [176]:
# import packages
import nltk
import pandas as pd 
import string

from spellchecker import SpellChecker
from nltk import pos_tag
from nltk.tokenize import word_tokenize


# Download the NLTK data used by this notebook for tokenizing and POS tagging.
# The log below shows that packages which are already present are reported as up-to-date.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [177]:

# Handy functions 
def remove_punctuation(text):
    """
      Returns a copy of text with every character listed in
      string.punctuation deleted (all other characters are left as they are)
    """
    # A translate table that maps a code point to None deletes that character
    punctuation_to_drop = {ord(symbol): None for symbol in string.punctuation}

    return text.translate(punctuation_to_drop)






def correct_spelling(text):
    """
      Spell-checks text word by word and counts the corrections that were made

      Punctuation is removed first, the text is tokenized, and every token is
      lower-cased before it is handed to SpellChecker.correction.

      Returns a tuple (corrected_text, spelling_errors_count):
        corrected_text        : the tokens joined by single spaces. Tokens that got a
                                suggestion are replaced by it (lower-case); tokens for
                                which the checker returns None are kept as written.
                                Only meant for debugging / checking the output.
        spelling_errors_count : number of tokens whose suggestion differs
                                (case-insensitively) from the token itself. Tokens
                                without a suggestion are NOT counted.
    """
    spell = SpellChecker()
    words = word_tokenize(remove_punctuation(text))

    corrected_words = []
    spelling_errors_count = 0
    for word in words:
        lowered = word.lower()
        # Ask the checker once per word; the lookup used to run twice for every token
        suggestion = spell.correction(lowered)
        if suggestion is None:
            # No candidate available: keep the original token and do not count it
            corrected_words.append(word)
            continue

        corrected_words.append(suggestion)
        if suggestion.lower() != lowered:
            spelling_errors_count += 1

    corrected_text = ' '.join(corrected_words)

    return corrected_text, spelling_errors_count   # corrected_text is only for debugging, we are interested in spelling_errors_count

# Example usage


# Load the evaluation dataset (test_text.csv). It was generated with ChatGPT and has two
# columns {original_text, wrong_spelling}: pairs of an original text and the same text
# with spelling errors scattered through it.
df = pd.read_csv('test_text.csv')

# correct_spelling returns one (corrected_text, count) tuple per row;
# zip(*...) transposes those tuples into the two new columns.
spelling_results = df['wrong_spelling'].apply(correct_spelling)
df['corrected_text'], df['spelling_errors_count'] = zip(*spelling_results)

## Debug only: print every row so the corrections can be checked by eye
preview_rows = zip(
    df['original_text'],
    df['wrong_spelling'],
    df['corrected_text'],
    df['spelling_errors_count'],
)
for original, misspelled, corrected, error_count in preview_rows:
    print("original: ", original)
    print("Wrong: ", misspelled)
    print("Corrected: ", corrected)
    print("Spelling Errors Count: ", error_count)
    print('')







original:  The quick brown fox jumps over the lazy dog.
Wrong:  The quick brown fox jumps over the lazy dog.
Corrected:  the quick brown fox jumps over the lazy dog
Spelling Errors Count:  0

original:  Test Test Test Test
Wrong:   Test Test Test Test.
Corrected:  test test test test
Spelling Errors Count:  0

original:  The quick brown fox jumps over the lazy dog.
Wrong:  The quik brown fox jumps ovr the lazy dog.
Corrected:  the quit brown fox jumps or the lazy dog
Spelling Errors Count:  2

original:  Artificial intelligence is changing the world.
Wrong:  Artifical inteligence is changing the world.
Corrected:  artificial intelligence is changing the world
Spelling Errors Count:  2

original:  Programming languages are essential for software development.
Wrong:  Programing languages are essntial for software developement.
Corrected:  programing languages are essential for software development
Spelling Errors Count:  2

original:  Data analysis plays a crucial role in decision-making

In [178]:
# Final result of the spelling pre-processing: one row per text with the raw input
# and its spelling-error count. The helper columns (original_text, corrected_text)
# were only used to visualise and debug, so they are left out of the selection.
#
# rename() ignores a missing 'wrong_spelling' column, so this cell can be re-run
# without a KeyError; .copy() makes the result an independent frame, so the column
# assignments in the following cells do not operate on a slice of the old one.
df = df.rename(columns={'wrong_spelling': 'input_text'})
df = df[['input_text', 'spelling_errors_count']].copy()
df


Unnamed: 0,input_text,spelling_errors_count
0,The quick brown fox jumps over the lazy dog.,0
1,Test Test Test Test.,0
2,The quik brown fox jumps ovr the lazy dog.,2
3,Artifical inteligence is changing the world.,2
4,Programing languages are essntial for software...,2
5,Data anlysis plays a crucial role in decision-...,1
6,I love explorng new tecnologies and learning n...,2
7,The son sets in the west.,0
8,Codeing is a valuable skil.,2
9,Learnig new thing is always excting.,2


### 2.1 POS Tagging

Tag every word of the input text with its part of speech and compute the share of each tag per text.

In [179]:
def generate_pos_tags(text):
    """
        Strips punctuation from text, tokenizes it and returns
        the result of pos_tag on those tokens
    """
    tokens = word_tokenize(remove_punctuation(text))

    return pos_tag(tokens)

# Counting the number of words in the input text
def count_words(text):
    """
        Returns how many tokens remain in text once punctuation
        has been removed and the text has been tokenized
    """
    return len(word_tokenize(remove_punctuation(text)))





# frequency of each POS tag in the text
from collections import Counter

def calc_frequency_pos_tags(text):
    """
        Calculates the relative frequency (in percent) of each POS tag in the text

        Returns a dict {tag: percentage}, in order of first appearance of the tag.
        The percentages are not rounded, so that they add up to 100 for a non-empty
        text; an empty text gives an empty dict.
    """
    tags = generate_pos_tags(text)
    # generate_pos_tags yields one (word, tag) pair per token, so len(tags) is the
    # word count; no need to strip and tokenize the text a second time.
    total_words = len(tags)
    tag_counts = Counter(tag for _word, tag in tags)  # number of occurrences of each POS tag

    # if we do round() the total frequency will be under or over 100% due to rounding errors
    return {tag: (count / total_words) * 100 for tag, count in tag_counts.items()}


# Try it on the dataset

# TODO (open question): kept disabled until we decide whether we need it. Should this be a list of
# tuples of POS tags per sentence instead of per word, like [("DT","JJ","NN"),("D","N","V")]?
# The N-grams computed later probably carry more meaning than this.
#df['POS tags'] = df['input_text'].apply(lambda x: generate_pos_tags(x))

df['POS tags Frequency'] = df['input_text'].apply(calc_frequency_pos_tags)

### Same check on the initial testing text that is defined at the beginning of this file
print(" frequency of each POS tag in the text:", calc_frequency_pos_tags(text))

# Only a sanity check for lost frequency: the total should be 100% for each row
df['Total frequency'] = df['POS tags Frequency'].apply(lambda frequencies: sum(frequencies.values()))

df


 frequency of each POS tag in the text: {'NNS': 4.6875, 'NNP': 7.8125, 'VBD': 6.25, 'VBN': 3.125, 'IN': 10.9375, 'PRP': 3.125, 'NN': 12.5, 'DT': 10.9375, 'CD': 1.5625, 'WRB': 1.5625, 'CC': 3.125, 'JJ': 7.8125, 'VBP': 3.125, 'PRP$': 1.5625, 'VBZ': 1.5625, 'RB': 3.125, 'VBG': 3.125, 'TO': 4.6875, 'VB': 6.25, 'MD': 3.125}


Unnamed: 0,input_text,spelling_errors_count,POS tags Frequency,Total frequency
0,The quick brown fox jumps over the lazy dog.,0,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...",100.0
1,Test Test Test Test.,0,"{'NNP': 75.0, 'NN': 25.0}",100.0
2,The quik brown fox jumps ovr the lazy dog.,2,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...",100.0
3,Artifical inteligence is changing the world.,2,"{'JJ': 16.666666666666664, 'NN': 33.3333333333...",100.0
4,Programing languages are essntial for software...,2,"{'VBG': 14.285714285714285, 'NNS': 14.28571428...",100.0
5,Data anlysis plays a crucial role in decision-...,1,"{'NNP': 12.5, 'NN': 25.0, 'VBZ': 12.5, 'DT': 1...",100.0
6,I love explorng new tecnologies and learning n...,2,"{'PRP': 11.11111111111111, 'VBP': 11.111111111...",100.0
7,The son sets in the west.,0,"{'DT': 33.33333333333333, 'NN': 33.33333333333...",100.0
8,Codeing is a valuable skil.,2,"{'NNP': 20.0, 'VBZ': 20.0, 'DT': 20.0, 'JJ': 2...",100.0
9,Learnig new thing is always excting.,2,"{'NNP': 16.666666666666664, 'JJ': 16.666666666...",100.0


In [180]:
# Dropping the column that we don't need anymore ['Total frequency'].
# errors='ignore' keeps this cell re-runnable: a second run no longer fails with a KeyError.
df = df.drop(columns=['Total frequency'], errors='ignore')

### 2.1.1 N-grams

In [181]:
from nltk import ngrams

def generate_ngrams_pos(tokens, n):
    """
        Generates the n-grams of POS tags for the given text

        tokens : the raw text (a string, despite the name); punctuation is removed
                 and the text is tokenized here before tagging
        n      : size of each n-gram, must be >= 1

        Returns a list of tuples of n consecutive POS tags, in text order.
        The list is empty when the text has fewer than n words.
        Raises ValueError when n is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    text = remove_punctuation(tokens)
    words = word_tokenize(text)

    # Keep only the tag of every (word, tag) pair
    tag_sequence = [tag for _word, tag in pos_tag(words)]

    # Slide a window of size n over the tags; n used to be ignored (always 2-grams)
    return list(ngrams(tag_sequence, n))


# Try it on the dataset
df['2-grams'] = df['input_text'].apply(generate_ngrams_pos, args=(2,))

# Print every row in full so the results can be read clearly
result_rows = zip(
    df['input_text'],
    df['spelling_errors_count'],
    df['POS tags Frequency'],
    df['2-grams'],
)
for input_text, error_count, tag_frequency, bigrams in result_rows:
    print("Input_text: ", input_text)
    print("Spelling Errors Count: ", error_count)
    print('')
    print("POS tags Frequency: ", tag_frequency)
    print("2-grams: ", bigrams)

df

Input_text:  The quick brown fox jumps over the lazy dog.
Spelling Errors Count:  0

POS tags Frequency:  {'DT': 22.22222222222222, 'JJ': 22.22222222222222, 'NN': 33.33333333333333, 'VBZ': 11.11111111111111, 'IN': 11.11111111111111}
2-grams:  [('DT', 'JJ'), ('JJ', 'NN'), ('NN', 'NN'), ('NN', 'VBZ'), ('VBZ', 'IN'), ('IN', 'DT'), ('DT', 'JJ'), ('JJ', 'NN')]
Input_text:   Test Test Test Test.
Spelling Errors Count:  0

POS tags Frequency:  {'NNP': 75.0, 'NN': 25.0}
2-grams:  [('NNP', 'NNP'), ('NNP', 'NNP'), ('NNP', 'NN')]
Input_text:  The quik brown fox jumps ovr the lazy dog.
Spelling Errors Count:  2

POS tags Frequency:  {'DT': 22.22222222222222, 'JJ': 22.22222222222222, 'NN': 33.33333333333333, 'NNS': 11.11111111111111, 'VBP': 11.11111111111111}
2-grams:  [('DT', 'JJ'), ('JJ', 'JJ'), ('JJ', 'NN'), ('NN', 'NNS'), ('NNS', 'VBP'), ('VBP', 'DT'), ('DT', 'NN'), ('NN', 'NN')]
Input_text:  Artifical inteligence is changing the world.
Spelling Errors Count:  2

POS tags Frequency:  {'JJ': 16.

Unnamed: 0,input_text,spelling_errors_count,POS tags Frequency,2-grams
0,The quick brown fox jumps over the lazy dog.,0,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, NN), (NN, NN), (NN, VBZ), (VBZ..."
1,Test Test Test Test.,0,"{'NNP': 75.0, 'NN': 25.0}","[(NNP, NNP), (NNP, NNP), (NNP, NN)]"
2,The quik brown fox jumps ovr the lazy dog.,2,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, JJ), (JJ, NN), (NN, NNS), (NNS..."
3,Artifical inteligence is changing the world.,2,"{'JJ': 16.666666666666664, 'NN': 33.3333333333...","[(JJ, NN), (NN, VBZ), (VBZ, VBG), (VBG, DT), (..."
4,Programing languages are essntial for software...,2,"{'VBG': 14.285714285714285, 'NNS': 14.28571428...","[(VBG, NNS), (NNS, VBP), (VBP, JJ), (JJ, IN), ..."
5,Data anlysis plays a crucial role in decision-...,1,"{'NNP': 12.5, 'NN': 25.0, 'VBZ': 12.5, 'DT': 1...","[(NNP, NN), (NN, VBZ), (VBZ, DT), (DT, JJ), (J..."
6,I love explorng new tecnologies and learning n...,2,"{'PRP': 11.11111111111111, 'VBP': 11.111111111...","[(PRP, VBP), (VBP, RB), (RB, JJ), (JJ, NNS), (..."
7,The son sets in the west.,0,"{'DT': 33.33333333333333, 'NN': 33.33333333333...","[(DT, NN), (NN, VBZ), (VBZ, IN), (IN, DT), (DT..."
8,Codeing is a valuable skil.,2,"{'NNP': 20.0, 'VBZ': 20.0, 'DT': 20.0, 'JJ': 2...","[(NNP, VBZ), (VBZ, DT), (DT, JJ), (JJ, NN)]"
9,Learnig new thing is always excting.,2,"{'NNP': 16.666666666666664, 'JJ': 16.666666666...","[(NNP, JJ), (JJ, NN), (NN, VBZ), (VBZ, RB), (R..."


In [182]:
# Calculate the frequency of each n-gram with POS tags

def calc_frequency_ngrams(tokens, n):
    """
        Calculates the relative frequency (in percent) of each POS-tag n-gram
        returned by generate_ngrams_pos(tokens, n)

        Returns a dict {ngram: percentage}, in order of first appearance.
        The percentages are not rounded; no n-grams gives an empty dict.
    """
    pos_ngrams = generate_ngrams_pos(tokens, n)
    ngram_total = len(pos_ngrams)
    ngram_counts = Counter(pos_ngrams)  # number of occurrences of each n-gram

    # if we do round() the total frequency will be under or over 100% due to rounding errors
    return {gram: (count / ngram_total) * 100 for gram, count in ngram_counts.items()}



# Try it on the dataset
df['2-grams frequency'] = df['input_text'].apply(calc_frequency_ngrams, args=(2,))

# Only a sanity check for lost frequency: the total should be 100% for each row
df['ngrams_frequency_total'] = df['2-grams frequency'].apply(lambda frequencies: sum(frequencies.values()))

df

Unnamed: 0,input_text,spelling_errors_count,POS tags Frequency,2-grams,2-grams frequency,ngrams_frequency_total
0,The quick brown fox jumps over the lazy dog.,0,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, NN), (NN, NN), (NN, VBZ), (VBZ...","{('DT', 'JJ'): 25.0, ('JJ', 'NN'): 25.0, ('NN'...",100.0
1,Test Test Test Test.,0,"{'NNP': 75.0, 'NN': 25.0}","[(NNP, NNP), (NNP, NNP), (NNP, NN)]","{('NNP', 'NNP'): 66.66666666666666, ('NNP', 'N...",100.0
2,The quik brown fox jumps ovr the lazy dog.,2,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, JJ), (JJ, NN), (NN, NNS), (NNS...","{('DT', 'JJ'): 12.5, ('JJ', 'JJ'): 12.5, ('JJ'...",100.0
3,Artifical inteligence is changing the world.,2,"{'JJ': 16.666666666666664, 'NN': 33.3333333333...","[(JJ, NN), (NN, VBZ), (VBZ, VBG), (VBG, DT), (...","{('JJ', 'NN'): 20.0, ('NN', 'VBZ'): 20.0, ('VB...",100.0
4,Programing languages are essntial for software...,2,"{'VBG': 14.285714285714285, 'NNS': 14.28571428...","[(VBG, NNS), (NNS, VBP), (VBP, JJ), (JJ, IN), ...","{('VBG', 'NNS'): 16.666666666666664, ('NNS', '...",100.0
5,Data anlysis plays a crucial role in decision-...,1,"{'NNP': 12.5, 'NN': 25.0, 'VBZ': 12.5, 'DT': 1...","[(NNP, NN), (NN, VBZ), (VBZ, DT), (DT, JJ), (J...","{('NNP', 'NN'): 14.285714285714285, ('NN', 'VB...",100.0
6,I love explorng new tecnologies and learning n...,2,"{'PRP': 11.11111111111111, 'VBP': 11.111111111...","[(PRP, VBP), (VBP, RB), (RB, JJ), (JJ, NNS), (...","{('PRP', 'VBP'): 12.5, ('VBP', 'RB'): 12.5, ('...",100.0
7,The son sets in the west.,0,"{'DT': 33.33333333333333, 'NN': 33.33333333333...","[(DT, NN), (NN, VBZ), (VBZ, IN), (IN, DT), (DT...","{('DT', 'NN'): 40.0, ('NN', 'VBZ'): 20.0, ('VB...",100.0
8,Codeing is a valuable skil.,2,"{'NNP': 20.0, 'VBZ': 20.0, 'DT': 20.0, 'JJ': 2...","[(NNP, VBZ), (VBZ, DT), (DT, JJ), (JJ, NN)]","{('NNP', 'VBZ'): 25.0, ('VBZ', 'DT'): 25.0, ('...",100.0
9,Learnig new thing is always excting.,2,"{'NNP': 16.666666666666664, 'JJ': 16.666666666...","[(NNP, JJ), (JJ, NN), (NN, VBZ), (VBZ, RB), (R...","{('NNP', 'JJ'): 20.0, ('JJ', 'NN'): 20.0, ('NN...",100.0


In [183]:
# Dropping the column that we don't need anymore ['ngrams_frequency_total'].
# errors='ignore' keeps this cell re-runnable: a second run no longer fails with a KeyError.
df = df.drop(columns=['ngrams_frequency_total'], errors='ignore')
