In [145]:
text = "Cars. Cars hav been around since they became famaus in the 1900s, when Henry Ford created and built the first ModelT. Cars have played a major rolle in our every day lives since then. But now, people are starting to question if limiting car usage would be a good thing. To me, limiting the use of cars might be a good thing to do."


## I. Text Structure : 
###     1.1 Spelling Errors


In [146]:
# import packages
import nltk
import pandas as pd 
import string

from spellchecker import SpellChecker
from nltk import pos_tag
from nltk.tokenize import word_tokenize


# Download the NLTK data for POS tagging
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [147]:

# Handy functions 
def remove_punctuation(text):
    """
      Removes punctuation from text
    """
    translator = str.maketrans("", "", string.punctuation)
    text_without_punctuations = text.translate(translator)

    return text_without_punctuations






def correct_spelling(text):
    spell = SpellChecker()
    spelling_errors_count = 0
    text = remove_punctuation(text)
    words = word_tokenize(text)
    
    corrected_words = [spell.correction(word.lower()) if spell.correction(word.lower()) is not None else word for word in words]
    # Counting the number of words that have been corrected
    for word in words: 
        if spell.correction(word.lower()) != word.lower():
            spelling_errors_count += 1
    
    corrected_text = ' '.join(corrected_words)
    spelling_errors_rate = spelling_errors_count/len(words)

    return corrected_text, spelling_errors_rate   # corrected_text is only to debug and check the output, we are interested in spelling_errors_count 

# Example usage


# Load dataset {test_tesxt.csv} I tried to generate a data set using CHATGBT, dataset consist of two columns {original_text, wrong_spelling}  pairs of origianl text and the same text with wrong spelling errors scatered in the text.
df = pd.read_csv('test_text.csv')



df['corrected_text'], df['spelling_errors_rate'] = zip(*df['wrong_spelling'].apply(correct_spelling))  #  using * to unpack the zip into the two columns.


## This is Only for testing the output and confirm the results
for i in range(len(df)):

    print( "original: ",df['original_text'][i])
    print("Wrong: ",df['wrong_spelling'][i])
    print("Corrected: ", df['corrected_text'][i])
    print("Spelling Errors Rate: ", df['spelling_errors_rate'][i])
    print('')   







original:  The quick brown fox jumps over the lazy dog.
Wrong:  The quick brown fox jumps over the lazy dog.
Corrected:  the quick brown fox jumps over the lazy dog
Spelling Errors Rate:  0.0

original:  Test Test Test Test
Wrong:   Test Test Test Test.
Corrected:  test test test test
Spelling Errors Rate:  0.0

original:  The quick brown fox jumps over the lazy dog.
Wrong:  The quik brown fox jumps ovr the lazy dog.
Corrected:  the quit brown fox jumps or the lazy dog
Spelling Errors Rate:  0.2222222222222222

original:  Artificial intelligence is changing the world.
Wrong:  Artifical inteligence is changing the world.
Corrected:  artificial intelligence is changing the world
Spelling Errors Rate:  0.3333333333333333

original:  Programming languages are essential for software development.
Wrong:  Programing languages are essntial for software developement.
Corrected:  programing languages are essential for software development
Spelling Errors Rate:  0.2857142857142857

original:  Dat

In [148]:
# Final result of pre-processing of Spelling errors should look like this
# This block of code just to remove the columns that we don't need anymore ( were used just to visualise and debug) and re-arrange the columns
df['input_text'] = df['wrong_spelling'] 
del df['wrong_spelling']
del df['original_text']
df = df[['input_text', 'spelling_errors_rate']]
df


Unnamed: 0,input_text,spelling_errors_rate
0,The quick brown fox jumps over the lazy dog.,0.0
1,Test Test Test Test.,0.0
2,The quik brown fox jumps ovr the lazy dog.,0.222222
3,Artifical inteligence is changing the world.,0.333333
4,Programing languages are essntial for software...,0.285714
5,Data anlysis plays a crucial role in decision-...,0.25
6,I love explorng new tecnologies and learning n...,0.222222
7,The son sets in the west.,0.0
8,Codeing is a valuable skil.,0.4
9,Learnig new thing is always excting.,0.333333


### 2.1 POS Tagging

### 

In [149]:
def generate_pos_tags(text):
    """
        Generates POS tags for the given text 
    """
    text = remove_punctuation(text)
    words = word_tokenize(text)

    pos_tags = pos_tag(words)

    return pos_tags

# Counting the number of words in the input text
def count_words(text):
    """
        Counts the number of words in the given text
    """
    clean_text = remove_punctuation(text)
    words = word_tokenize(clean_text)

    #print(words)
    return len(words)





# frequency of each POS tag in the text
from collections import Counter

def calc_frequency_pos_tags(text):
    tags = generate_pos_tags(text)
    total_words = count_words(text)
    freq_pos_tags = Counter(tag for word, tag in tags)  # This here returns a dictionary with the count of each POS tag in the text
    freq_pos_tags_dict = dict(freq_pos_tags) # Convert the Counter object to a dictionary
    for key, value in freq_pos_tags_dict.items(): # Calculate the frequency of each POS tag
        freq_pos_tags_dict[key] = (value/total_words)*100  # if we do round ()  total frequency will be under or over 100% due to rounding errors

    return freq_pos_tags_dict


# test on dataset

#df['POS tags'] = df['input_text'].apply(lambda x: generate_pos_tags(x)) # I left this until we decide if we need it or not because i have q question about it, => Should we do a list of tuples of POS  tags of sentences instead of individual words? like this [("DT","JJ","NN"),("D","N","V")] ???  because we have N-grams later which i guess holds more meaning than this.

df['POS tags Frequency'] = df['input_text'].apply(lambda x: calc_frequency_pos_tags(x))
df





### This to text on initial text it is defined in the begining of this file, it was used as initial testing text.
print(" frequency of each POS tag in the text:", calc_frequency_pos_tags(text))

df['Total frequency'] = df['POS tags Frequency'].apply(lambda x: sum(x.values())) # This is just to test if we have loss of frequency, it should be 100% for each row


df


 frequency of each POS tag in the text: {'NNS': 4.6875, 'NNP': 7.8125, 'VBD': 6.25, 'VBN': 3.125, 'IN': 10.9375, 'PRP': 3.125, 'NN': 12.5, 'DT': 10.9375, 'CD': 1.5625, 'WRB': 1.5625, 'CC': 3.125, 'JJ': 7.8125, 'VBP': 3.125, 'PRP$': 1.5625, 'VBZ': 1.5625, 'RB': 3.125, 'VBG': 3.125, 'TO': 4.6875, 'VB': 6.25, 'MD': 3.125}


Unnamed: 0,input_text,spelling_errors_rate,POS tags Frequency,Total frequency
0,The quick brown fox jumps over the lazy dog.,0.0,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...",100.0
1,Test Test Test Test.,0.0,"{'NNP': 75.0, 'NN': 25.0}",100.0
2,The quik brown fox jumps ovr the lazy dog.,0.222222,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...",100.0
3,Artifical inteligence is changing the world.,0.333333,"{'JJ': 16.666666666666664, 'NN': 33.3333333333...",100.0
4,Programing languages are essntial for software...,0.285714,"{'VBG': 14.285714285714285, 'NNS': 14.28571428...",100.0
5,Data anlysis plays a crucial role in decision-...,0.25,"{'NNP': 12.5, 'NN': 25.0, 'VBZ': 12.5, 'DT': 1...",100.0
6,I love explorng new tecnologies and learning n...,0.222222,"{'PRP': 11.11111111111111, 'VBP': 11.111111111...",100.0
7,The son sets in the west.,0.0,"{'DT': 33.33333333333333, 'NN': 33.33333333333...",100.0
8,Codeing is a valuable skil.,0.4,"{'NNP': 20.0, 'VBZ': 20.0, 'DT': 20.0, 'JJ': 2...",100.0
9,Learnig new thing is always excting.,0.333333,"{'NNP': 16.666666666666664, 'JJ': 16.666666666...",100.0


In [150]:
# Droping the columns that we don't need anymore ['Total frequency']
del df['Total frequency']

### 2.1.1 N-grams

In [151]:
from nltk import ngrams

def generate_ngrams_pos(tokens, n):
    """
        Generates n-grams with POS tags for the given tokens
    """
    text = remove_punctuation(tokens)
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    ngrams_result = list(ngrams(words, n))
   
    
    # Combine n-grams and POS tags
    ngrams_result = [(pos_tags[i][1], pos_tags[i+1][1]) for i in range(len(pos_tags)-1)]
    
    return ngrams_result


# test on dataset
df['2-grams'] = df['input_text'].apply(lambda x: generate_ngrams_pos(x, 2))

# To see results clearly 
for i in range(len(df)):

 
    print("Input_text: ",df['input_text'][i])
    print("Spelling Errors Rate: ", df['spelling_errors_rate'][i])
    print('')   
    print("POS tags Frequency: ", df['POS tags Frequency'][i])

    print("2-grams: ", df['2-grams'][i])

df

Input_text:  The quick brown fox jumps over the lazy dog.
Spelling Errors Rate:  0.0

POS tags Frequency:  {'DT': 22.22222222222222, 'JJ': 22.22222222222222, 'NN': 33.33333333333333, 'VBZ': 11.11111111111111, 'IN': 11.11111111111111}
2-grams:  [('DT', 'JJ'), ('JJ', 'NN'), ('NN', 'NN'), ('NN', 'VBZ'), ('VBZ', 'IN'), ('IN', 'DT'), ('DT', 'JJ'), ('JJ', 'NN')]
Input_text:   Test Test Test Test.
Spelling Errors Rate:  0.0

POS tags Frequency:  {'NNP': 75.0, 'NN': 25.0}
2-grams:  [('NNP', 'NNP'), ('NNP', 'NNP'), ('NNP', 'NN')]
Input_text:  The quik brown fox jumps ovr the lazy dog.
Spelling Errors Rate:  0.2222222222222222

POS tags Frequency:  {'DT': 22.22222222222222, 'JJ': 22.22222222222222, 'NN': 33.33333333333333, 'NNS': 11.11111111111111, 'VBP': 11.11111111111111}
2-grams:  [('DT', 'JJ'), ('JJ', 'JJ'), ('JJ', 'NN'), ('NN', 'NNS'), ('NNS', 'VBP'), ('VBP', 'DT'), ('DT', 'NN'), ('NN', 'NN')]
Input_text:  Artifical inteligence is changing the world.
Spelling Errors Rate:  0.333333333333333

Unnamed: 0,input_text,spelling_errors_rate,POS tags Frequency,2-grams
0,The quick brown fox jumps over the lazy dog.,0.0,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, NN), (NN, NN), (NN, VBZ), (VBZ..."
1,Test Test Test Test.,0.0,"{'NNP': 75.0, 'NN': 25.0}","[(NNP, NNP), (NNP, NNP), (NNP, NN)]"
2,The quik brown fox jumps ovr the lazy dog.,0.222222,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, JJ), (JJ, NN), (NN, NNS), (NNS..."
3,Artifical inteligence is changing the world.,0.333333,"{'JJ': 16.666666666666664, 'NN': 33.3333333333...","[(JJ, NN), (NN, VBZ), (VBZ, VBG), (VBG, DT), (..."
4,Programing languages are essntial for software...,0.285714,"{'VBG': 14.285714285714285, 'NNS': 14.28571428...","[(VBG, NNS), (NNS, VBP), (VBP, JJ), (JJ, IN), ..."
5,Data anlysis plays a crucial role in decision-...,0.25,"{'NNP': 12.5, 'NN': 25.0, 'VBZ': 12.5, 'DT': 1...","[(NNP, NN), (NN, VBZ), (VBZ, DT), (DT, JJ), (J..."
6,I love explorng new tecnologies and learning n...,0.222222,"{'PRP': 11.11111111111111, 'VBP': 11.111111111...","[(PRP, VBP), (VBP, RB), (RB, JJ), (JJ, NNS), (..."
7,The son sets in the west.,0.0,"{'DT': 33.33333333333333, 'NN': 33.33333333333...","[(DT, NN), (NN, VBZ), (VBZ, IN), (IN, DT), (DT..."
8,Codeing is a valuable skil.,0.4,"{'NNP': 20.0, 'VBZ': 20.0, 'DT': 20.0, 'JJ': 2...","[(NNP, VBZ), (VBZ, DT), (DT, JJ), (JJ, NN)]"
9,Learnig new thing is always excting.,0.333333,"{'NNP': 16.666666666666664, 'JJ': 16.666666666...","[(NNP, JJ), (JJ, NN), (NN, VBZ), (VBZ, RB), (R..."


In [152]:
# Calculate the frequency of each n-gram with POS tags

def calc_frequency_ngrams(tokens, n):
    ngrams = generate_ngrams_pos(tokens, n)
    total_ngrams = len(ngrams)
    freq_pos_tags = Counter(ngram for ngram in ngrams)  # This here returns a dictionary with the count of each POS tag in the text
    freq_ngrams_dict = dict(freq_pos_tags) # Convert the Counter object to a dictionary
    for key, value in freq_ngrams_dict.items(): # Calculate the frequency of each POS tag
        freq_ngrams_dict[key] = (value/total_ngrams)*100  # if we do round ()  total frequency will be under or over 100% due to rounding errors

    return freq_ngrams_dict



# test on dataset
df['2-grams frequency'] = df['input_text'].apply(lambda x: calc_frequency_ngrams(x, 2))
df['ngrams_frequency_total'] = df['2-grams frequency'].apply(lambda x: sum(x.values())) # This is just to test if we have loss of frequency, it should be 100% for each row


df

Unnamed: 0,input_text,spelling_errors_rate,POS tags Frequency,2-grams,2-grams frequency,ngrams_frequency_total
0,The quick brown fox jumps over the lazy dog.,0.0,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, NN), (NN, NN), (NN, VBZ), (VBZ...","{('DT', 'JJ'): 25.0, ('JJ', 'NN'): 25.0, ('NN'...",100.0
1,Test Test Test Test.,0.0,"{'NNP': 75.0, 'NN': 25.0}","[(NNP, NNP), (NNP, NNP), (NNP, NN)]","{('NNP', 'NNP'): 66.66666666666666, ('NNP', 'N...",100.0
2,The quik brown fox jumps ovr the lazy dog.,0.222222,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, JJ), (JJ, NN), (NN, NNS), (NNS...","{('DT', 'JJ'): 12.5, ('JJ', 'JJ'): 12.5, ('JJ'...",100.0
3,Artifical inteligence is changing the world.,0.333333,"{'JJ': 16.666666666666664, 'NN': 33.3333333333...","[(JJ, NN), (NN, VBZ), (VBZ, VBG), (VBG, DT), (...","{('JJ', 'NN'): 20.0, ('NN', 'VBZ'): 20.0, ('VB...",100.0
4,Programing languages are essntial for software...,0.285714,"{'VBG': 14.285714285714285, 'NNS': 14.28571428...","[(VBG, NNS), (NNS, VBP), (VBP, JJ), (JJ, IN), ...","{('VBG', 'NNS'): 16.666666666666664, ('NNS', '...",100.0
5,Data anlysis plays a crucial role in decision-...,0.25,"{'NNP': 12.5, 'NN': 25.0, 'VBZ': 12.5, 'DT': 1...","[(NNP, NN), (NN, VBZ), (VBZ, DT), (DT, JJ), (J...","{('NNP', 'NN'): 14.285714285714285, ('NN', 'VB...",100.0
6,I love explorng new tecnologies and learning n...,0.222222,"{'PRP': 11.11111111111111, 'VBP': 11.111111111...","[(PRP, VBP), (VBP, RB), (RB, JJ), (JJ, NNS), (...","{('PRP', 'VBP'): 12.5, ('VBP', 'RB'): 12.5, ('...",100.0
7,The son sets in the west.,0.0,"{'DT': 33.33333333333333, 'NN': 33.33333333333...","[(DT, NN), (NN, VBZ), (VBZ, IN), (IN, DT), (DT...","{('DT', 'NN'): 40.0, ('NN', 'VBZ'): 20.0, ('VB...",100.0
8,Codeing is a valuable skil.,0.4,"{'NNP': 20.0, 'VBZ': 20.0, 'DT': 20.0, 'JJ': 2...","[(NNP, VBZ), (VBZ, DT), (DT, JJ), (JJ, NN)]","{('NNP', 'VBZ'): 25.0, ('VBZ', 'DT'): 25.0, ('...",100.0
9,Learnig new thing is always excting.,0.333333,"{'NNP': 16.666666666666664, 'JJ': 16.666666666...","[(NNP, JJ), (JJ, NN), (NN, VBZ), (VBZ, RB), (R...","{('NNP', 'JJ'): 20.0, ('JJ', 'NN'): 20.0, ('NN...",100.0


In [153]:
# Droping the columns that we don't need anymore ['ngrams_frequency_total']
del df['ngrams_frequency_total']
df

Unnamed: 0,input_text,spelling_errors_rate,POS tags Frequency,2-grams,2-grams frequency
0,The quick brown fox jumps over the lazy dog.,0.0,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, NN), (NN, NN), (NN, VBZ), (VBZ...","{('DT', 'JJ'): 25.0, ('JJ', 'NN'): 25.0, ('NN'..."
1,Test Test Test Test.,0.0,"{'NNP': 75.0, 'NN': 25.0}","[(NNP, NNP), (NNP, NNP), (NNP, NN)]","{('NNP', 'NNP'): 66.66666666666666, ('NNP', 'N..."
2,The quik brown fox jumps ovr the lazy dog.,0.222222,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, JJ), (JJ, NN), (NN, NNS), (NNS...","{('DT', 'JJ'): 12.5, ('JJ', 'JJ'): 12.5, ('JJ'..."
3,Artifical inteligence is changing the world.,0.333333,"{'JJ': 16.666666666666664, 'NN': 33.3333333333...","[(JJ, NN), (NN, VBZ), (VBZ, VBG), (VBG, DT), (...","{('JJ', 'NN'): 20.0, ('NN', 'VBZ'): 20.0, ('VB..."
4,Programing languages are essntial for software...,0.285714,"{'VBG': 14.285714285714285, 'NNS': 14.28571428...","[(VBG, NNS), (NNS, VBP), (VBP, JJ), (JJ, IN), ...","{('VBG', 'NNS'): 16.666666666666664, ('NNS', '..."
5,Data anlysis plays a crucial role in decision-...,0.25,"{'NNP': 12.5, 'NN': 25.0, 'VBZ': 12.5, 'DT': 1...","[(NNP, NN), (NN, VBZ), (VBZ, DT), (DT, JJ), (J...","{('NNP', 'NN'): 14.285714285714285, ('NN', 'VB..."
6,I love explorng new tecnologies and learning n...,0.222222,"{'PRP': 11.11111111111111, 'VBP': 11.111111111...","[(PRP, VBP), (VBP, RB), (RB, JJ), (JJ, NNS), (...","{('PRP', 'VBP'): 12.5, ('VBP', 'RB'): 12.5, ('..."
7,The son sets in the west.,0.0,"{'DT': 33.33333333333333, 'NN': 33.33333333333...","[(DT, NN), (NN, VBZ), (VBZ, IN), (IN, DT), (DT...","{('DT', 'NN'): 40.0, ('NN', 'VBZ'): 20.0, ('VB..."
8,Codeing is a valuable skil.,0.4,"{'NNP': 20.0, 'VBZ': 20.0, 'DT': 20.0, 'JJ': 2...","[(NNP, VBZ), (VBZ, DT), (DT, JJ), (JJ, NN)]","{('NNP', 'VBZ'): 25.0, ('VBZ', 'DT'): 25.0, ('..."
9,Learnig new thing is always excting.,0.333333,"{'NNP': 16.666666666666664, 'JJ': 16.666666666...","[(NNP, JJ), (JJ, NN), (NN, VBZ), (VBZ, RB), (R...","{('NNP', 'JJ'): 20.0, ('JJ', 'NN'): 20.0, ('NN..."


## 2.c. Length of phrases

In [157]:
def analyze_length_text(text):
    """
        Analyzes the length of the given text
        -Take punctuations into account when calculating the length
    """

    phrases = nltk.sent_tokenize(text)
    phrases = [phrase.strip() for phrase in phrases if phrase.strip() ]
    phrases_lengths = [len(phrase) for phrase in phrases]

    lengths = [len(phrase) for phrase in phrases]
    max_length = max(lengths)
    min_length = min(lengths)
    avg_length = sum(lengths) / len(lengths)

    return phrases_lengths, max_length, min_length, avg_length

example_text = 'The second advantage that would come with limiting car use is less stress. Everyone knows that driving a car causes emence amounts of stress. Getting caught in traffic is a major cause of stress in someones life. having to repeating wash your car just to get it dirt again causes stress. Having people in the iack of your car screaming and yelling all while music is ilasting, causes stress. So oiviously driving causes stress. If we were to limit our car usage we would not ie as stressed as we usually are. There would ie no traffic, no car washes and no one screaming in a small confineded space. In the first article " In German Suiuri, life goes on without cars", iy Elisaieth Rosenthal, a citizen named humdrum Walter, states " When i had a car i was always tense. I\'m much happier this way". So with out the stress of a car humdrum Walter is a looser and happier person, less stress equals happier person. In the third article, " Carfree dai is spinning into a iig hit in Bogota", iy Andrew Selsky, it states " It\'s a good opportunity to take away stress...". If we have the opportunity to take away stress, why not take it. It is a huge advantage in our lives to limit driving if it takes away stress. No one wants stress, no one needs stress, and if we have an opportunity to take some of the stress away, take that opportunity.\r\n'
phrases_lengths, max_len, min_len, avg_len= analyze_length_text(example_text)

# Print the results
print(f"Maximum phrase length: {max_len} characters")
print(f"Minimum phrase length: {min_len} characters")
print(f"Average phrase length: {avg_len:.2f} characters")
print(f"Phrases lengths: {phrases_lengths}")


# apply on dataset
df['phrases_lengths'], df['phrase_max_length'], df['phrase_min_length'], df['phrase_avg_length'] = zip(*df['input_text'].apply(analyze_length_text))  


Maximum phrase length: 170 characters
Minimum phrase length: 27 characters
Average phrase length: 88.20 characters
Phrases lengths: [74, 66, 70, 74, 103, 35, 80, 90, 170, 27, 113, 153, 64, 77, 127]


In [155]:
df['input_text'].iloc[-1]

'The second advantage that would come with limiting car use is less stress. Everyone knows that driving a car causes emence amounts of stress. Getting caught in traffic is a major cause of stress in someones life. having to repeating wash your car just to get it dirt again causes stress. Having people in the iack of your car screaming and yelling all while music is ilasting, causes stress. So oiviously driving causes stress. If we were to limit our car usage we would not ie as stressed as we usually are. There would ie no traffic, no car washes and no one screaming in a small confineded space. In the first article " In German Suiuri, life goes on without cars", iy Elisaieth Rosenthal, a citizen named humdrum Walter, states " When i had a car i was always tense. I\'m much happier this way". So with out the stress of a car humdrum Walter is a looser and happier person, less stress equals happier person. In the third article, " Carfree dai is spinning into a iig hit in Bogota", iy Andrew 

## 2.d Vocabulary Richness

In [156]:
from math import sqrt

def calc_TTR(text):
    """
        Calculates the STTR of the given text (Type-Token Ratio)
    """

    tokens = word_tokenize(text.lower())
    types = len(set(tokens))
    tokens_count = len(tokens)
    TTR = types/tokens_count

    return TTR

def calculate_STTR(tokens):
    """
        Calculates the STTR of the given text ( Standarized Type-Token Ratio)
    """
    ttr = calc_TTR(tokens)
    sttr = ttr / sqrt(2 * len(tokens))
    return sttr


# test on dataset

df['STTR'] = df['input_text'].apply(lambda x: calculate_STTR(x))

df


Unnamed: 0,input_text,spelling_errors_rate,POS tags Frequency,2-grams,2-grams frequency,phrases_lengths,phrase_max_length,phrase_min_length,phrase_avg_length,STTR
0,The quick brown fox jumps over the lazy dog.,0.0,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, NN), (NN, NN), (NN, VBZ), (VBZ...","{('DT', 'JJ'): 25.0, ('JJ', 'NN'): 25.0, ('NN'...",[44],44,44,44.0,0.09594
1,Test Test Test Test.,0.0,"{'NNP': 75.0, 'NN': 25.0}","[(NNP, NNP), (NNP, NNP), (NNP, NN)]","{('NNP', 'NNP'): 66.66666666666666, ('NNP', 'N...",[20],20,20,20.0,0.061721
2,The quik brown fox jumps ovr the lazy dog.,0.222222,"{'DT': 22.22222222222222, 'JJ': 22.22222222222...","[(DT, JJ), (JJ, JJ), (JJ, NN), (NN, NNS), (NNS...","{('DT', 'JJ'): 12.5, ('JJ', 'JJ'): 12.5, ('JJ'...",[42],42,42,42.0,0.098198
3,Artifical inteligence is changing the world.,0.333333,"{'JJ': 16.666666666666664, 'NN': 33.3333333333...","[(JJ, NN), (NN, VBZ), (VBZ, VBG), (VBG, DT), (...","{('JJ', 'NN'): 20.0, ('NN', 'VBZ'): 20.0, ('VB...",[44],44,44,44.0,0.1066
4,Programing languages are essntial for software...,0.285714,"{'VBG': 14.285714285714285, 'NNS': 14.28571428...","[(VBG, NNS), (NNS, VBP), (VBP, JJ), (JJ, IN), ...","{('VBG', 'NNS'): 16.666666666666664, ('NNS', '...",[60],60,60,60.0,0.091287
5,Data anlysis plays a crucial role in decision-...,0.25,"{'NNP': 12.5, 'NN': 25.0, 'VBZ': 12.5, 'DT': 1...","[(NNP, NN), (NN, VBZ), (VBZ, DT), (DT, JJ), (J...","{('NNP', 'NN'): 14.285714285714285, ('NN', 'VB...",[53],53,53,53.0,0.097129
6,I love explorng new tecnologies and learning n...,0.222222,"{'PRP': 11.11111111111111, 'VBP': 11.111111111...","[(PRP, VBP), (VBP, RB), (RB, JJ), (JJ, NNS), (...","{('PRP', 'VBP'): 12.5, ('VBP', 'RB'): 12.5, ('...",[56],56,56,56.0,0.085042
7,The son sets in the west.,0.0,"{'DT': 33.33333333333333, 'NN': 33.33333333333...","[(DT, NN), (NN, VBZ), (VBZ, IN), (IN, DT), (DT...","{('DT', 'NN'): 40.0, ('NN', 'VBZ'): 20.0, ('VB...",[25],25,25,25.0,0.121218
8,Codeing is a valuable skil.,0.4,"{'NNP': 20.0, 'VBZ': 20.0, 'DT': 20.0, 'JJ': 2...","[(NNP, VBZ), (VBZ, DT), (DT, JJ), (JJ, NN)]","{('NNP', 'VBZ'): 25.0, ('VBZ', 'DT'): 25.0, ('...",[27],27,27,27.0,0.136083
9,Learnig new thing is always excting.,0.333333,"{'NNP': 16.666666666666664, 'JJ': 16.666666666...","[(NNP, JJ), (JJ, NN), (NN, VBZ), (VBZ, RB), (R...","{('NNP', 'JJ'): 20.0, ('JJ', 'NN'): 20.0, ('NN...",[36],36,36,36.0,0.117851
