## README
This Jupyter Notebook performs the following tasks:

1. **Import Libraries**: Imports necessary libraries for data manipulation, text processing, and multiprocessing.
   
2. **Get Data**: Reads the input data from a CSV file.

3. **Remove Specific Texts**: Removes texts with specific indexes (duplicates found during analysis) from the dataset.
   
4. **Transform Data**:
   - **Standardizing Contractions**: Expands common contractions using a predefined dictionary.
   - **Removing HTML Tags**: Eliminates HTML tags to retain only plain text.
   - **Removing Special Characters and Punctuation**: Cleanses text by removing special characters and punctuation.
   - **Removing Words with Numbers**: Removes words containing numbers and any trailing 's.
   - **Removing Stop Words**: Eliminates common stop words.
   - **Removing Non-ASCII Characters**: Removes non-ASCII characters, including emojis.
   - **Tokenization and Lemmatization**: Applies NLTK's tokenization and lemmatization using WordNet POS tags.
   - **Identifying Misspelt Words**: Identifies misspelled words using a spell checker and stores them as a list for each text.
   
5. **Export Data**: Exports the processed data to a CSV file.


## Content
  - [1. Import Libraries](#1-Import-Libraries)
  - [2. Get Data](#2-Get-Data)
  - [3. Remove Specific Texts](#3-Remove-Specific-Texts)
  - [4. Transform Data](#4-Transform-Data)
    - [Standardizing Contractions](#Standardizing-Contractions)
    - [Removing HTML Tags](#Removing-HTML-Tags)
    - [Removing Special Characters and Punctuation](#Removing-Special-Characters-and-Punctuation)
    - [Removing Words with Numbers](#Removing-Words-with-Numbers)
    - [Removing Stop Words](#Removing-Stop-Words)
    - [Removing Non-ASCII Characters](#Removing-Non-ASCII-Characters)
    - [Tokenization and Lemmatization](#Tokenization-and-Lemmatization)
    - [Identifying Misspelt Words](#Identifying-Misspelt-Words)
  - [5. Export Data](#5-Export-Data)


In [1]:
# 1. Import Libraries
import pandas as pd
import re
import string
import unicodedata
import time
import multiprocessing
from joblib import Parallel, delayed
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from spellchecker import SpellChecker
import textstat

# Detect how many CPU cores are available for the joblib workers
num_cores = multiprocessing.cpu_count()
print(f"Number of available CPU cores: {num_cores}")

# 2. Get Data
df = pd.read_csv('train_split.csv')

# 3. Remove texts with specific indexes
# Row labels identified as duplicates in the duplicates Notebook
indexes_to_remove = [789, 3109]
df = df.drop(index=indexes_to_remove)
df = df.reset_index(drop=True)

# Single WordNet lemmatizer instance reused by lemmatize_text
lemmatizer = WordNetLemmatizer()

# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Contractions dictionary
# Lowercase contractions mapped to their expanded forms.
contractions = {
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "let's": "let us",
    "here's": "here is",
    "how's": "how is",
}

# Add the sentence-initial variants ("Can't" -> "Cannot") after the lowercase
# entries, in the same order. The comprehension is fully built before
# update() runs, so the dict is not mutated while being iterated.
contractions.update(
    {short.capitalize(): expanded.capitalize() for short, expanded in contractions.items()}
)

# Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the corresponding WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

# Function to lemmatize text
def lemmatize_text(text):
    """Tokenize *text*, POS-tag the tokens and return the lemmas joined by spaces.

    Each token is lemmatized with the WordNet POS derived from its tag via
    ``get_wordnet_pos``.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    )

# Function to remove HTML tags
def removeHTML(x):
    """Return *x* with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not match a newline here, so a tag
    that spans several lines is left in place.
    """
    return re.sub(r'<.*?>', r'', x)

# Function to insert a space after punctuation when one is missing
def replace_punctuation_with_space_if_absent(text):
    """Insert a space after a run of ``. , ! ? ; :`` that is not followed by whitespace.

    A run of punctuation is treated as one unit. The lookahead also rejects a
    following punctuation character, so the regex engine cannot backtrack into
    the middle of a run that is already followed by whitespace. Without that,
    "What?! Yes" would become "What? ! Yes".

    A run at the very end of the string is still followed by an added space,
    as before.

    Args:
        text: Input string.

    Returns:
        The string with the missing spaces inserted.
    """
    pattern = r'([.,!?;:]+)(?![\s.,!?;:])'
    corrected_text = re.sub(pattern, r'\1 ', text)
    return corrected_text

# Function to replace contractions
def replace_contractions(text, contractions_dict):
    """Expand every contraction in *text* that is a key of *contractions_dict*.

    Matching is case-sensitive and only applies to whole tokens: a key is not
    replaced when it is directly preceded or followed by a word character, so
    "outlet's" is not rewritten through the "let's" entry.

    Args:
        text: Input string.
        contractions_dict: Mapping of contraction -> expansion.

    Returns:
        The text with contractions replaced. With an empty mapping the text
        is returned unchanged.
    """
    # An empty alternation would match the empty string everywhere and make
    # the lookup below fail with KeyError('').
    if not contractions_dict:
        return text

    # Longest keys first so a short key can never shadow a longer one that
    # starts with the same characters.
    ordered_keys = sorted(contractions_dict, key=len, reverse=True)
    contractions_re = re.compile(
        r'(?<!\w)(?:' + '|'.join(map(re.escape, ordered_keys)) + r')(?!\w)'
    )

    def replace(match):
        return contractions_dict[match.group(0)]

    return contractions_re.sub(replace, text)

# Function to remove trailing 's
def remove_trailing_s(text):
    """Drop a trailing ``'s`` from every whitespace-separated token of *text*.

    The tokens are re-joined with single spaces, so the original whitespace
    layout is not preserved.
    """
    suffix = "'s"
    trimmed_tokens = []
    for token in text.split():
        if token.endswith(suffix):
            token = token[:-len(suffix)]
        trimmed_tokens.append(token)
    return ' '.join(trimmed_tokens)

# Function to remove special characters and punctuation
def remove_special_characters_and_punctuation(text):
    """Keep only ASCII letters, digits and whitespace from *text*.

    The text is first NFKD-normalized, which splits accented letters into a
    base letter plus combining marks. The marks are then removed along with
    every other disallowed character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', decomposed)

# Function to replace multiple spaces with a single space
def replace_multiple_spaces(text):
    """Collapse each run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is collapsed as well, not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# Function to remove words with numbers
def remove_words_with_numbers(text):
    """Delete every word containing at least one digit and tidy the whitespace.

    After the removal, whitespace runs are collapsed to single spaces and the
    result is stripped at both ends.
    """
    word_with_digit = re.compile(r'\b\w*\d\w*\b')
    without_numbers = word_with_digit.sub('', text)
    collapsed = re.sub(r'\s+', ' ', without_numbers)
    return collapsed.strip()

# Transitional phrases looked up by find_transitional_phrases.
# The list contains a few repeated entries; they are de-duplicated when the
# search patterns are built below.
transitional_phrases = [
    'Above all', 'Accordingly', 'Additionally', 'After', 'After all', 'Afterward', 'All in all', 'Also', 'Alternatively',
    'As a result', 'As an illustration', 'As long as', 'As mentioned earlier', 'As noted', 'At the same time', 'Before',
    'Besides', 'But', 'By all means', 'Consequently', 'Conversely', 'Correspondingly', 'Despite', 'During', 'Even if',
    'Even so', 'Especially', 'Eventually', 'Finally', 'First', 'For example', 'For instance', 'Furthermore', 'Hence',
    'However', 'If', 'In addition', 'In brief', 'In case', 'In comparison', 'In conclusion', 'In fact', 'In contrast',
    'In other words', 'In particular', 'In simpler terms', 'In summary', 'In the meantime', 'In the same way', 'Indeed',
    'Instead', 'Lastly', 'Later', 'Likewise', 'Meanwhile', 'Moreover', 'More importantly', 'Namely', 'Nevertheless',
    'Next', 'Nonetheless', 'Notably', 'Now', 'On the contrary', 'On condition that', 'On one hand', 'On the other hand',
    'Overall', 'Particularly', 'Plus', 'Previously', 'Provided that', 'Regardless', 'Second', 'Similarly', 'Since',
    'Specifically', 'Still', 'Subsequently', 'That is', 'Then', 'Therefore', 'Third', 'Thus', 'To clarify', 'To conclude',
    'To demonstrate', 'To illustrate', 'To put it another way', 'To summarize', 'To sum up', 'Ultimately', 'Unless', 'Unlike',
    'Until', 'Whereas', 'Yet', 'Above and beyond', 'According to', 'After a while', 'All things considered', 'Although',
    'Another key point', 'As a consequence', 'As a matter of fact', 'As can be seen', 'As far as', 'As soon as', 'At first',
    'At last', 'At length', 'At this point', 'Be that as it may', 'By and large', 'By the same token', 'Even though',
    'For fear that', 'For that reason', 'For the most part', 'Granted', 'Henceforth', 'If by chance', 'If so', 'In a moment',
    'In any case', 'In any event', 'In light of', 'In order to', 'In particular', 'In reality', 'In short', 'In spite of',
    'In view of', 'It follows that', 'Least of all', 'Most importantly', 'Needless to say', 'Of course', 'On the whole',
    'One example is', 'One reason is', 'Or', 'Over time', 'Prior to', 'Provided that', 'Seeing that', 'So as to', 'Sooner or later',
    'Such as', 'That being said', 'The next step', 'Thereafter', 'Thereby', 'Thirdly', 'Through', 'Till', 'To be sure',
    'To begin with', 'To illustrate', 'To reiterate', 'To the end that', 'To this end', 'Until now', 'Up to now', 'What is more',
    'Without a doubt', 'Without delay', 'Without exception', 'Yet again'
]

# One case-insensitive, whole-phrase pattern per distinct phrase, compiled once
# at definition time. dict.fromkeys drops repeated entries while keeping the
# first-seen order. Changes made to transitional_phrases after this point are
# not reflected here.
_TRANSITIONAL_PATTERNS = [
    (phrase, re.compile(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)', re.IGNORECASE))
    for phrase in dict.fromkeys(transitional_phrases)
]


def find_transitional_phrases(text):
    """Return the transitional phrases that occur in *text*.

    Matching is case-insensitive and requires the phrase to stand on its own:
    it must not be directly preceded or followed by a word character, so 'Or'
    does not match inside "for" or "word". Each phrase is reported at most
    once, in the order of ``transitional_phrases``.

    Args:
        text: Input string.

    Returns:
        List of matching phrases, spelled as in ``transitional_phrases``.
    """
    return [phrase for phrase, pattern in _TRANSITIONAL_PATTERNS if pattern.search(text)]

def preprocessed_text_part1(text):
    """Run the first cleaning pass over a raw text.

    Steps, in order: strip HTML tags, remove @mentions, remove standalone
    numbers (optionally followed by ' or 's), remove URLs, collapse
    whitespace, collapse repeated dots and commas, insert missing spaces after
    punctuation, expand contractions, and strip the ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    text = removeHTML(text)
    # Raw strings: "\w" in a plain literal is an invalid escape sequence.
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r"\b\d+(?:'s?)?\b", '', text)
    # \S+ consumes the whole URL. The previous \w+ stopped at ':' and so left
    # "://host/path" behind for https links and did not match http links at all.
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\.+", ".", text)
    text = re.sub(r"\,+", ",", text)
    text = replace_punctuation_with_space_if_absent(text)
    text = replace_contractions(text, contractions)
    text = text.strip()
    return text

def preprocessed_text(text):
    """Run the second cleaning pass over an already pre-cleaned text.

    Applies, in order: trailing ``'s`` removal, special-character and
    punctuation removal, whitespace collapsing and removal of words containing
    digits. The result is stripped at both ends.
    """
    cleaning_steps = (
        remove_trailing_s,
        remove_special_characters_and_punctuation,
        replace_multiple_spaces,
        remove_words_with_numbers,
    )
    for step in cleaning_steps:
        text = step(text)
    return text.strip()

def clean_text_from_emojis_and_non_ascii(text):
    """Strip symbols and non-ASCII characters from *text* and collapse whitespace.

    Only word characters, whitespace, commas and periods survive the first
    pass. Anything that cannot be encoded as ASCII is then dropped, and every
    whitespace run, newlines included, becomes a single space.
    """
    allowed_only = re.sub(r'[^\w\s,.\n]', '', text)
    ascii_only = allowed_only.encode('ascii', 'ignore').decode('ascii')
    collapsed = re.sub(r'\s+', ' ', ascii_only)
    # NOTE(review): no '\n' can remain after the collapse above, so this
    # replace has no effect; kept to preserve the original statement.
    return collapsed.replace('\n ', '\n')

def find_misspelled_words(text):
    """Return the tokens of *text* that the spell checker does not know.

    A new ``SpellChecker`` is created on every call. Tokens reported as
    unknown are kept only when they start with a word character.
    """
    checker = SpellChecker()
    unknown_tokens = checker.unknown(word_tokenize(text))
    starts_with_word_char = re.compile(r'\w+').match
    return [token for token in unknown_tokens if starts_with_word_char(token)]

def remove_stop_words(text):
    """Drop the stop words from *text*.

    Tokens are split on whitespace and compared in lowercase against
    ``stop_words``. The remaining tokens are joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# 4. Transform data

def _apply_in_parallel(func, values):
    """Call *func* on every item of *values* using ``num_cores`` joblib workers."""
    return Parallel(n_jobs=num_cores)(delayed(func)(row) for row in values)


# Preprocessed Text
print("Working on Preprocessed Text Features")
start_time = time.time()

# Each column below is derived from a column produced by an earlier step,
# so the order of these assignments matters.
df['preprocessed_text_part1'] = _apply_in_parallel(preprocessed_text_part1, df['full_text'])
df['preprocessed_text'] = _apply_in_parallel(preprocessed_text, df['preprocessed_text_part1'])
df['lemmatized_preprocessed_text'] = df['preprocessed_text'].apply(lemmatize_text)
df['clean_lemm_preprocessed_text'] = df['lemmatized_preprocessed_text'].apply(remove_stop_words)
df['full_text_without_non_ascii'] = _apply_in_parallel(clean_text_from_emojis_and_non_ascii, df['full_text'])

end_time = time.time()
print(f"Elapsed time: {end_time - start_time} seconds")

# Misspelled Words
print("Working on Misspelled Features")
start_time = time.time()

df['misspelled_words_spell_checker'] = _apply_in_parallel(find_misspelled_words, df['preprocessed_text'])

end_time = time.time()
print(f"Elapsed time: {end_time - start_time} seconds")

# 5. Export Data

# Write the transformed DataFrame to CSV without the index column
print("Exporting file")

df.to_csv('transformed_data_exp_2.csv', index=False)
print("File exported")

Number of available CPU cores: 32
Working on Preprocessed Text Features
Elapsed time: 351.0896408557892 seconds
Working on Misspelled Features
Elapsed time: 160.21734404563904 seconds
Exporting file
File exported
