Imports

In [70]:
import pandas as pd
import numpy as np

from textblob import TextBlob
from nltk.tokenize import WordPunctTokenizer, word_tokenize, StanfordSegmenter, sent_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords

import re, os, sys, string, itertools
from collections import defaultdict, Counter
import unicodedata
from tqdm import tqdm, tqdm_notebook
import sentencepiece as spm


Function to replace words which are written in quotes with quoted word

In [71]:
def remove_quotes(series):
    """
    Function removes quotes from words or phrases written in double quotes.
    """
    # Define a regex pattern to match words or phrases in quotes
    regex = re.compile(r'"([\w\s]+)"')
    
    # Count the number of occurrences of quoted words/phrases
    total_occurrences = series.str.count(regex).sum()
    print(f"Total occurrences of quoted words: {total_occurrences}")
    
    # Remove the quotes but keep the words inside
    series = series.str.replace(regex, r'\1', regex=True)
    
    return series

Removal of IP Address found in text

In [72]:
def remove_ips(series):
    """
    Removing Ip Addresses
    """
    series = series.copy()
    regex = re.compile(r'(([0-9]{1,}\.){2,}[0-9]{1,})')
    print("Total unique ip address in data are {}".format(series.str.extract(regex).nunique()))
    series = series.str.replace(regex, ' ', regex = True)
    return series


In [73]:
def remove_trailing_dates(series):
    series = series.copy()
    return series.str.replace("([0-9]{1,2}:[0-9]{1,2},{0,1}\s[0-9]{1,2}\s[a-zA-Z]{3,}\s[0-9]{2,4}\s\((utc|UTC)\))", " ", regex = True)

In [74]:
def trim_repetitions(series, thresh=5):
    """
    If comment has repetitions, if more than repetition then trim at 10 words
    """
    series = series.copy()
    total_words = series.str.count("\w+")
    unique_words = series.apply(lambda x:len(np.unique(x.split(' '))))

    rep_inds = total_words/unique_words > thresh
    print("Total comments with high repetitions are {}".format(sum(rep_inds)))
    print("Some examples of high reps are {}".format(series.loc[rep_inds].sample(5).values))

    rep_inds_unq = iter(unique_words.loc[rep_inds])
    series.loc[rep_inds] = series[rep_inds].str.split(' ').str.slice(0, next(rep_inds_unq)).str.join(' ')

    print("Some samples are {}".format(series.loc[rep_inds].sample(5).values))
    return series

In [75]:
def break_oovocabwords(series, vocab_filename, sp_file):
    """
    Break OOV words using SentencePiece and preprocess the text.
    """
    series = series.copy()

    # Load the vocabulary
    with open(vocab_filename, encoding='utf-8') as f:
        dict_word = set([o.rstrip().rsplit(' ')[0] for o in f])

    # Load SentencePiece model
    sp = spm.SentencePieceProcessor()
    sp.Load(sp_file)

    def standardize_repeated_chars(word):
        """
        Replace 3+ repeated characters with 2 (e.g., 'wayyyyy' -> 'wayy').
        """
        return re.sub(r'(.)\1{2,}', r'\1\1', word)

    # Preprocess each word in the series
    series = series.apply(lambda x: ' '.join(itertools.chain.from_iterable(
        [sp.EncodeAsPieces(standardize_repeated_chars(word)) if word not in dict_word else [word] for word in x.split(' ')]
    )))

    # Remove SentencePiece artifacts
    series = series.str.replace("â–", "", regex=False)

    return series


In [76]:
def remove_url(series):
    series = series.copy()
    regex = re.compile(r'http[s]?://\S+')
    series = series.str.replace(regex, ' ', regex = True)
    return series


In [77]:
def preprocess_text(series, remove_ip=True, remove_date_stamps=True, tag_quoted=True, remove_puncts=True, lower=True,
                    remove_digits=True, remove_nonchars=True,
                   break_oov=True, break_vocab_file="", break_sp_file="", trim_reps=True):
    series = series.copy()
    series = series.str.replace(r"\\n{1,}", " line ")

    if remove_url:
        series = remove_url(series)
    
    if remove_ip:
        series = remove_ips(series)
        
    if remove_date_stamps:
        series = remove_trailing_dates(series)
        
    if tag_quoted:
        series = remove_quotes(series)
        
    if remove_puncts:
        series = series.str.replace("'", "")
        series = series.str.translate(str.maketrans({s:" " for s in string.punctuation}))
        
    if lower:
        series = series.str.lower()
        
    if remove_digits:
        series = series.str.replace(r"\d", "", regex = True )
    
    if remove_nonchars:
        series = series.str.replace(r"[^a-zA-Z0-9.,\"!]+", " ", regex = True)
        
    if break_oov:
        series = break_oovocabwords(series, break_vocab_file, break_sp_file)
        
    if trim_reps:
        series = trim_repetitions(series, thresh=10)
        
    return series

In [78]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.head()

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [79]:
embed_file ="glove.42B.300d.txt"
sp_file = "en.wiki.bpe.vs200000.model"

train.comment_text = preprocess_text(train.comment_text, break_vocab_file=embed_file, break_sp_file=sp_file)
test.comment_text = preprocess_text(test.comment_text, break_vocab_file=embed_file, break_sp_file=sp_file)
train.comment_text.sample(10).values

Total unique ip address in data are 0    5507
1     274
dtype: int64
Total occurrences of quoted words: 63828
Total comments with high repetitions are 378
Some examples of high reps are ['user user precious roy go away and bother someone else you are not wanted here stop stalking user user precious roy go away and bother someone else you are not wanted here stop stalking user user precious roy go away and bother someone else you are not wanted here stop stalking user user precious roy go away and bother someone else you are not wanted here stop stalking user user precious roy go away and bother someone else you are not wanted here stop stalking user user precious roy go away and bother someone else you are not wanted here stop stalking user user precious roy go away and bother someone else you are not wanted here stop stalking user user precious roy go away and bother someone else you are not wanted here stop stalking user user precious roy go away and bother someone else you are not w

array(['well my friend i wish you good luck in all your endeavors',
       'a barnstar for you the good article ▁barn star for your contributions to bring voivode of transylvania to good article status thanks and keep up the good work',
       'deleting it from this article was quite appropriate it has kept a creeping list that would have certainly sprawled over this entire article from stifling the rest of its content',
       'talk p s why have you copied my username style exactly',
       'electronic music article disambiguation reversions please stop if you continue to vandalize wikipedia you will be blocked from editing parsifal please stop reverting edits that are in consensus with months of comments and discussion on the subject please calm down take a break',
       'first off it is not vandalism the number changed second be careful how you speak to me',
       'youve got me there reverting biased views about two hms i see sounds like im the one who is biased christ take a brea

In [80]:
def remove_encoding_artifacts(series):
    """
    Removes multiple encoding artifacts from the text.
    """
    # Add patterns for all known artifacts
    pattern = r'â–|â–|â€œ|â€|â€¦'
    
    # Replace all matches with an empty string
    series = series.str.replace(pattern, '', regex=True)
    return series

train.comment_text = remove_encoding_artifacts(train.comment_text)
test.comment_text = remove_encoding_artifacts(test.comment_text)

In [None]:
train.to_csv("train_preprocess.csv", index=False)
test.to_csv("test_preprocess.csv", index=False)