# Table of Content
1. [Text Cleaning](#textcleaning)
2. [Text Preprocessing](#textpreprocessing)

In [1]:
%time
import os
import sys
import time
import random
import string
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import numpy as np
import pandas as pd
import sklearn

# libraries for text cleaning
import contractions
from bs4 import BeautifulSoup
from textblob import TextBlob
from spellchecker import SpellChecker

# libraries and packages for text (pre-)processing 
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import *
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import pos_tag_sents
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.25 µs


In [2]:
train_df = pd.read_csv("Data/train.csv")
print(train_df.shape)
train_df.head()

(159571, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


<a id="textcleaning"></a>
# 1. Text Cleaning

## Convert to Lower Case

We convert all letters to lower case to prepare for the following steps of text cleaning. Exceptional cases such as capital abbreviation will be solved by replacing typos, slang, acronyms or informal abbreviations technique in the subsquent steps.

In [3]:
train_df["clean_text"] = train_df["comment_text"].apply(lambda x: x.lower())
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"""\nmore\ni can't make any real suggestions on ..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember..."


## Expand Contractions


Contractions are words or combinations of words that are shortened by dropping letters and replacing them by an apostrophe. Removing contractions helps contribute to text standardization. We use contractions package to expand contractions.

In [4]:
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: contractions.fix(x))

In [5]:
# check if expand contractions works
print("Original text: \n", train_df["comment_text"][2])
print("Clean text: \n", train_df["clean_text"][2])

Original text: 
 Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
Clean text: 
 hey man, i am really not trying to edit war. it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.


## Remove Noise

Remove unnecessary characters or punctuation such as URLs, HTML tags, non-ASCII characters, or other special characters 

### Remove URL

In [6]:
# replace URL with space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', ' ', x))

###  Remove Non-ASCI Characters

In [7]:
# replace Non_ASCI characters with space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub(r'[^\x00-\x7f]', ' ', x))

###  Remove Special Characters

In [8]:
regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        u"\ufe0f"  # dingbats
        "]+", flags = re.UNICODE)

In [9]:
# replace special characters with space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: regrex_pattern.sub(' ', x))

In [10]:
# check if special characters are removed
print("Original text: \n", train_df["comment_text"][143])
print("Clean text: \n", train_df["clean_text"][143])

Original text: 
 "P.S. It's not polite to talk to people behind their backs, please remove your comments from Mrph's talk page.

Vaughan
You're right; I went to check your previous edit and found a page on the Marvel site that spelled it ""Vaughn"", but now I am finding many more that spell it correctly. Thanks for the edits.   (☎☓) 

"
Clean text: 
 "p.s. it is not polite to talk to people behind their backs, please remove your comments from mrph's talk page.

vaughan
you are right; i went to check your previous edit and found a page on the marvel site that spelled it ""vaughn"", but now i am finding many more that spell it correctly. thanks for the edits.   (  ) 

"


### Remove HTML Tag (BeautifulSoup not really useful? merely remove space?)

In [11]:
cleaned_text = train_df["clean_text"].apply(lambda x: BeautifulSoup(str(x)).get_text())

In [12]:
text_changed = cleaned_text!=train_df["clean_text"]

In [13]:
[i for i, x in enumerate(text_changed) if x][:10]

[228, 329, 3303, 3699, 3858, 4112, 4929, 5547, 5837, 6193]

In [14]:
train_df["clean_text"][228]

'   heritage from village           in macedonian          . sources claim that the village was pure slavic.'

In [15]:
cleaned_text[228]

'heritage from village           in macedonian          . sources claim that the village was pure slavic.'

In [16]:
# replace HTML tag with space
html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub(html, " ", x))

###  Remove Extra Space

In [17]:
# replace \r\n with space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub('\r\n', ' ', x))

In [18]:
# remove extra space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub(' +', ' ', x))

## Replace Common Slangs

Slang, acronyms or informal abbreviations should be replaced with formal English. The list of common slangs used in Tweets takes reference from https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing.

In [19]:
# read abbreviation.csv
abbreviations = pd.read_csv('Data/abbreviations.csv')
abbreviations.head()

Unnamed: 0,abbreviation,translation
0,$,dollar
1,€,euro
2,4ao,for adults only
3,a.m,before midday
4,a3,anytime anywhere anyplace


In [20]:
# convert the data frame to a dictionary
abbreviations_dict = dict(zip(abbreviations.abbreviation, abbreviations.translation))

In [21]:
# define a helper function to replace the abbreviations
def convert_abbrev(text):
    # create a pattern of all abbreviations and make sure they are not part of a longer word
    abbreviations_pattern = re.compile(r'(?<!\w)(' + '|'.join(re.escape(key) for key in abbreviations_dict.keys()) + r')(?!\w)')
    # replace an abbreviation with its translation
    text = abbreviations_pattern.sub(lambda x: abbreviations_dict[x.group()], text)
    return text

In [22]:
# replace the slangs
train_df["clean_text"] = train_df["clean_text"].apply(convert_abbrev)

In [23]:
# check if slangs are replaced
print("Original text: \n", train_df["comment_text"][1])
print("Clean text: \n", train_df["clean_text"][1])

Original text: 
 D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
Clean text: 
 d'aww! he matches this background colour i am seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (coordinated universal time)


## Spelling Correction

We should correct the misspellings in the text. Both SpellChecker and TextBlob provide such functions, and we would like to compare their performance.

In [24]:
# select random texts from clean_text
length = len(train_df["clean_text"])
random_num = random.sample(range(length), 100)
random_text = train_df["clean_text"][random_num]

In [25]:
# using TextBlob package
start_time1 = time.time()
random_text.apply(lambda x: TextBlob(x).correct())
print("--- %s seconds ---" % (time.time() - start_time1))

--- 50.41714692115784 seconds ---


In [26]:
# using SpellChecker package
start_time2 = time.time()
random_text.apply(lambda x: SpellChecker().correction(x))
print("--- %s seconds ---" % (time.time() - start_time2))

--- 15.755198955535889 seconds ---


Randomly select 100 texts and apply spelling correction functions on them. Comparing the execution time of 2 different packages, SpellChecker is much faster than TextBlob. Considering we are using a large-scale dataset, SpellChecker is preferred.

In [24]:
def correct_spelling(text):
    start_time = time.time()
    cleaned_text = []
    spellchecker = SpellChecker()
    for i in range(text.shape[0]):
        if i%100==0:
            print(f'{i}-th text is being processed')
        cleaned_text.append(spellchecker.correction(text[i]))
    print("--- %s seconds ---" % (time.time() - start_time))
    return cleaned_text

In [25]:
cleaned_text = correct_spelling(train_df["clean_text"][:1001])

0-th text is being processed
100-th text is being processed
200-th text is being processed
300-th text is being processed
400-th text is being processed
500-th text is being processed
600-th text is being processed
700-th text is being processed
800-th text is being processed
900-th text is being processed
1000-th text is being processed
--- 103.30804991722107 seconds ---


In [26]:
train_df["clean_text"][:1001].index[cleaned_text!=train_df["clean_text"][:1001]]

Int64Index([ 62,  89, 101, 173, 175, 211, 217, 223, 226, 241, 250, 254, 259,
            268, 276, 299, 320, 323, 376, 381, 397, 408, 423, 448, 465, 470,
            504, 545, 592, 627, 632, 646, 715, 743, 758, 787, 806, 807, 814,
            823, 831, 844, 852, 874, 877, 883, 897, 899, 913, 923, 947, 971],
           dtype='int64')

In [27]:
train_df["clean_text"][971]

'. fu ck ing trollreasons'

In [28]:
cleaned_text[971]

However, many corrections do not make sense, and may omit some useful information. We decided not to use established package to perform spelling correction.

##########  for POS Tagging use  ##########

In [29]:
train_df["for_tagging_use"] = train_df["clean_text"]

## Remove Punctuations

We remove punctuations from the text as the final step of text cleaning.

In [30]:
# remove punctuations
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [31]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,for_tagging_use
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,explanation\nwhy the edits made under my usern...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour i am se...,d'aww! he matches this background colour i am ...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man i am really not trying to edit war it ...,"hey man, i am really not trying to edit war. i..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,\nmore\ni cannot make any real suggestions on ...,"""\nmore\ni cannot make any real suggestions on..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...,"you, sir, are my hero. any chance you remember..."


In [32]:
train_df.drop('comment_text', axis=1).to_csv('Data/cleaned_train.csv', index=False)

<a id="textpreprocessing"></a>
# 2. Text Preprocessing

In [33]:
#cleaned_df = pd.read_csv("/cleaned_train.csv")
cleaned_df = pd.read_csv("Data/cleaned_train.csv")
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,for_tagging_use
0,0000997932d777bf,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,explanation\nwhy the edits made under my usern...
1,000103f0d9cfb60f,0,0,0,0,0,0,daww he matches this background colour i am se...,d'aww! he matches this background colour i am ...
2,000113f07ec002fd,0,0,0,0,0,0,hey man i am really not trying to edit war it ...,"hey man, i am really not trying to edit war. i..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,\nmore\ni cannot make any real suggestions on ...,"""\nmore\ni cannot make any real suggestions on..."
4,0001d958c54c6e35,0,0,0,0,0,0,you sir are my hero any chance you remember wh...,"you, sir, are my hero. any chance you remember..."


## Tokenization (NLTK has a package specially catered to tokenizing tweets)

In [34]:
from nltk.tokenize import TweetTokenizer
tt = TweetTokenizer()
cleaned_df['text_tokenized'] = cleaned_df['clean_text'].apply(tt.tokenize)

In [35]:
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,for_tagging_use,text_tokenized
0,0000997932d777bf,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,explanation\nwhy the edits made under my usern...,"[explanation, why, the, edits, made, under, my..."
1,000103f0d9cfb60f,0,0,0,0,0,0,daww he matches this background colour i am se...,d'aww! he matches this background colour i am ...,"[daww, he, matches, this, background, colour, ..."
2,000113f07ec002fd,0,0,0,0,0,0,hey man i am really not trying to edit war it ...,"hey man, i am really not trying to edit war. i...","[hey, man, i, am, really, not, trying, to, edi..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,\nmore\ni cannot make any real suggestions on ...,"""\nmore\ni cannot make any real suggestions on...","[more, i, cannot, make, any, real, suggestions..."
4,0001d958c54c6e35,0,0,0,0,0,0,you sir are my hero any chance you remember wh...,"you, sir, are my hero. any chance you remember...","[you, sir, are, my, hero, any, chance, you, re..."


## Remove Stop Words (or/and Frequent words/ Rare words)

### Stop Words Removal

In [36]:
from nltk.corpus import stopwords

#review all stop words in the library
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [38]:
#remove stop words from tokenized text
stop_words = stopwords.words('english')

def remove_stopwords(row):
    # check in lowercase 
    t = [token for token in row['text_tokenized'] if token.lower() not in stop_words]
    text = ' '.join(t)    
    return pd.Series([text,t])    #for stemming

cleaned_df[['text_no_stop_words','for_stemming_use']] = cleaned_df.apply(remove_stopwords, axis = 1)


cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,for_tagging_use,text_tokenized,text_no_stop_words,for_stemming_use
0,0000997932d777bf,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,explanation\nwhy the edits made under my usern...,"[explanation, why, the, edits, made, under, my...",explanation edits made username hardcore metal...,"[explanation, edits, made, username, hardcore,..."
1,000103f0d9cfb60f,0,0,0,0,0,0,daww he matches this background colour i am se...,d'aww! he matches this background colour i am ...,"[daww, he, matches, this, background, colour, ...",daww matches background colour seemingly stuck...,"[daww, matches, background, colour, seemingly,..."
2,000113f07ec002fd,0,0,0,0,0,0,hey man i am really not trying to edit war it ...,"hey man, i am really not trying to edit war. i...","[hey, man, i, am, really, not, trying, to, edi...",hey man really trying edit war guy constantly ...,"[hey, man, really, trying, edit, war, guy, con..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,\nmore\ni cannot make any real suggestions on ...,"""\nmore\ni cannot make any real suggestions on...","[more, i, cannot, make, any, real, suggestions...",cannot make real suggestions improvement wonde...,"[cannot, make, real, suggestions, improvement,..."
4,0001d958c54c6e35,0,0,0,0,0,0,you sir are my hero any chance you remember wh...,"you, sir, are my hero. any chance you remember...","[you, sir, are, my, hero, any, chance, you, re...",sir hero chance remember page,"[sir, hero, chance, remember, page]"


In [39]:
#drop clean_text and tokenized column
del cleaned_df['clean_text']
del cleaned_df['text_tokenized']
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,for_tagging_use,text_no_stop_words,for_stemming_use
0,0000997932d777bf,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,explanation edits made username hardcore metal...,"[explanation, edits, made, username, hardcore,..."
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...,daww matches background colour seemingly stuck...,"[daww, matches, background, colour, seemingly,..."
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i...",hey man really trying edit war guy constantly ...,"[hey, man, really, trying, edit, war, guy, con..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,"""\nmore\ni cannot make any real suggestions on...",cannot make real suggestions improvement wonde...,"[cannot, make, real, suggestions, improvement,..."
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...",sir hero chance remember page,"[sir, hero, chance, remember, page]"


### Frequent Words Identification

In [88]:
#identify frequent words
from nltk.probability import FreqDist
combined_text_no_stop_words = ' '.join(cleaned_df['text_no_stop_words'])

def most_frequent_words(text):
    words = word_tokenize(text)
    fdist = FreqDist(words) 
    
    df_fdist = pd.DataFrame({'Word': fdist.keys(),
                             'Frequency': fdist.values()})
    df_fdist = df_fdist.sort_values(by='Frequency', ascending=False)
    
    return df_fdist.head(50)

most_frequent_word_df = most_frequent_words(combined_text_no_stop_words)
most_frequent_word_list = list(most_frequent_words(combined_text_no_stop_words)['Word'])

print(most_frequent_word_df)

             Word  Frequency
113       article      55457
20           page      45652
124         would      36217
159     wikipedia      35679
19           talk      31768
16         please      29617
129           one      28098
296          like      27708
38           time      22426
249           see      21579
163          also      20550
70          think      20039
85           know      19171
841        people      17794
43           edit      17597
88       articles      17524
104           use      16326
72            may      15559
31         thanks      13902
422          even      13393
472           get      13384
59           make      12943
379         could      12856
270          good      12743
49    information      12407
1041          way      12090
83           want      11913
409         point      11900
103          well      11447
588       sources      11264
1343         name      10998
175         pages      10998
198      deletion      10860
80          fi

## Stemming

We will use Snowball Stemmer to realise stemming. Comparing with Poster Stemmer, it is more efficient and has higher performance. Comparing with Lancaster Stemmer, it is less aggressive and can keep more word meanings for the Semantic Analysis in our later stage. 

In [40]:
stemmer = SnowballStemmer(language = 'english')
def stem_list_of_words(words):
    stemmed_words = []
    for word in words:
        stemmed_words.append(stemmer.stem(word))
    stemmed_sent = ' '.join(stemmed_words)  
    return stemmed_sent
        

cleaned_df['stemmed'] = cleaned_df['for_stemming_use'].apply(stem_list_of_words)
del cleaned_df['for_stemming_use']
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,for_tagging_use,text_no_stop_words,stemmed
0,0000997932d777bf,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,explanation edits made username hardcore metal...,explan edit made usernam hardcor metallica fan...
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...,daww matches background colour seemingly stuck...,daww match background colour seem stuck thank ...
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i...",hey man really trying edit war guy constantly ...,hey man realli tri edit war guy constant remov...
3,0001b41b1c6bb37e,0,0,0,0,0,0,"""\nmore\ni cannot make any real suggestions on...",cannot make real suggestions improvement wonde...,cannot make real suggest improv wonder section...
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...",sir hero chance remember page,sir hero chanc rememb page


## Part of Speech Tagging (POS Tagging)

In [41]:
def to_word_tokens(sent_tokens):
    word_tokens = [] 
    for sent_token in sent_tokens:
        word_tokens.append(tt.tokenize(sent_token))
    return word_tokens

In [42]:
cleaned_df['for_tagging_use_sent_token'] = cleaned_df['for_tagging_use'].apply(sent_tokenize)
cleaned_df['for_tagging_use_word_token'] = cleaned_df['for_tagging_use_sent_token'].apply(to_word_tokens)
cleaned_df['POS_tagging'] = cleaned_df['for_tagging_use_word_token'].apply(pos_tag_sents)
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,for_tagging_use,text_no_stop_words,stemmed,for_tagging_use_sent_token,for_tagging_use_word_token,POS_tagging
0,0000997932d777bf,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,explanation edits made username hardcore metal...,explan edit made usernam hardcor metallica fan...,[explanation\nwhy the edits made under my user...,"[[explanation, why, the, edits, made, under, m...","[[(explanation, NN), (why, WRB), (the, DT), (e..."
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...,daww matches background colour seemingly stuck...,daww match background colour seem stuck thank ...,"[d'aww!, he matches this background colour i a...","[[d'aww, !], [he, matches, this, background, c...","[[(d'aww, NN), (!, .)], [(he, PRP), (matches, ..."
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i...",hey man really trying edit war guy constantly ...,hey man realli tri edit war guy constant remov...,"[hey man, i am really not trying to edit war.,...","[[hey, man, ,, i, am, really, not, trying, to,...","[[(hey, NN), (man, NN), (,, ,), (i, JJ), (am, ..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,"""\nmore\ni cannot make any real suggestions on...",cannot make real suggestions improvement wonde...,cannot make real suggest improv wonder section...,"[""\nmore\ni cannot make any real suggestions o...","[["", more, i, cannot, make, any, real, suggest...","[[("", IN), (more, JJR), (i, JJ), (cannot, NNS)..."
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...",sir hero chance remember page,sir hero chanc rememb page,"[you, sir, are my hero., any chance you rememb...","[[you, ,, sir, ,, are, my, hero, .], [any, cha...","[[(you, PRP), (,, ,), (sir, VB), (,, ,), (are,..."


In [43]:
cleaned_df = cleaned_df.drop(columns=['for_tagging_use_sent_token','for_tagging_use','for_tagging_use_word_token'])
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,text_no_stop_words,stemmed,POS_tagging
0,0000997932d777bf,0,0,0,0,0,0,explanation edits made username hardcore metal...,explan edit made usernam hardcor metallica fan...,"[[(explanation, NN), (why, WRB), (the, DT), (e..."
1,000103f0d9cfb60f,0,0,0,0,0,0,daww matches background colour seemingly stuck...,daww match background colour seem stuck thank ...,"[[(d'aww, NN), (!, .)], [(he, PRP), (matches, ..."
2,000113f07ec002fd,0,0,0,0,0,0,hey man really trying edit war guy constantly ...,hey man realli tri edit war guy constant remov...,"[[(hey, NN), (man, NN), (,, ,), (i, JJ), (am, ..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,cannot make real suggestions improvement wonde...,cannot make real suggest improv wonder section...,"[[("", IN), (more, JJR), (i, JJ), (cannot, NNS)..."
4,0001d958c54c6e35,0,0,0,0,0,0,sir hero chance remember page,sir hero chanc rememb page,"[[(you, PRP), (,, ,), (sir, VB), (,, ,), (are,..."


## Lemmatization

In [36]:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')
# stop = set(stopwords.words('english'))

In [37]:
lemmatizer = WordNetLemmatizer()
print("before",cleaned_df.iloc[0]['clean_text'])
lm = [lemmatizer.lemmatize(t) for t in word_tokenize(cleaned_df.iloc[0]['clean_text'])]
print("after"," ".join(lm))

before explanation
why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now892053827


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/geraldinesoo/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
def process(sentence, stop):
    wordsList = nltk.word_tokenize(sentence)
 
    # removing stop words from wordList
    wordsList = [w for w in wordsList if not w in stop]
 
    #  Using a Tagger. Which is part-of-speech
    # tagger or POS-tagger.
    tagged = nltk.pos_tag(wordsList)
    pos_tagged_text = [ word for word, tag in tagged if tag.startswith('JJ') or tag.startswith('NN') or tag.startswith('RB') or tag.startswith('VB')]
    return pos_tagged_text 

In [None]:
cleaned_df['lemmatization'] = cleaned_df['clean_text'].apply(lambda x: " ".join([lemmatizer.lemmatize(t) for t in process(x, stop)]))
#cleaned_df['lemmatization_w_pos'] = cleaned_df['taged_words'].apply(lambda x: " ".join([lemmatizer.lemmatize(t) for t in word_tokenize(x)]))

In [None]:
print("before",cleaned_df.iloc[5]['clean_text'])
print("after",cleaned_df.iloc[5]['lemmatization'])

## Weighted Words - Bag of Words (BoW) - Bag of n-grams

### Frequency Vector

In [None]:
def bagging(ngram,cleaned_df):
  count_vec = CountVectorizer(ngram_range = (ngram, ngram))
  weight_words = count_vec.fit_transform(cleaned_df['lemmatization'])
  return pd.DataFrame(weight_words.toarray().transpose(), index=count_vec.get_feature_names_out())


In [None]:
uni_gram = bagging(1, cleaned_df)
uni_gram.to_csv("/BOW_Unigram.csv")
bi_gram = bagging(2, cleaned_df)
bi_gram.to_csv("/BOW_Bigram.csv")
tri_gram = bagging(3, cleaned_df)
tri_gram.to_csv("/BOW_Trigram.csv")

### Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
def TF_IDF(ngram, cleaned_df):
    tfidf = TfidfVectorizer(ngram_range = (ngram, ngram))
    weight_words = tfidf.fit_transform(cleaned_df['lemmatization'])
    return pd.DataFrame(weight_words.toarray().transpose(), index=tfidf.get_feature_names_out())

In [None]:
uni_gram = TF_IDF(1, cleaned_df)
# uni_gram.to_csv("/TF_IDF_Unigram.csv")
# bi_gram = TF_IDF(2, cleaned_df)
# bi_gram.to_csv("/TF_IDF_Bigram.csv")
# tri_gram = TF_IDF(3, cleaned_df)
# tri_gram.to_csv("/TF_IDF_Trigram.csv")