In [1]:
import pandas as pd
import string
import contractions
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
with open("./for_preprocessing.csv") as f:
    data = pd.read_csv(f)
f.close()

In [3]:
data.head()

Unnamed: 0,id,title,url,author,score,publish_date,num_comment,permalink,flair,selftext
0,abd9dz,"Allow banks to hold passports of loan-takers, ...",https://www.reddit.com/r/india/comments/abd9dz...,askquestionsdude,1,2019-01-01 00:45:17,0,/r/india/comments/abd9dz/allow_banks_to_hold_p...,Business/Finance,[removed]
1,abde0g,Tamil Nadu to usher in New Year on green note ...,https://www.livemint.com/Politics/95rPBVTWmQqM...,askquestionsdude,1,2019-01-01 01:00:15,3,/r/india/comments/abde0g/tamil_nadu_to_usher_i...,Policy/Economy,
2,abe9wz,"Worst of the NPA crisis is over, says RBI report",https://www.livemint.com/Industry/LohId3yWEeQ1...,harddisc,1,2019-01-01 02:56:12,0,/r/india/comments/abe9wz/worst_of_the_npa_cris...,Policy/Economy,
3,aber36,Ravi Shastri's comments about O'Keefe,https://www.reddit.com/r/india/comments/aber36...,trowaaaay,1,2019-01-01 04:03:40,3,/r/india/comments/aber36/ravi_shastris_comment...,Sports,"Kerry O' Keefe joked that Mayank Agarwal ""appa..."
4,abervj,A picture I clicked at the pillar rocks in Kod...,https://i.redd.it/s2gh1xg4kq721.jpg,Daiguren_Hyorinmaru_,1,2019-01-01 04:07:10,52,/r/india/comments/abervj/a_picture_i_clicked_a...,Photography,


### Merging Title and Self-Text as one
As the self-text is the extendent description of the title, the self-text and the tiles are mereged together

In [4]:
data['selftext'] = data['selftext'].astype(str)
empties = ['nan', '[deleted]', '[removed]']
for empty in empties:
  print(empty, len(data[data['selftext'] == empty]))

nan 15673
[deleted] 55
[removed] 2019


replacting nan, [deleted], [removed] and replacing with empty character ""

In [5]:
data['selftext'] = data['selftext'].apply(lambda x: "" if x in empties else x)
for empty in empties:
  print(empty, len(data[data['selftext'] == empty]))

nan 0
[deleted] 0
[removed] 0


Below we merge the title and the selftext parts and drop the unnecessary columns

In [6]:
data['title'] = data['title'] + " " + data['selftext']
data = data.drop(['id', 'url', 'author', 'score', 'publish_date', 'num_comment', 'permalink', 'selftext'], axis = 1)
data.head()

Unnamed: 0,title,flair
0,"Allow banks to hold passports of loan-takers, ...",Business/Finance
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy
2,"Worst of the NPA crisis is over, says RBI report",Policy/Economy
3,Ravi Shastri's comments about O'Keefe Kerry O'...,Sports
4,A picture I clicked at the pillar rocks in Kod...,Photography


## Removing URLs

In [7]:
data['title'] = data['title'].apply(lambda x : re.compile(r'https?:\/\/\S+').sub(" ", x) )

## Removing Punctuations
Removing all the symbols and punctuation and multiple-continuous puntuations like "!!!", "...", etc.


In [8]:
data['title'] = data['title'].apply(lambda x : re.sub(r"[^a-zA-Z]+", r" ", x) )

## Expanding Contractions
Contractions are those little literary shortcuts we take where instead of “Should have” we prefer “Should’ve” or where “Do not” quickly becomes “Don’t”. We are going to add a new column to our dataframe called “no_contract” and apply a lambda function to the “rating_description” field which will expand any contractions. Be aware of the fact the expanded contractions will be effectively tokenized together. In other words, “I’ve” = “I have” instead of “I”, “have”.

We ultimately would want the expanded contractions to be tokenized separately into “I”, “have”, therefore, let’s convert the lists under the “no_contract” column back into strings.

In [9]:
data['no_contract'] = data['title'].apply(lambda x :[contractions.fix(word) for word in x.split()])

In [10]:
data

Unnamed: 0,title,flair,no_contract
0,Allow banks to hold passports of loan takers M...,Business/Finance,"[Allow, banks, to, hold, passports, of, loan, ..."
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy,"[Tamil, Nadu, to, usher, in, New, Year, on, gr..."
2,Worst of the NPA crisis is over says RBI report,Policy/Economy,"[Worst, of, the, NPA, crisis, is, over, says, ..."
3,Ravi Shastri s comments about O Keefe Kerry O ...,Sports,"[Ravi, Shastri, s, comments, about, O, Keefe, ..."
4,A picture I clicked at the pillar rocks in Kod...,Photography,"[A, picture, I, clicked, at, the, pillar, rock..."
...,...,...,...
20743,India leaves behind local transmission phase m...,Coronavirus,"[India, leaves, behind, local, transmission, p..."
20744,Here s my side of the story as a small busines...,Policy/Economy,"[Here, s, my, side, of, the, story, as, a, sma..."
20745,Indian astronauts training on hold in Russia d...,Coronavirus,"[Indian, astronauts, training, on, hold, in, R..."
20746,COVID Government Threatens Kashmir Doctors Wit...,Coronavirus,"[COVID, Government, Threatens, Kashmir, Doctor..."


In [11]:
data['title_description'] = [" ".join(map(str, l)) for l in data['no_contract']]

In [12]:
data

Unnamed: 0,title,flair,no_contract,title_description
0,Allow banks to hold passports of loan takers M...,Business/Finance,"[Allow, banks, to, hold, passports, of, loan, ...",Allow banks to hold passports of loan takers M...
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...",Tamil Nadu to usher in New Year on green note ...
2,Worst of the NPA crisis is over says RBI report,Policy/Economy,"[Worst, of, the, NPA, crisis, is, over, says, ...",Worst of the NPA crisis is over says RBI report
3,Ravi Shastri s comments about O Keefe Kerry O ...,Sports,"[Ravi, Shastri, s, comments, about, O, Keefe, ...",Ravi Shastri s comments about O Keefe Kerry O ...
4,A picture I clicked at the pillar rocks in Kod...,Photography,"[A, picture, I, clicked, at, the, pillar, rock...",A picture I clicked at the pillar rocks in Kod...
...,...,...,...,...
20743,India leaves behind local transmission phase m...,Coronavirus,"[India, leaves, behind, local, transmission, p...",India leaves behind local transmission phase m...
20744,Here s my side of the story as a small busines...,Policy/Economy,"[Here, s, my, side, of, the, story, as, a, sma...",Here s my side of the story as a small busines...
20745,Indian astronauts training on hold in Russia d...,Coronavirus,"[Indian, astronauts, training, on, hold, in, R...",Indian astronauts training on hold in Russia d...
20746,COVID Government Threatens Kashmir Doctors Wit...,Coronavirus,"[COVID, Government, Threatens, Kashmir, Doctor...",COVID Government Threatens Kashmir Doctors Wit...


## Tokenization
Now that we have converted the lists back into strings we can apply proper tokenization in order to split each individual word into a token. We will apply NLTK.word_tokenize() function to the “rating_description_list” column and create a new column named “tokenized”.

In [13]:
data['tokenized'] = data['title_description'].apply(word_tokenize)
data

Unnamed: 0,title,flair,no_contract,title_description,tokenized
0,Allow banks to hold passports of loan takers M...,Business/Finance,"[Allow, banks, to, hold, passports, of, loan, ...",Allow banks to hold passports of loan takers M...,"[Allow, banks, to, hold, passports, of, loan, ..."
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...",Tamil Nadu to usher in New Year on green note ...,"[Tamil, Nadu, to, usher, in, New, Year, on, gr..."
2,Worst of the NPA crisis is over says RBI report,Policy/Economy,"[Worst, of, the, NPA, crisis, is, over, says, ...",Worst of the NPA crisis is over says RBI report,"[Worst, of, the, NPA, crisis, is, over, says, ..."
3,Ravi Shastri s comments about O Keefe Kerry O ...,Sports,"[Ravi, Shastri, s, comments, about, O, Keefe, ...",Ravi Shastri s comments about O Keefe Kerry O ...,"[Ravi, Shastri, s, comments, about, O, Keefe, ..."
4,A picture I clicked at the pillar rocks in Kod...,Photography,"[A, picture, I, clicked, at, the, pillar, rock...",A picture I clicked at the pillar rocks in Kod...,"[A, picture, I, clicked, at, the, pillar, rock..."
...,...,...,...,...,...
20743,India leaves behind local transmission phase m...,Coronavirus,"[India, leaves, behind, local, transmission, p...",India leaves behind local transmission phase m...,"[India, leaves, behind, local, transmission, p..."
20744,Here s my side of the story as a small busines...,Policy/Economy,"[Here, s, my, side, of, the, story, as, a, sma...",Here s my side of the story as a small busines...,"[Here, s, my, side, of, the, story, as, a, sma..."
20745,Indian astronauts training on hold in Russia d...,Coronavirus,"[Indian, astronauts, training, on, hold, in, R...",Indian astronauts training on hold in Russia d...,"[Indian, astronauts, training, on, hold, in, R..."
20746,COVID Government Threatens Kashmir Doctors Wit...,Coronavirus,"[COVID, Government, Threatens, Kashmir, Doctor...",COVID Government Threatens Kashmir Doctors Wit...,"[COVID, Government, Threatens, Kashmir, Doctor..."


## Converting all Characters to Lowercase
Transforming all words to lowercase is also a very common pre-processing step. In this case, we will once again append a new column named “lower” to the dataframe which will transform all the tokenized words into lowercase. However, because we have to iterate over multiple words we will use a simple for-loop within a lambda function to apply the “lower” function to each word.

In [14]:
data['lower'] = data['tokenized'].apply(lambda x: [word.lower() for word in x])
data

Unnamed: 0,title,flair,no_contract,title_description,tokenized,lower
0,Allow banks to hold passports of loan takers M...,Business/Finance,"[Allow, banks, to, hold, passports, of, loan, ...",Allow banks to hold passports of loan takers M...,"[Allow, banks, to, hold, passports, of, loan, ...","[allow, banks, to, hold, passports, of, loan, ..."
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...",Tamil Nadu to usher in New Year on green note ...,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...","[tamil, nadu, to, usher, in, new, year, on, gr..."
2,Worst of the NPA crisis is over says RBI report,Policy/Economy,"[Worst, of, the, NPA, crisis, is, over, says, ...",Worst of the NPA crisis is over says RBI report,"[Worst, of, the, NPA, crisis, is, over, says, ...","[worst, of, the, npa, crisis, is, over, says, ..."
3,Ravi Shastri s comments about O Keefe Kerry O ...,Sports,"[Ravi, Shastri, s, comments, about, O, Keefe, ...",Ravi Shastri s comments about O Keefe Kerry O ...,"[Ravi, Shastri, s, comments, about, O, Keefe, ...","[ravi, shastri, s, comments, about, o, keefe, ..."
4,A picture I clicked at the pillar rocks in Kod...,Photography,"[A, picture, I, clicked, at, the, pillar, rock...",A picture I clicked at the pillar rocks in Kod...,"[A, picture, I, clicked, at, the, pillar, rock...","[a, picture, i, clicked, at, the, pillar, rock..."
...,...,...,...,...,...,...
20743,India leaves behind local transmission phase m...,Coronavirus,"[India, leaves, behind, local, transmission, p...",India leaves behind local transmission phase m...,"[India, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p..."
20744,Here s my side of the story as a small busines...,Policy/Economy,"[Here, s, my, side, of, the, story, as, a, sma...",Here s my side of the story as a small busines...,"[Here, s, my, side, of, the, story, as, a, sma...","[here, s, my, side, of, the, story, as, a, sma..."
20745,Indian astronauts training on hold in Russia d...,Coronavirus,"[Indian, astronauts, training, on, hold, in, R...",Indian astronauts training on hold in Russia d...,"[Indian, astronauts, training, on, hold, in, R...","[indian, astronauts, training, on, hold, in, r..."
20746,COVID Government Threatens Kashmir Doctors Wit...,Coronavirus,"[COVID, Government, Threatens, Kashmir, Doctor...",COVID Government Threatens Kashmir Doctors Wit...,"[COVID, Government, Threatens, Kashmir, Doctor...","[covid, government, threatens, kashmir, doctor..."


## Removing Stopwords
Stopwords are typically useless words and do not add much meaning to a sentence. In the English language common stopwords include “you, he, she, in, a, has, are, etc.”. First, we need to import the NLTK stopwords library and set our stopwords to “english”. We are going to add a new column “no_stopwords” which will remove the stopwords from the “no_punc” column since it has been tokenized, had been converted to lowercase and punctuation was removed. Once again a for-loop within a lambda function will iterate over the tokens in “no_punc” and only return the tokens which do no exist in our “stop_words” variable.



In [15]:
stop_words = set(stopwords.words('english') + ['amp']) # added 'amp' to remove ampersands
data['stopwords_removed'] = data['lower'].apply(lambda x: [word for word in x if word not in stop_words])
data

Unnamed: 0,title,flair,no_contract,title_description,tokenized,lower,stopwords_removed
0,Allow banks to hold passports of loan takers M...,Business/Finance,"[Allow, banks, to, hold, passports, of, loan, ...",Allow banks to hold passports of loan takers M...,"[Allow, banks, to, hold, passports, of, loan, ...","[allow, banks, to, hold, passports, of, loan, ...","[allow, banks, hold, passports, loan, takers, ..."
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...",Tamil Nadu to usher in New Year on green note ...,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...","[tamil, nadu, to, usher, in, new, year, on, gr...","[tamil, nadu, usher, new, year, green, note, b..."
2,Worst of the NPA crisis is over says RBI report,Policy/Economy,"[Worst, of, the, NPA, crisis, is, over, says, ...",Worst of the NPA crisis is over says RBI report,"[Worst, of, the, NPA, crisis, is, over, says, ...","[worst, of, the, npa, crisis, is, over, says, ...","[worst, npa, crisis, says, rbi, report]"
3,Ravi Shastri s comments about O Keefe Kerry O ...,Sports,"[Ravi, Shastri, s, comments, about, O, Keefe, ...",Ravi Shastri s comments about O Keefe Kerry O ...,"[Ravi, Shastri, s, comments, about, O, Keefe, ...","[ravi, shastri, s, comments, about, o, keefe, ...","[ravi, shastri, comments, keefe, kerry, keefe,..."
4,A picture I clicked at the pillar rocks in Kod...,Photography,"[A, picture, I, clicked, at, the, pillar, rock...",A picture I clicked at the pillar rocks in Kod...,"[A, picture, I, clicked, at, the, pillar, rock...","[a, picture, i, clicked, at, the, pillar, rock...","[picture, clicked, pillar, rocks, kodaikanal, ..."
...,...,...,...,...,...,...,...
20743,India leaves behind local transmission phase m...,Coronavirus,"[India, leaves, behind, local, transmission, p...",India leaves behind local transmission phase m...,"[India, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p..."
20744,Here s my side of the story as a small busines...,Policy/Economy,"[Here, s, my, side, of, the, story, as, a, sma...",Here s my side of the story as a small busines...,"[Here, s, my, side, of, the, story, as, a, sma...","[here, s, my, side, of, the, story, as, a, sma...","[side, story, small, businessman]"
20745,Indian astronauts training on hold in Russia d...,Coronavirus,"[Indian, astronauts, training, on, hold, in, R...",Indian astronauts training on hold in Russia d...,"[Indian, astronauts, training, on, hold, in, R...","[indian, astronauts, training, on, hold, in, r...","[indian, astronauts, training, hold, russia, d..."
20746,COVID Government Threatens Kashmir Doctors Wit...,Coronavirus,"[COVID, Government, Threatens, Kashmir, Doctor...",COVID Government Threatens Kashmir Doctors Wit...,"[COVID, Government, Threatens, Kashmir, Doctor...","[covid, government, threatens, kashmir, doctor...","[covid, government, threatens, kashmir, doctor..."


## Lemmatization
The idea of stemming is to reduce different forms of word usage into its root word. For example, “drive”, “drove”, “driving”, “driven”, “driver” are derivatives of the word “drive” and very often researchers want to remove this variability from their corpus. Compared to lemmatization, stemming is certainly the less complicated method but it often does not produce a dictionary-specific morphological root of the word. In other words, stemming the word “pies” will often produce a root of “pi” whereas lemmatization will find the morphological root of “pie”.

Instead of taking the easy way out with stemming, let’s apply lemmatization to our data as it requires some additional steps compared to stemming.

First, we have to apply parts of speech tags, in other words, determine the part of speech (ie. noun, verb, adverb, etc.) for each word.

In [16]:
data['pos_tags'] = data['stopwords_removed'].apply(nltk.tag.pos_tag)
data

Unnamed: 0,title,flair,no_contract,title_description,tokenized,lower,stopwords_removed,pos_tags
0,Allow banks to hold passports of loan takers M...,Business/Finance,"[Allow, banks, to, hold, passports, of, loan, ...",Allow banks to hold passports of loan takers M...,"[Allow, banks, to, hold, passports, of, loan, ...","[allow, banks, to, hold, passports, of, loan, ...","[allow, banks, hold, passports, loan, takers, ...","[(allow, JJ), (banks, NNS), (hold, VBP), (pass..."
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...",Tamil Nadu to usher in New Year on green note ...,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...","[tamil, nadu, to, usher, in, new, year, on, gr...","[tamil, nadu, usher, new, year, green, note, b...","[(tamil, NN), (nadu, MD), (usher, RB), (new, J..."
2,Worst of the NPA crisis is over says RBI report,Policy/Economy,"[Worst, of, the, NPA, crisis, is, over, says, ...",Worst of the NPA crisis is over says RBI report,"[Worst, of, the, NPA, crisis, is, over, says, ...","[worst, of, the, npa, crisis, is, over, says, ...","[worst, npa, crisis, says, rbi, report]","[(worst, JJS), (npa, NN), (crisis, NN), (says,..."
3,Ravi Shastri s comments about O Keefe Kerry O ...,Sports,"[Ravi, Shastri, s, comments, about, O, Keefe, ...",Ravi Shastri s comments about O Keefe Kerry O ...,"[Ravi, Shastri, s, comments, about, O, Keefe, ...","[ravi, shastri, s, comments, about, o, keefe, ...","[ravi, shastri, comments, keefe, kerry, keefe,...","[(ravi, NN), (shastri, NN), (comments, NNS), (..."
4,A picture I clicked at the pillar rocks in Kod...,Photography,"[A, picture, I, clicked, at, the, pillar, rock...",A picture I clicked at the pillar rocks in Kod...,"[A, picture, I, clicked, at, the, pillar, rock...","[a, picture, i, clicked, at, the, pillar, rock...","[picture, clicked, pillar, rocks, kodaikanal, ...","[(picture, NN), (clicked, VBD), (pillar, JJ), ..."
...,...,...,...,...,...,...,...,...
20743,India leaves behind local transmission phase m...,Coronavirus,"[India, leaves, behind, local, transmission, p...",India leaves behind local transmission phase m...,"[India, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p...","[(india, NN), (leaves, VBZ), (behind, IN), (lo..."
20744,Here s my side of the story as a small busines...,Policy/Economy,"[Here, s, my, side, of, the, story, as, a, sma...",Here s my side of the story as a small busines...,"[Here, s, my, side, of, the, story, as, a, sma...","[here, s, my, side, of, the, story, as, a, sma...","[side, story, small, businessman]","[(side, JJ), (story, NN), (small, JJ), (busine..."
20745,Indian astronauts training on hold in Russia d...,Coronavirus,"[Indian, astronauts, training, on, hold, in, R...",Indian astronauts training on hold in Russia d...,"[Indian, astronauts, training, on, hold, in, R...","[indian, astronauts, training, on, hold, in, r...","[indian, astronauts, training, hold, russia, d...","[(indian, JJ), (astronauts, NNS), (training, V..."
20746,COVID Government Threatens Kashmir Doctors Wit...,Coronavirus,"[COVID, Government, Threatens, Kashmir, Doctor...",COVID Government Threatens Kashmir Doctors Wit...,"[COVID, Government, Threatens, Kashmir, Doctor...","[covid, government, threatens, kashmir, doctor...","[covid, government, threatens, kashmir, doctor...","[(covid, JJ), (government, NN), (threatens, VB..."


In [17]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [18]:
data['wordnet_pos'] = data['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
data

Unnamed: 0,title,flair,no_contract,title_description,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos
0,Allow banks to hold passports of loan takers M...,Business/Finance,"[Allow, banks, to, hold, passports, of, loan, ...",Allow banks to hold passports of loan takers M...,"[Allow, banks, to, hold, passports, of, loan, ...","[allow, banks, to, hold, passports, of, loan, ...","[allow, banks, hold, passports, loan, takers, ...","[(allow, JJ), (banks, NNS), (hold, VBP), (pass...","[(allow, a), (banks, n), (hold, v), (passports..."
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...",Tamil Nadu to usher in New Year on green note ...,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...","[tamil, nadu, to, usher, in, new, year, on, gr...","[tamil, nadu, usher, new, year, green, note, b...","[(tamil, NN), (nadu, MD), (usher, RB), (new, J...","[(tamil, n), (nadu, n), (usher, r), (new, a), ..."
2,Worst of the NPA crisis is over says RBI report,Policy/Economy,"[Worst, of, the, NPA, crisis, is, over, says, ...",Worst of the NPA crisis is over says RBI report,"[Worst, of, the, NPA, crisis, is, over, says, ...","[worst, of, the, npa, crisis, is, over, says, ...","[worst, npa, crisis, says, rbi, report]","[(worst, JJS), (npa, NN), (crisis, NN), (says,...","[(worst, a), (npa, n), (crisis, n), (says, v),..."
3,Ravi Shastri s comments about O Keefe Kerry O ...,Sports,"[Ravi, Shastri, s, comments, about, O, Keefe, ...",Ravi Shastri s comments about O Keefe Kerry O ...,"[Ravi, Shastri, s, comments, about, O, Keefe, ...","[ravi, shastri, s, comments, about, o, keefe, ...","[ravi, shastri, comments, keefe, kerry, keefe,...","[(ravi, NN), (shastri, NN), (comments, NNS), (...","[(ravi, n), (shastri, n), (comments, n), (keef..."
4,A picture I clicked at the pillar rocks in Kod...,Photography,"[A, picture, I, clicked, at, the, pillar, rock...",A picture I clicked at the pillar rocks in Kod...,"[A, picture, I, clicked, at, the, pillar, rock...","[a, picture, i, clicked, at, the, pillar, rock...","[picture, clicked, pillar, rocks, kodaikanal, ...","[(picture, NN), (clicked, VBD), (pillar, JJ), ...","[(picture, n), (clicked, v), (pillar, a), (roc..."
...,...,...,...,...,...,...,...,...,...
20743,India leaves behind local transmission phase m...,Coronavirus,"[India, leaves, behind, local, transmission, p...",India leaves behind local transmission phase m...,"[India, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p...","[(india, NN), (leaves, VBZ), (behind, IN), (lo...","[(india, n), (leaves, v), (behind, n), (local,..."
20744,Here s my side of the story as a small busines...,Policy/Economy,"[Here, s, my, side, of, the, story, as, a, sma...",Here s my side of the story as a small busines...,"[Here, s, my, side, of, the, story, as, a, sma...","[here, s, my, side, of, the, story, as, a, sma...","[side, story, small, businessman]","[(side, JJ), (story, NN), (small, JJ), (busine...","[(side, a), (story, n), (small, a), (businessm..."
20745,Indian astronauts training on hold in Russia d...,Coronavirus,"[Indian, astronauts, training, on, hold, in, R...",Indian astronauts training on hold in Russia d...,"[Indian, astronauts, training, on, hold, in, R...","[indian, astronauts, training, on, hold, in, r...","[indian, astronauts, training, hold, russia, d...","[(indian, JJ), (astronauts, NNS), (training, V...","[(indian, a), (astronauts, n), (training, v), ..."
20746,COVID Government Threatens Kashmir Doctors Wit...,Coronavirus,"[COVID, Government, Threatens, Kashmir, Doctor...",COVID Government Threatens Kashmir Doctors Wit...,"[COVID, Government, Threatens, Kashmir, Doctor...","[covid, government, threatens, kashmir, doctor...","[covid, government, threatens, kashmir, doctor...","[(covid, JJ), (government, NN), (threatens, VB...","[(covid, a), (government, n), (threatens, v), ..."


In [19]:
wnl  = WordNetLemmatizer()
data['lemmatized'] = data['wordnet_pos'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])
data

Unnamed: 0,title,flair,no_contract,title_description,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,Allow banks to hold passports of loan takers M...,Business/Finance,"[Allow, banks, to, hold, passports, of, loan, ...",Allow banks to hold passports of loan takers M...,"[Allow, banks, to, hold, passports, of, loan, ...","[allow, banks, to, hold, passports, of, loan, ...","[allow, banks, hold, passports, loan, takers, ...","[(allow, JJ), (banks, NNS), (hold, VBP), (pass...","[(allow, a), (banks, n), (hold, v), (passports...","[allow, bank, hold, passport, loan, taker, mad..."
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...",Tamil Nadu to usher in New Year on green note ...,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...","[tamil, nadu, to, usher, in, new, year, on, gr...","[tamil, nadu, usher, new, year, green, note, b...","[(tamil, NN), (nadu, MD), (usher, RB), (new, J...","[(tamil, n), (nadu, n), (usher, r), (new, a), ...","[tamil, nadu, usher, new, year, green, note, b..."
2,Worst of the NPA crisis is over says RBI report,Policy/Economy,"[Worst, of, the, NPA, crisis, is, over, says, ...",Worst of the NPA crisis is over says RBI report,"[Worst, of, the, NPA, crisis, is, over, says, ...","[worst, of, the, npa, crisis, is, over, says, ...","[worst, npa, crisis, says, rbi, report]","[(worst, JJS), (npa, NN), (crisis, NN), (says,...","[(worst, a), (npa, n), (crisis, n), (says, v),...","[bad, npa, crisis, say, rbi, report]"
3,Ravi Shastri s comments about O Keefe Kerry O ...,Sports,"[Ravi, Shastri, s, comments, about, O, Keefe, ...",Ravi Shastri s comments about O Keefe Kerry O ...,"[Ravi, Shastri, s, comments, about, O, Keefe, ...","[ravi, shastri, s, comments, about, o, keefe, ...","[ravi, shastri, comments, keefe, kerry, keefe,...","[(ravi, NN), (shastri, NN), (comments, NNS), (...","[(ravi, n), (shastri, n), (comments, n), (keef...","[ravi, shastri, comment, keefe, kerry, keefe, ..."
4,A picture I clicked at the pillar rocks in Kod...,Photography,"[A, picture, I, clicked, at, the, pillar, rock...",A picture I clicked at the pillar rocks in Kod...,"[A, picture, I, clicked, at, the, pillar, rock...","[a, picture, i, clicked, at, the, pillar, rock...","[picture, clicked, pillar, rocks, kodaikanal, ...","[(picture, NN), (clicked, VBD), (pillar, JJ), ...","[(picture, n), (clicked, v), (pillar, a), (roc...","[picture, click, pillar, rock, kodaikanal, rec..."
...,...,...,...,...,...,...,...,...,...,...
20743,India leaves behind local transmission phase m...,Coronavirus,"[India, leaves, behind, local, transmission, p...",India leaves behind local transmission phase m...,"[India, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p...","[(india, NN), (leaves, VBZ), (behind, IN), (lo...","[(india, n), (leaves, v), (behind, n), (local,...","[india, leave, behind, local, transmission, ph..."
20744,Here s my side of the story as a small busines...,Policy/Economy,"[Here, s, my, side, of, the, story, as, a, sma...",Here s my side of the story as a small busines...,"[Here, s, my, side, of, the, story, as, a, sma...","[here, s, my, side, of, the, story, as, a, sma...","[side, story, small, businessman]","[(side, JJ), (story, NN), (small, JJ), (busine...","[(side, a), (story, n), (small, a), (businessm...","[side, story, small, businessman]"
20745,Indian astronauts training on hold in Russia d...,Coronavirus,"[Indian, astronauts, training, on, hold, in, R...",Indian astronauts training on hold in Russia d...,"[Indian, astronauts, training, on, hold, in, R...","[indian, astronauts, training, on, hold, in, r...","[indian, astronauts, training, hold, russia, d...","[(indian, JJ), (astronauts, NNS), (training, V...","[(indian, a), (astronauts, n), (training, v), ...","[indian, astronaut, train, hold, russia, due, ..."
20746,COVID Government Threatens Kashmir Doctors Wit...,Coronavirus,"[COVID, Government, Threatens, Kashmir, Doctor...",COVID Government Threatens Kashmir Doctors Wit...,"[COVID, Government, Threatens, Kashmir, Doctor...","[covid, government, threatens, kashmir, doctor...","[covid, government, threatens, kashmir, doctor...","[(covid, JJ), (government, NN), (threatens, VB...","[(covid, a), (government, n), (threatens, v), ...","[covid, government, threaten, kashmir, doctor,..."


## Removing Small Words
Remove all words with length < 2 from the data, some redundant and unrecognized small words may not be in the stopword vocabulary.

In [20]:
data['small_words_rem'] = data['lemmatized'].apply(lambda x: [t for t in x if len(t) > 2])
data

Unnamed: 0,title,flair,no_contract,title_description,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,small_words_rem
0,Allow banks to hold passports of loan takers M...,Business/Finance,"[Allow, banks, to, hold, passports, of, loan, ...",Allow banks to hold passports of loan takers M...,"[Allow, banks, to, hold, passports, of, loan, ...","[allow, banks, to, hold, passports, of, loan, ...","[allow, banks, hold, passports, loan, takers, ...","[(allow, JJ), (banks, NNS), (hold, VBP), (pass...","[(allow, a), (banks, n), (hold, v), (passports...","[allow, bank, hold, passport, loan, taker, mad...","[allow, bank, hold, passport, loan, taker, mad..."
1,Tamil Nadu to usher in New Year on green note ...,Policy/Economy,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...",Tamil Nadu to usher in New Year on green note ...,"[Tamil, Nadu, to, usher, in, New, Year, on, gr...","[tamil, nadu, to, usher, in, new, year, on, gr...","[tamil, nadu, usher, new, year, green, note, b...","[(tamil, NN), (nadu, MD), (usher, RB), (new, J...","[(tamil, n), (nadu, n), (usher, r), (new, a), ...","[tamil, nadu, usher, new, year, green, note, b...","[tamil, nadu, usher, new, year, green, note, b..."
2,Worst of the NPA crisis is over says RBI report,Policy/Economy,"[Worst, of, the, NPA, crisis, is, over, says, ...",Worst of the NPA crisis is over says RBI report,"[Worst, of, the, NPA, crisis, is, over, says, ...","[worst, of, the, npa, crisis, is, over, says, ...","[worst, npa, crisis, says, rbi, report]","[(worst, JJS), (npa, NN), (crisis, NN), (says,...","[(worst, a), (npa, n), (crisis, n), (says, v),...","[bad, npa, crisis, say, rbi, report]","[bad, npa, crisis, say, rbi, report]"
3,Ravi Shastri s comments about O Keefe Kerry O ...,Sports,"[Ravi, Shastri, s, comments, about, O, Keefe, ...",Ravi Shastri s comments about O Keefe Kerry O ...,"[Ravi, Shastri, s, comments, about, O, Keefe, ...","[ravi, shastri, s, comments, about, o, keefe, ...","[ravi, shastri, comments, keefe, kerry, keefe,...","[(ravi, NN), (shastri, NN), (comments, NNS), (...","[(ravi, n), (shastri, n), (comments, n), (keef...","[ravi, shastri, comment, keefe, kerry, keefe, ...","[ravi, shastri, comment, keefe, kerry, keefe, ..."
4,A picture I clicked at the pillar rocks in Kod...,Photography,"[A, picture, I, clicked, at, the, pillar, rock...",A picture I clicked at the pillar rocks in Kod...,"[A, picture, I, clicked, at, the, pillar, rock...","[a, picture, i, clicked, at, the, pillar, rock...","[picture, clicked, pillar, rocks, kodaikanal, ...","[(picture, NN), (clicked, VBD), (pillar, JJ), ...","[(picture, n), (clicked, v), (pillar, a), (roc...","[picture, click, pillar, rock, kodaikanal, rec...","[picture, click, pillar, rock, kodaikanal, rec..."
...,...,...,...,...,...,...,...,...,...,...,...
20743,India leaves behind local transmission phase m...,Coronavirus,"[India, leaves, behind, local, transmission, p...",India leaves behind local transmission phase m...,"[India, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p...","[india, leaves, behind, local, transmission, p...","[(india, NN), (leaves, VBZ), (behind, IN), (lo...","[(india, n), (leaves, v), (behind, n), (local,...","[india, leave, behind, local, transmission, ph...","[india, leave, behind, local, transmission, ph..."
20744,Here s my side of the story as a small busines...,Policy/Economy,"[Here, s, my, side, of, the, story, as, a, sma...",Here s my side of the story as a small busines...,"[Here, s, my, side, of, the, story, as, a, sma...","[here, s, my, side, of, the, story, as, a, sma...","[side, story, small, businessman]","[(side, JJ), (story, NN), (small, JJ), (busine...","[(side, a), (story, n), (small, a), (businessm...","[side, story, small, businessman]","[side, story, small, businessman]"
20745,Indian astronauts training on hold in Russia d...,Coronavirus,"[Indian, astronauts, training, on, hold, in, R...",Indian astronauts training on hold in Russia d...,"[Indian, astronauts, training, on, hold, in, R...","[indian, astronauts, training, on, hold, in, r...","[indian, astronauts, training, hold, russia, d...","[(indian, JJ), (astronauts, NNS), (training, V...","[(indian, a), (astronauts, n), (training, v), ...","[indian, astronaut, train, hold, russia, due, ...","[indian, astronaut, train, hold, russia, due, ..."
20746,COVID Government Threatens Kashmir Doctors Wit...,Coronavirus,"[COVID, Government, Threatens, Kashmir, Doctor...",COVID Government Threatens Kashmir Doctors Wit...,"[COVID, Government, Threatens, Kashmir, Doctor...","[covid, government, threatens, kashmir, doctor...","[covid, government, threatens, kashmir, doctor...","[(covid, JJ), (government, NN), (threatens, VB...","[(covid, a), (government, n), (threatens, v), ...","[covid, government, threaten, kashmir, doctor,...","[covid, government, threaten, kashmir, doctor,..."


## Saving the preprocessed data

In [21]:
data.to_pickle("data_cleaned.pkl")