# TEXT PREPROCESSING 


In [1]:
import pandas as pd
import numpy as np

### Reading the csv file

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [3]:
df_text = pd.read_csv(r'C:\Users\SD-16\Downloads\text_data_pratice.csv')
df_text

Unnamed: 0,response
0,I would love to stay at CW because I really lo...
1,"CW is a wonderful place to work in ,salary is ..."
2,"Even if I do plan to stay for the experience, ..."
3,its will great to stay with CW but you get sta...
4,I am constantly seeking new challenges and my ...
5,"Working environment is conducive, good team wo..."
6,CW is a moving train that improves every quart...
7,There is room for career growth and developmen...
8,"It is mentally stressful, this is because, we ..."


# Lowercasing
##### 
The lowercasing is an important text preprocessing step in which we convert the text into the same casing

In [4]:
df_text['response']=df_text['response'].str.lower()
df_text.head()

Unnamed: 0,response
0,i would love to stay at cw because i really lo...
1,"cw is a wonderful place to work in ,salary is ..."
2,"even if i do plan to stay for the experience, ..."
3,its will great to stay with cw but you get sta...
4,i am constantly seeking new challenges and my ...


##  Remove Extra Whitespaces

A text may contain extra whitespace which is not desired as they increase the text size and not add any value to the data
In Python, we can do this by splitting the text and joining it back on the basis of single whitespace

In [5]:
def remove_whitespace(text):
    return  " ".join(text.split())


df_text['response']=df_text['response'].apply(remove_whitespace)
df_text

Unnamed: 0,response
0,i would love to stay at cw because i really lo...
1,"cw is a wonderful place to work in ,salary is ..."
2,"even if i do plan to stay for the experience, ..."
3,its will great to stay with cw but you get sta...
4,i am constantly seeking new challenges and my ...
5,"working environment is conducive, good team wo..."
6,cw is a moving train that improves every quart...
7,there is room for career growth and developmen...
8,"it is mentally stressful, this is because, we ..."


## Tokenization 


This is the process of splitting text into pieces called tokens. 
A corpus of text can be converted into tokens of sentences, words, or even characters

In [6]:
from nltk import word_tokenize
df_text['response']=df_text['response'].apply(lambda X: word_tokenize(X))
df_text.head()

Unnamed: 0,response
0,"[i, would, love, to, stay, at, cw, because, i,..."
1,"[cw, is, a, wonderful, place, to, work, in, ,,..."
2,"[even, if, i, do, plan, to, stay, for, the, ex..."
3,"[its, will, great, to, stay, with, cw, but, yo..."
4,"[i, am, constantly, seeking, new, challenges, ..."


## Spelling correction


This ensures we get better results from our model

In [7]:
from spellchecker import SpellChecker
def spell_check(text):
    
    result = []
    spell = SpellChecker()
    for word in text:
        correct_word = spell.correction(word)
        result.append(correct_word)
    
    return result

ModuleNotFoundError: No module named 'indexer'

## Stop words

Stopwords are trivial words like “I”, “the”, “you”, etc. that appear so frequently in the text that they may distort \
many NLP operations without adding much valuable information. \
So almost always you will have to remove stopwords from the corpus as part of your preprocessing.


In [8]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
en_stopwords = stopwords.words('english')

def remove_stopwords(text):
    result = []
    for token in text:
        if token not in en_stopwords:
            result.append(token)
            
    return result

In [10]:
df_text['response'] = df_text['response'].apply(remove_stopwords)
df_text.head()

Unnamed: 0,response
0,"[would, love, stay, cw, really, love, ., howev..."
1,"[cw, wonderful, place, work, ,, salary, attrac..."
2,"[even, plan, stay, experience, ,, terms, job, ..."
3,"[great, stay, cw, get, stagnated, single, spot..."
4,"[constantly, seeking, new, challenges, stay, d..."


## Removing Punctuations

Removing punctuation is an important text preprocessing step as it also does not add any value to the information

In [11]:
from nltk.tokenize import RegexpTokenizer

def remove_punct(text):
    
    tokenizer = RegexpTokenizer(r"\w+")
    lst=tokenizer.tokenize(' '.join(text))
    return lst

In [12]:
df_text['response'] = df_text['response'].apply(remove_punct)
df_text.head()

Unnamed: 0,response
0,"[would, love, stay, cw, really, love, however,..."
1,"[cw, wonderful, place, work, salary, attractiv..."
2,"[even, plan, stay, experience, terms, job, con..."
3,"[great, stay, cw, get, stagnated, single, spot..."
4,"[constantly, seeking, new, challenges, stay, d..."


## Removal of Accented Characters

Accented characters are important elements which are used to signify emphasis on a particular word during pronunciation or understanding. In some instances, the accent mark also clarifies the meaning of a word, which might be different without the accent. While their use in English is largely limited but there are very good chances that you will come across accented characters/letters in a free text corpus. Words such as résumé, café, prótest, divorcé, coördinate, exposé, latté etc.

In [13]:
# imports
import unicodedata
# function to remove accented characters
def remove_accented_chars(text):
    new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return new_text


In [14]:
# call function
remove_accented_chars('Sómě Áccěntěd těxt. Some words such as résumé, café, prótest, divorcé, coördinate, exposé, latté.')

'Some Accented text. Some words such as resume, cafe, protest, divorce, coordinate, expose, latte.'

## Expanding contraction words

Nowadays, many editors will induce contractions by default. For examples do not to don’t, I would to I’d, you are to you’re. Converting each contraction to its expanded, original form helps with text standardization

In [None]:
# imports
from pycontractions import Contractions
cont = Contractions(kv_model=model)
cont.load_models()
# function to expand contractions
def expand_contractions(text):
    text = list(cont.expand_texts([text], precise=True))[0]
    return text

In [None]:
# call function 
expand_contractions(“Y’all i’d contractions you’re expanded don’t think.”)

## Lemmatization

Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

In [17]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

def lemmatization(text):
    
    result=[]
    wordnet = WordNetLemmatizer()
    for token in text:
        result.append(wordnet.lemmatize(token))
    
    return result

In [18]:
df_text['response']=df_text['response'].apply(lemmatization)
df_text.head()

Unnamed: 0,response
0,"[would, love, stay, cw, really, love, however, thinking, fact, best, job, last, 3, year, without, real, growth, career, wise, bit, discouraging, still, thing, since, came, despite, many, recongnition, commendation, happy, working]"
1,"[cw, wonderful, place, work, salary, attractive, secure, neat, conducive, working, environment, management, employee, heart, mould, character, relate, provide, solution, problem, arising, others]"
2,"[even, plan, stay, experience, term, job, constantly, keep, toe, constantly, seeking, employment, elsewhere, afford, sudden, termination, due, performance, ve, got, bill, sort, department, good, place, stay, long, toxic, nature, job, n, t, one, deal, long, mental, emotional, stability, real, thing]"
3,"[great, stay, cw, get, stagnated, single, spot, long, time, create, temptation, leaving, get, better, offer]"
4,"[constantly, seeking, new, challenge, stay, depends, valuable, company, find, always, seek, improve, work, ethic, place, good, stead, leadership, position]"


This looks quite naive. words are still ending with -ing, ed, ly and this is not supposed to be so

So, now, we will make use of POS argument and try to lemmatize again and test a few variations

In [20]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

def lemmatization(text):
    
    result=[]
    wordnet = WordNetLemmatizer()
    for token,tag in pos_tag(text):
        pos=tag[0].lower()
        
        if pos not in ['a', 'r', 'n', 'v']:
            pos='n'
            
        result.append(wordnet.lemmatize(token,pos))
    
    return result

In [21]:
df_text['response']=df_text['response'].apply(lemmatization)
df_text.head()

Unnamed: 0,response
0,"[would, love, stay, cw, really, love, however, think, fact, best, job, last, 3, year, without, real, growth, career, wise, bit, discourage, still, thing, since, come, despite, many, recongnition, commendation, happy, working]"
1,"[cw, wonderful, place, work, salary, attractive, secure, neat, conducive, work, environment, management, employee, heart, mould, character, relate, provide, solution, problem, arise, others]"
2,"[even, plan, stay, experience, term, job, constantly, keep, toe, constantly, seek, employment, elsewhere, afford, sudden, termination, due, performance, ve, get, bill, sort, department, good, place, stay, long, toxic, nature, job, n, t, one, deal, long, mental, emotional, stability, real, thing]"
3,"[great, stay, cw, get, stagnate, single, spot, long, time, create, temptation, leave, get, better, offer]"
4,"[constantly, seek, new, challenge, stay, depend, valuable, company, find, always, seek, improve, work, ethic, place, good, stead, leadership, position]"


## Stemming
Stemming is used to reduce different grammatical forms or word forms of a word like its noun, adjective, verb, adverb etc. to its root form. Computationally, it is a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. It is important for information retrieval systems.

In [23]:
# Using Porter Stemmer implementation in nltk
from nltk.stem import PorterStemmer

def stemming(text):
    porter = PorterStemmer()
    
    result=[]
    for word in text:
        result.append(porter.stem(word))
    return result

In [24]:
df_text['response']=df_text['response'].apply(stemming)
df_text.head()

Unnamed: 0,response
0,"[would, love, stay, cw, realli, love, howev, think, fact, best, job, last, 3, year, without, real, growth, career, wise, bit, discourag, still, thing, sinc, come, despit, mani, recongnit, commend, happi, work]"
1,"[cw, wonder, place, work, salari, attract, secur, neat, conduc, work, environ, manag, employe, heart, mould, charact, relat, provid, solut, problem, aris, other]"
2,"[even, plan, stay, experi, term, job, constantli, keep, toe, constantli, seek, employ, elsewher, afford, sudden, termin, due, perform, ve, get, bill, sort, depart, good, place, stay, long, toxic, natur, job, n, t, one, deal, long, mental, emot, stabil, real, thing]"
3,"[great, stay, cw, get, stagnat, singl, spot, long, time, creat, temptat, leav, get, better, offer]"
4,"[constantli, seek, new, challeng, stay, depend, valuabl, compani, find, alway, seek, improv, work, ethic, place, good, stead, leadership, posit]"


Note: Stemming is rule based and thus we can see the changes like "realli" and "howev" in index 0 which does not make sense. Also, notice all the words are already transformed into lower case. This poses a challenge for proper noun detection because the only significant physical notation - the first letter in upper case - will not more be in the data.