In [1]:
import pandas as pd
import string
df = pd.read_csv('Twitter Sentiments.csv')
# drop the columns
df = df.drop(columns=['id', 'label'], axis=1)
df.head()

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty
3,#model i love u take with u all the time in ...
4,factsguide: society now #motivation


In [2]:
# convert the text to lowercase
df['clean_text'] = df['tweet'].str.lower()
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,factsguide: society now #motivation,factsguide: society now #motivation


##### Remove Punctuations

In [3]:
# remove punctuations
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))

In [4]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuations(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so...
1,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in u...
4,factsguide: society now #motivation,factsguide society now motivation


##### Remove Stopwords

In [5]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [6]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [7]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time urð± ðððð...
4,factsguide: society now #motivation,factsguide society motivation


In [8]:
from collections import Counter
word_count = Counter()
for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1
        
word_count.most_common(10)

[('user', 17473),
 ('love', 2647),
 ('day', 2198),
 ('happy', 1663),
 ('amp', 1582),
 ('im', 1139),
 ('u', 1136),
 ('time', 1110),
 ('life', 1086),
 ('like', 1042)]

##### Remove Frequent Words

In [9]:
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(3))
def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

In [10]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_freq_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


##### Remove Rare Words

In [11]:
RARE_WORDS = set(word for (word, wc) in word_count.most_common()[:-10:-1])
RARE_WORDS

{'airwaves',
 'carnt',
 'chisolm',
 'ibizabringitonmallorcaholidayssummer',
 'isz',
 'mantle',
 'shirley',
 'youuuð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dâ\x9d¤ï¸\x8f',
 'ð\x9f\x99\x8fð\x9f\x8f¼ð\x9f\x8d¹ð\x9f\x98\x8eð\x9f\x8eµ'}

In [12]:
def remove_rare_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])

In [13]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


##### Remove Special Character

In [14]:
import re
def remove_spl_chars(text):
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [15]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


##### Stemming

In [16]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [17]:
df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...
2,bihday your majesty,bihday majesty,bihday majesti
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv


##### Lemmatuzation & POS Tagging

In [18]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    # find pos tags
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

In [19]:
wordnet.ADV

'r'

In [20]:
df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run,father dysfunctional selfish drag kid dysfunct...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty,bihday majesti,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv,factsguide society motivation


In [21]:
df.sample(frac=1).head(10) # Đảo ngẫu nhiên dữ liệu

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
24777,good morning ð¼ #goodmorning #bomdia #break...,good morning goodmorning bomdia breakfast morn...,good morn goodmorn bomdia breakfast morn coffe...,good morning goodmorning bomdia breakfast morn...
25960,a really tweet,really tweet,realli tweet,really tweet
16538,chile. we knew this would happen. #tonyawards ...,chile knew would happen tonyawards hamilton drink,chile knew would happen tonyaward hamilton drink,chile knew would happen tonyawards hamilton drink
10068,#teachers did a silly dance so my students w...,teachers silly dance students wanted dance pha...,teacher silli danc student want danc pharrel,teacher silly dance student want dance pharrell
7469,looking forward to read this book... ððð...,looking forward read book hecroro weddingvows ...,look forward read book hecroro weddingvow happi,look forward read book hecroro weddingvows hap...
13180,hey @user thanks for delivering those two laug...,hey thanks delivering two laughs coincidentall...,hey thank deliv two laugh coincident receiv tw...,hey thanks deliver two laugh coincidentally re...
10262,#ifidontgetmyfirstchoice i'll #huff and i'll #...,ifidontgetmyfirstchoice ill huff ill puff ill ...,ifidontgetmyfirstchoic ill huff ill puff ill t...,ifidontgetmyfirstchoice ill huff ill puff ill ...
4477,#trump hooked the healand by validating their ...,trump hooked healand validating adultery inces...,trump hook healand valid adulteri incest pedop...,trump hook healand validate adultery incest pe...
1573,# users and growing fast! 1 business - 3 inc...,users growing fast 1 business 3 income streams...,user grow fast 1 busi 3 incom stream 5 minut p...,user grow fast 1 business 3 income stream 5 mi...
8446,came home to agent hazel in my apament. though...,came home agent hazel apament thought some1 br...,came home agent hazel apament thought some1 br...,come home agent hazel apament think some1 brea...


##### Remove URLs

In [22]:
text = "https://www.cuongpham.net is the URL of the Cuong's Web"

In [23]:
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text) 

remove_url(text)

" is the URL of the Cuong's Web"

##### Remove HTML Tags

In [24]:
text = "<html><body> <h1>Cuong Pham</h1> <p>This is NLP text preprocessing tutorial</p> </body></html>"

In [25]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text) 

remove_html_tags(text)

' Cuong Pham This is NLP text preprocessing tutorial '