# [Text pre processing](https://github.com/kokchun/Deep-learning-AI21/blob/main/Lectures/Lec7-Text_preprocessing.ipynb)

In [1]:
import pyjokes
import nltk

In [6]:
jokes = pyjokes.get_jokes()
print(len(jokes))
raw_text = f"{jokes[1]}\n{jokes[10]}\n{jokes[5]}"
print(raw_text)

97
Ubuntu users are apt to get this joke.
'Knock, knock.' 'Who's there?' ... very long pause ... 'Java.'
An SQL query goes into a bar, walks up to two tables and asks, 'Can I join you?'


In [9]:
text = raw_text.lower()
print(text)

ubuntu users are apt to get this joke.
'knock, knock.' 'who's there?' ... very long pause ... 'java.'
an sql query goes into a bar, walks up to two tables and asks, 'can i join you?'


## Tokenize

- Sentence tokenization
- Word tokenization
- Character tokenization

In [12]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

sentence_tokens = sent_tokenize(text)
print(sentence_tokens)

['ubuntu users are apt to get this joke.', "'knock, knock.'", "'who's there?'", '... very long pause ...', "'java.'", "an sql query goes into a bar, walks up to two tables and asks, 'can i join you?'"]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/creativezone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
from nltk.tokenize import word_tokenize

word_tokens = word_tokenize(text)
print(word_tokens)

['ubuntu', 'users', 'are', 'apt', 'to', 'get', 'this', 'joke', '.', "'knock", ',', 'knock', '.', "'", "'who", "'s", 'there', '?', "'", '...', 'very', 'long', 'pause', '...', "'java", '.', "'", 'an', 'sql', 'query', 'goes', 'into', 'a', 'bar', ',', 'walks', 'up', 'to', 'two', 'tables', 'and', 'asks', ',', "'can", 'i', 'join', 'you', '?', "'"]


In [17]:
words_in_sentence_tokens = [word_tokenize(sentence) for sentence in sent_tokenize(text)]
print(words_in_sentence_tokens)

[['ubuntu', 'users', 'are', 'apt', 'to', 'get', 'this', 'joke', '.'], ["'knock", ',', 'knock', '.', "'"], ["'who", "'s", 'there', '?', "'"], ['...', 'very', 'long', 'pause', '...'], ["'java", '.', "'"], ['an', 'sql', 'query', 'goes', 'into', 'a', 'bar', ',', 'walks', 'up', 'to', 'two', 'tables', 'and', 'asks', ',', "'can", 'i', 'join', 'you', '?', "'"]]


## Remove noise

- Digits
- Stop words
- Punctuations

In [26]:
import string

punctuations = string.punctuation + '...'
print(punctuations)
tokens_no_punctuations = [token for token in word_tokens if not token in punctuations]
print(tokens_no_punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~...
['ubuntu', 'users', 'are', 'apt', 'to', 'get', 'this', 'joke', "'knock", 'knock', "'who", "'s", 'there', 'very', 'long', 'pause', "'java", 'an', 'sql', 'query', 'goes', 'into', 'a', 'bar', 'walks', 'up', 'to', 'two', 'tables', 'and', 'asks', "'can", 'i', 'join', 'you']


In [24]:
from nltk.corpus import stopwords

nltk.download('stopwords')

print(stopwords.words('swedish'))

['och', 'det', 'att', 'i', 'en', 'jag', 'hon', 'som', 'han', 'på', 'den', 'med', 'var', 'sig', 'för', 'så', 'till', 'är', 'men', 'ett', 'om', 'hade', 'de', 'av', 'icke', 'mig', 'du', 'henne', 'då', 'sin', 'nu', 'har', 'inte', 'hans', 'honom', 'skulle', 'hennes', 'där', 'min', 'man', 'ej', 'vid', 'kunde', 'något', 'från', 'ut', 'när', 'efter', 'upp', 'vi', 'dem', 'vara', 'vad', 'över', 'än', 'dig', 'kan', 'sina', 'här', 'ha', 'mot', 'alla', 'under', 'någon', 'eller', 'allt', 'mycket', 'sedan', 'ju', 'denna', 'själv', 'detta', 'åt', 'utan', 'varit', 'hur', 'ingen', 'mitt', 'ni', 'bli', 'blev', 'oss', 'din', 'dessa', 'några', 'deras', 'blir', 'mina', 'samma', 'vilken', 'er', 'sådan', 'vår', 'blivit', 'dess', 'inom', 'mellan', 'sådant', 'varför', 'varje', 'vilka', 'ditt', 'vem', 'vilket', 'sitta', 'sådana', 'vart', 'dina', 'vars', 'vårt', 'våra', 'ert', 'era', 'vilkas']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/creativezone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Stemming

Convert words into root word

In [27]:
from nltk import SnowballStemmer, LancasterStemmer

snowball_stemmer = SnowballStemmer('english')
lancaster_stemmer = LancasterStemmer()

snowball_tokens = [snowball_stemmer.stem(token) for token in tokens_no_punctuations]
lancaster_tokens = [lancaster_stemmer.stem(token) for token in tokens_no_punctuations]

print(f'Original {tokens_no_punctuations}\n')
print(f'Snowball {snowball_tokens}\n')
print(f'Lancaster {lancaster_tokens}')

Original ['ubuntu', 'users', 'are', 'apt', 'to', 'get', 'this', 'joke', "'knock", 'knock', "'who", "'s", 'there', 'very', 'long', 'pause', "'java", 'an', 'sql', 'query', 'goes', 'into', 'a', 'bar', 'walks', 'up', 'to', 'two', 'tables', 'and', 'asks', "'can", 'i', 'join', 'you']

Snowball ['ubuntu', 'user', 'are', 'apt', 'to', 'get', 'this', 'joke', 'knock', 'knock', 'who', "'s", 'there', 'veri', 'long', 'paus', 'java', 'an', 'sql', 'queri', 'goe', 'into', 'a', 'bar', 'walk', 'up', 'to', 'two', 'tabl', 'and', 'ask', 'can', 'i', 'join', 'you']

Lancaster ['ubuntu', 'us', 'ar', 'apt', 'to', 'get', 'thi', 'jok', "'knock", 'knock', "'who", "'s", 'ther', 'very', 'long', 'paus', "'java", 'an', 'sql', 'query', 'goe', 'into', 'a', 'bar', 'walk', 'up', 'to', 'two', 'tabl', 'and', 'ask', "'can", 'i', 'join', 'you']


## Lemmatiazation

Remove inflection

In [37]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet2021

nltk.download('wordnet21')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemma = WordNetLemmatizer()
lemma.lemmatize('thicker', wordnet2021.ADJ)

[nltk_data] Error loading wordnet21: Package 'wordnet21' not found in
[nltk_data]     index
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/creativezone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/creativezone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


'thick'