# Introduction to Natural Language Processing


## Wikipedia

wikipedia package: https://towardsdatascience.com/wikipedia-api-for-python-241cfae09f1c

In [None]:
#!pip install wikipedia

In [None]:
import wikipedia

In [None]:
# search results
wikipedia.search('Fake News')

In [None]:
# suggestion
print(wikipedia.suggest('Fake ne'))

In [None]:
# summary
wikipedia.summary('Fake News')

In [None]:
# DisambiguationError
#wikipedia.summary('News')

In [None]:
# set language
wikipedia.set_lang('pt')
wikipedia.summary('Fake News')

In [None]:
# language support
#wikipedia.languages()

In [None]:
# access page
fake = wikipedia.page('Fake News')
print('title')
print(fake.title)

print('url')
print(fake.url)

print('content')
print(fake.content)

print('images')
print(fake.images)

print('links')
print(fake.links)

## Regex

In [None]:
# package regex
import re

In [None]:
content = fake.content

In [None]:
# split words by space
spaces = r'\s+'
split_spaces = re.split(spaces, content)
len(split_spaces)

In [None]:
# numbers
digits = r'\d+'
finded_digits = re.findall(digits, content)
finded_digits

In [None]:
# dots and commas
dots_coma = r'[.,]'
split_dotcoma = re.split(dots_coma, content)

In [None]:
comma_space = r'[,]\s+'
split_comma_space = re.split(comma_space, content)

In [None]:
# find all 'fake news'
re.findall('fake [A-Za-z]*', content)  # use all letters

In [None]:
# re.match: returns a match only if the pattern matches at the beginning of the string
# re.search: scans the whole string and returns the first match found anywhere in it
print(re.match('news', content))
print(re.search('news', content))
print(re.search('politico', content))

In [None]:
# start and end method
news = re.search('news', content)
print(news.start(), news.end())

In [None]:
# search for anything in parentheses
pattern = r'\(.*?\)'
print('re.search:', re.search(pattern, content))
print('re.findall:', re.findall(pattern, content)) 

In [None]:
content[16:77]

In [None]:
# search for anything in quotation marks
pattern_qm = r'"(.*?)"'
print('re.search:', re.search(pattern_qm, content))
print('re.findall:', re.findall(pattern_qm, content)) 

In [None]:
# search for sections between "==" markers
pattern_qm = r'\==(\s.*\s)\=='                    # () defines a group of the pattern, [] defines an explicit set of characters
print('re.search:', re.search(pattern_qm, content))
print('re.findall sections:', re.findall(pattern_qm, content)) 

## Tokenization
 Transform strings and documents into tokens (smaller chunks)

 Library: NLTK - Natural Language Toolkit


In [None]:
import nltk

In [None]:
nltk.download('punkt')

In [None]:
# tokenize sentences
sentences = nltk.tokenize.sent_tokenize(content, 'portuguese')

In [None]:
len(sentences)

In [None]:
# tokenize words
first_tokens = nltk.word_tokenize(sentences[0], 'portuguese')   

In [None]:
len(first_tokens)

In [None]:
# tokenize all text - unique
len(set(nltk.word_tokenize(content)))

## Charts

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# number of word tokens; the raw string keeps `\w` as a regex escape
# instead of an (invalid) string escape sequence
len(nltk.regexp_tokenize(content, r'\w+'))

In [None]:
# length of every distinct word token; the raw string keeps `\w` as a regex
# escape instead of an (invalid) string escape sequence
len_words = [len(w) for w in set(nltk.regexp_tokenize(content, r'\w+'))]

In [None]:
plt.hist(len_words)

# Bag of Words

- basic method for finding topics in a text 

In [None]:
# verifying variables in environment
%whos

In [None]:
# import Counter
from collections import Counter

# split the page text into tokens
tokens = nltk.word_tokenize(content)

# frequency table: token -> number of occurrences
count_tokens = Counter(tokens)

# all elements of the counter, each repeated as many times as its count
# (`elements` is a method, so it must be called to produce them)
all_elements = list(count_tokens.elements())

# unary plus returns a NEW counter without zero and negative counts;
# the result is kept because the original counter is left untouched
positive_counts = +count_tokens

# ten most common tokens
count_tokens.most_common(10)


## Simple preprocessing

- tokenization
- lowercasing 
- lemmatization/stemming
- removing stopwords, punctuation, or unwanted tokens

In [None]:
nltk.download('stopwords')

In [None]:
# module for stopwords
from nltk.corpus import stopwords

# only Portuguese stopwords
pt_stopwords = set(stopwords.words('portuguese'))

In [None]:
# tokenization and lowercasing at same time
tokens_lower = [w for w in nltk.word_tokenize(content.lower()) if w.isalpha()]

In [None]:
# remove stop words
no_stops = [t for t in tokens_lower if t not in pt_stopwords]

In [None]:
# counter
no_stops_counter = Counter(no_stops)
no_stops_counter.most_common(5)

In [None]:
# download wordnet
nltk.download('wordnet')

In [None]:
# lemmatizer instance (WordNet lemmatization, not stemming)
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

# lemmatize every token that survived the stopword filter
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# new counter with words lemmatized
stem_counter = Counter(lemmatized)
stem_counter.most_common(5)

In [None]:
# add 'ser' and 'sobre' to the stopword set so they are filtered out as well
pt_stopwords.update(['ser', 'sobre'])

# again
no_stops = [t for t in tokens_lower if t not in pt_stopwords]

# again update
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# new counter with words lemmatized - update no_stops
stem_counter = Counter(lemmatized)
stem_counter.most_common(5)

# Gensim - word vector or word embedding

Transform words into numbers (0 and 1) so that the distance between words, represented in a multi-dimensional array, can be calculated. This lets us see the relations between words. We use a dictionary to create a corpus of token ids and the frequency of each id.

_"Word vectors are multi-dimensional mathematical representations of words created using deep learning methods. They give us insight into relationships between words in a corpus."_

In order to work on text documents, Gensim requires the words (aka tokens) to be converted to unique ids. To achieve that, Gensim lets you create a Dictionary object that maps each word to a unique id. A corpus is a 'collection of documents as a bag of words'.

reference site: https://www.machinelearningplus.com/nlp/gensim-tutorial/

### Dictionary considering each sentence a document

In [None]:
# create a gensim dictionary
from gensim.corpora.dictionary import Dictionary

# build the 'documents': every sentence of the fake news content becomes
# one document, lowercased and split into tokens
docs = [nltk.word_tokenize(c.lower()) for c in nltk.sent_tokenize(content)]

# drop stopwords and every token that is not purely alphabetic
# (isalpha() filters out punctuation and numbers)
docs_no_stop = [
    [d for d in doc if d not in pt_stopwords and d.isalpha()]
    for doc in docs
]

# lemmatize the remaining tokens of each document
docs_stem = [
    [wordnet_lemmatizer.lemmatize(d) for d in doc]
    for doc in docs_no_stop
]

# creating id for each token
dictionary = Dictionary(docs_stem)

# print five tokens - between 100 and 105
print(list(dictionary.token2id.items())[100:105])

 You can update an existing dictionary with other docs with `dictionary.add_documents(text)`

In [None]:
# create a gensim corpus of docs
import random

corpus_sent_docs = [dictionary.doc2bow(doc) for doc in docs_stem]

# print a random sample of one document in corpus
random.sample(corpus_sent_docs, 1)

In [None]:
# top 5 words
from collections import defaultdict
import itertools

# if not have the key, value is 0
total_word_count = defaultdict(int)

# sum of words
for word_id, word_count in itertools.chain.from_iterable(corpus_sent_docs):
    total_word_count[word_id] += word_count
    
# Create a sorted list from the defaultdict: sorted_word_count
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True) 

# Print the top 5 words across all documents alongside the count
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)

### Dictionary considering whole text content

In [None]:
content_no_stops = ' '.join(no_stops)

In [None]:
# simple_preprocess tokenizes the single text so a dictionary can be built from it
from gensim.utils import simple_preprocess

# workaround: Dictionary takes a collection of tokenized documents, so the
# one text is wrapped in a single-element list
dict_single_text = Dictionary([simple_preprocess(content_no_stops)])

In [None]:
print(random.sample(list(dict_single_text.token2id.items()), 5))

In [None]:
# get specific id
dict_single_text.token2id.get('notícias')

In [None]:
# class for single text corpus

class BoWCorpus:
    """Iterable that converts each text of a collection into a bag-of-words.

    Every pass over the object tokenizes the texts again and encodes them
    with ``dictionary.doc2bow``; nothing is cached between iterations.

    Attributes:
        str: the iterable of texts given to the constructor.
        dictionary: the object whose ``doc2bow`` method builds each
            bag-of-words.
    """

    def __init__(self, str_, dictionary):
        """Keep the texts (``str_``) and the dictionary used to encode them."""
        self.dictionary = dictionary
        self.str = str_

    def __iter__(self):
        """Lazily yield one bag-of-words per item of ``self.str``."""
        for text in self.str:
            # tokenize with deacc=True (accent removal, per the gensim docs)
            words = simple_preprocess(text, deacc=True)

            # allow_update=True lets doc2bow add tokens the dictionary has
            # not seen yet, so iterating may grow the dictionary
            yield self.dictionary.doc2bow(words, allow_update=True)

In [None]:
bow_corpus_singlet = BoWCorpus(list([content_no_stops]), dictionary=dict_single_text)

In [None]:
# print five first 
bow_corpus = [line for line in bow_corpus_singlet][0]
print(bow_corpus[:5])

In [None]:
# order the (id, count) tuples by their second item, highest count first,
# to get the most frequent words
sort_corpus = sorted(bow_corpus, key=lambda pair: pair[1], reverse=True)

# show the five most frequent words together with their counts
for word_id, word_count in sort_corpus[:5]:
    print(f'word: {dict_single_text[word_id]}, count: {word_count}')

# Twitter Analysis


In [None]:
'''
# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer
# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"
# Use the pattern on the first tweet in the tweets list
hashtags = regexp_tokenize(tweets[0], pattern1)
print(hashtags)
'''


In [None]:
'''
# Write a pattern that matches both mentions (@) and hashtags
pattern2 = r"([@#]\w+)"
# Use the pattern on the last tweet in the tweets list
mentions_hashtags = regexp_tokenize(tweets[-1], pattern2)
print(mentions_hashtags)
'''

In [None]:
'''
# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)
'''
