In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import nltk
import ssl
import gensim
import pandas as pd
import math
import pickle
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fagnercandido/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/fagnercandido/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
import dash
from dash.dependencies import Input, Output, State
from dash import html
from dash import dcc
from jupyter_dash import JupyterDash
from dash import Dash, html, dcc
import flask

In [3]:
def get_eligible_files():
    elegible_files = []
    for file in os.listdir('reuters21578'):
        if file.endswith('.sgm'):
            elegible_files.append(file)
    return elegible_files

def get_documents():
    documents = []
    files = get_eligible_files()
    for file in files:
        content = ''
        try:
            content = open(f'reuters21578/{file}', 'r', encoding="utf-8").read()
        except UnicodeDecodeError as e:
            lines = []
            for line in open(f'reuters21578/{file}', 'rb').readlines():
                line = line.decode('utf-8','ignore') #.encode("utf-8")
                lines.append(line)
            content = '\n'.join(lines)
        soup_document = BeautifulSoup(content.lower(), 'html.parser')
        for document in soup_document.findAll('reuters'):
            documents.append(document)
    return documents

def get_documents_by_dictionary_article():
    articles = {}
    for article in get_documents():
        new_id = article.get('newid')
        date_article = find_element_by_name(article, 'date')
        topics = find_element_by_name(article, 'topics')
        places = find_element_by_name(article, 'places')
        people = find_element_by_name(article, 'people')
        title = find_element_by_name(article, 'title')
        dateline = find_element_by_name(article, 'dateline')
        body = find_element_by_name(article, 'body')
        metadados_article = {}
        for item in ['date_article', 'dateline', 'topics', 'places', 'people', 'title', 'body']:
            metadados_article[item] = eval(item)
        articles[new_id] = metadados_article
    return articles

def find_element_by_name(article, tag):
    element = article.find_all(tag)
    if element:
        return next((x.text for x in element))
    else:
        return ''
    
def preprocess_tokenize_and_lemmatize_and_add_frequency_matrix():
    articles = get_documents_by_dictionary_article()
    for item, value in articles.items():
        articles[item]['body_lemmatized'] = remove_stop_word_and_lemmatize(value['body'])
        articles[item]['topics_lemmatized'] = remove_stop_word_and_lemmatize(value['topics'], True) 
        articles[item]['body_frequency'] = create_frequency_matrix(articles[item]['body_lemmatized'])
        articles[item]['body_tfidf'] = create_tf_idf(value['body'])
    return articles

def create_tf_idf(value):
    if value:
        cv = CountVectorizer()
        word_count_vector = cv.fit_transform(list(filter(None, value.split('\n'))))
        
        tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
        tfidf_transformer.fit(word_count_vector)
        
        count_vector = cv.transform(list(filter(None, value.split('\n')))) 
        tf_idf_vector = tfidf_transformer.transform(count_vector)
        
        feature_names = cv.get_feature_names()
        first_document_vector=tf_idf_vector[0]
        dataframe = pd.DataFrame(first_document_vector.T.todense(), index = feature_names, columns=["tfidf"])
        dataframe = dataframe.sort_values(by = ["tfidf"], ascending=False)
        
        dataframe.dropna(inplace = True)
        dataframe_to_dict = dataframe.to_dict()
        return list(dataframe_to_dict.values())[0]
    else:
        return []

def remove_stop_word_and_lemmatize(value, is_topic = False):
    result = []
    if value:
        for token in gensim.utils.simple_preprocess(value):
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                result.append(nltk.stem.WordNetLemmatizer().lemmatize(token, pos='v'))
    return set(result) if is_topic else result

def create_frequency_matrix(value):
    frequency_table = {}
    for word in value:
        if word in frequency_table:
            frequency_table[word] += 1
        else:
            frequency_table[word] = 1
    return frequency_table

def search(query_query, articles):
    articles_selected = {}
    query_string_lemmatized = remove_stop_word_and_lemmatize(query_string)
    for token in query_string_lemmatized:
        for key, value in articles.items():
            list_of_values = list(value['body_frequency'].keys())
            if token in list_of_values and list_of_values.index(token) > 0.0:
                if key in articles_selected:
                    articles_selected[key]['relative_weight']  = articles_selected[key]['relative_weight'] + list_of_values.index(token)
                else:
                    articles_selected[key] = {'article': value, 'relative_weight': list_of_values.index(token)}
    prepare_visualization(sorted(articles_selected.items(), key=lambda x: x[1]['relative_weight'], reverse=True))

def prepare_visualization(articles):
    print(f'\t\tMy Favorite Engine')
    print()
    for item in articles:
        value = item[1]
        print(f'{value["article"]["title"]} - {value["article"]["date_article"]}')
        print(f'{value["article"]["body"][:50]}...')
        print()

In [4]:
articles = preprocess_tokenize_and_lemmatize_and_add_frequency_matrix()

In [5]:
query_string = 'music'

In [6]:
search(query_string, articles)

		My Favorite Engine

digital audio tape players go on sale in japan -  2-mar-1987 07:20:13.63
japanese consumers hesitated about buying
the cont...

carolco <crc> may bid for lieberman <lman.o> -  2-jun-1987 08:32:00.28
lieberman enterprises inc said

carolco pictures i...

international <icc.o> sets new sound technology -  2-jun-1987 14:28:36.79
international cablecasting technologies
inc, said ...

electrosound <esg> says official has resigned -  2-jun-1987 13:51:03.09
electrosound group inc said
richard meixner has re...

hawkeye entertainment <sbiz> completes offering - 26-mar-1987 17:01:22.82
hawkeye entertainment inc said it
completed its in...

chadian troops reportedly recapture faya - 27-mar-1987 13:14:19.81v rm y
chadian troops have recaptured the
northern oasis ...

musikahn corp files for reorganization - 12-mar-1987 16:03:01.84
<musikahn corp> said it filed a
reorganization pla...

sony plans video and tape production in thailand - 17-apr-1987 11:47:35.61
<sony corp> o

In [7]:
#DASH

In [None]:
server = flask.Flask('app')

app = dash.Dash('app', server = server)
app.css.config.serve_locally = True
app.scripts.config.serve_locally = True

app.layout = html.Main([
     html.Link(
        rel='stylesheet',
        href='/static/index.css'
    ),
    html.Center([
        html.Img(src="/static/foogle.png", width="20%", height="5%", id="googleimg"),
        html.Div([
            html.Div([
                html.Span([
                    dcc.Input(name="search", type="text", placeholder="O que gostava de saber?"),
                ], id="inputspan"),
            ], id="maindiv")
        ]),
        html.Section([
            html.Div([
                html.Button('Pesquisa Foogle', id='search-valu<e', n_clicks=0),
            ]),
            html.Div([
                html.Button('Estou com sorte', id='lucky-value', n_clicks=0),
            ]),
        ]),
    ]),   
])

@app.server.route('/static/<path:path>')
def static_file(path):
    static_folder = os.path.join(os.getcwd(), 'static')
    return send_from_directory(static_folder, path)


if __name__ == '__main__':
    app.run_server()

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "app" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [22/May/2022 23:26:06] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/May/2022 23:26:06] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [22/May/2022 23:26:06] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [22/May/2022 23:26:06] "GET /static/foogle.png HTTP/1.1" 200 -
