In [None]:
!pip install pyngrok
!pip install flask



In [None]:
!pip install transformers[sentencepiece] datasets spacy scipy networkx numpy sent2vec pyngrok pandas nltk pdfplumber



In [None]:
# import library yang dibutuhkan
import torch
import spacy
import networkx as nx
import numpy as np
import pdfplumber

from scipy import spatial
from sent2vec.vectorizer import Vectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import TFPegasusForConditionalGeneration, PegasusTokenizerFast
from transformers import BertTokenizer, BertModel

import re
import unicodedata

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# 1. FUNGSI PRE-PROCESSING
def preprocess_en(text):
  """Split English text into sentences and normalise each sentence.

  The text is segmented with the spaCy "en_core_web_sm" pipeline. Every
  sentence is then parsed again on its own; stop words and punctuation are
  dropped and each remaining token is replaced by its lemma.

  Args:
    text: Raw English text.

  Returns:
    A list with one string per detected sentence (document order), each
    string being the kept lemmas joined by single spaces.
  """
  pipeline = spacy.load("en_core_web_sm")

  # Sentence segmentation over the whole document.
  raw_sentences = [span.text.strip() for span in pipeline(text).sents]

  def _clean(sentence):
    # Keep the lemma of every token that is neither a stop word nor punctuation.
    lemmas = [
        tok.lemma_
        for tok in pipeline(sentence)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

  return [_clean(sentence) for sentence in raw_sentences]

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder once at module
# level; vector_en reads both as globals to embed English sentences.
tokenizer_bert_en = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert_en = BertModel.from_pretrained('bert-base-uncased')

In [None]:
def vector_en(list_sentences):
  """Embed every sentence with the module-level English BERT model.

  Args:
    list_sentences: Iterable of sentence strings.

  Returns:
    A list of 1-D numpy arrays, one per sentence: the last hidden state at
    position 0 (the special token added in front of each encoded sentence).
  """

  def _embed(sentence):
    token_ids = tokenizer_bert_en.encode(
        sentence, add_special_tokens=True, padding=True, truncation=True
    )
    # The model expects a batch dimension, so wrap the single sequence.
    batch = torch.tensor(token_ids).unsqueeze(0)

    # Inference only: no gradients are needed.
    with torch.no_grad():
      hidden_states = model_bert_en(batch).last_hidden_state
      first_token = hidden_states[:, 0, :][0]

    return first_token.numpy()

  return [_embed(sentence) for sentence in list_sentences]

In [None]:
# 2. FUNGSI PERINGKASAN EKSTRAKTIF
def extractive_sum_en(filtered_sentences, sentences):
  """Select the most central sentences with PageRank over a similarity graph.

  Args:
    filtered_sentences: Pre-processed sentences; only used to compute the
      sentence embeddings (via vector_en).
    sentences: Original sentences, index-aligned with filtered_sentences.

  Returns:
    A list holding round((len(sentences) + 1) / 2) of the original sentences,
    ordered from the highest to the lowest PageRank score. An empty list is
    returned when there is nothing to rank.
  """
  # Turn every sentence into a vector.
  vectors = vector_en(filtered_sentences)
  total = len(vectors)
  if total == 0:
    # Nothing to rank; building a graph from an empty matrix would fail.
    return []

  # Build the symmetric similarity matrix. The diagonal stays 0 so the graph
  # has no self-loops.
  similarity_matrix = np.zeros((total, total))
  for i in range(total):
    for j in range(i + 1, total):
      # spatial.distance.cosine returns the cosine *distance*
      # (1 - similarity). Using it directly as an edge weight would make
      # PageRank reward the most dissimilar sentences, so convert it back.
      similarity = 1.0 - spatial.distance.cosine(vectors[i], vectors[j])
      # PageRank expects non-negative edge weights.
      weight = max(similarity, 0.0)
      similarity_matrix[i, j] = weight
      similarity_matrix[j, i] = weight

  # Convert the matrix into a weighted graph and rank its nodes.
  graph = nx.from_numpy_array(similarity_matrix)
  scores = nx.pagerank(graph)

  # Keep roughly the top half of the sentences (highest score first).
  num_sentences = round((len(sentences) + 1) / 2)
  ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
  return [sentences[i] for i in ranked_indices[:num_sentences]]


In [None]:
# 3. FUNGSI PERINGKASAN ABSTRAKTIF

# Load the "google/pegasus-cnn_dailymail" checkpoint and its tokenizer once at
# module level; abstractive_sum_en and chunks_en read them as globals.
model_name1 = "google/pegasus-cnn_dailymail"
model1 = PegasusForConditionalGeneration.from_pretrained(model_name1)
tokenizer1 = PegasusTokenizer.from_pretrained(model_name1)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def abstractive_sum_en(chunks):
  """Summarise each text chunk with the English Pegasus model.

  Args:
    chunks: Iterable of text chunks (strings).

  Returns:
    The decoded summaries of all chunks, joined by single spaces.
  """
  generation_options = dict(
      max_length=300,
      min_length=100,
      length_penalty=1.5,
      num_beams=5,
      early_stopping=True,
  )

  # Tokenise every chunk up front; inputs longer than 1024 tokens are truncated.
  encoded_chunks = [
      tokenizer1(piece, return_tensors="pt", max_length=1024, truncation=True)
      for piece in chunks
  ]

  pieces = []
  for encoded in encoded_chunks:
    generated = model1.generate(**encoded, **generation_options)
    # Each chunk is encoded on its own, so `generated` unpacks to one sequence.
    pieces.append(tokenizer1.decode(*generated, skip_special_tokens=True))

  return " ".join(pieces)

In [None]:
# 4. FUNGSI CHUNKS
def chunks_en(sentences, tokenizer=None):
  """Greedily pack sentences into chunks that fit the summarisation model.

  Sentences are appended to the current chunk until adding the next one would
  exceed the tokenizer's `max_len_single_sentence`; the chunk is then closed
  and a new one is started with that sentence.

  Compared with the previous implementation this version
    * always emits the final chunk (it used to be dropped when the last
      sentence overflowed the chunk before it), and
    * never emits an empty chunk when the very first sentence is already
      longer than the limit.

  Args:
    sentences: Iterable of sentence strings, in the order they should appear.
    tokenizer: Optional object exposing `tokenize(str)` and
      `max_len_single_sentence`. Defaults to the module-level `tokenizer1`.

  Returns:
    A list of chunk strings; each chunk is its sentences joined by a space.
    A single sentence longer than the limit becomes a chunk of its own.
  """
  if tokenizer is None:
    tokenizer = tokenizer1
  limit = tokenizer.max_len_single_sentence

  chunks = []
  current = []        # sentences of the chunk being built
  current_length = 0  # token count of `current`
  for sentence in sentences:
    n_tokens = len(tokenizer.tokenize(sentence))

    # Close the running chunk only if it holds something; otherwise an
    # over-long first sentence would produce an empty chunk.
    if current and current_length + n_tokens > limit:
      chunks.append(" ".join(current).strip())
      current = []
      current_length = 0

    current.append(sentence)
    current_length += n_tokens

  # Flush whatever is left, regardless of which branch handled the last sentence.
  if current:
    chunks.append(" ".join(current).strip())
  return chunks

In [None]:
# MERINGKAS TEKS BAHASA INGGRIS
def summy_en(text):
  """Run the full English summarisation pipeline on `text`.

  Steps: sentence segmentation, pre-processing, extractive selection,
  chunking, abstractive summarisation, then removal of leftover model markers.

  Args:
    text: Raw English text.

  Returns:
    The final summary string.
  """
  # Original (unprocessed) sentences, needed to map ranks back to readable text.
  parser = spacy.load("en_core_web_sm")
  original_sentences = [span.text.strip() for span in parser(text).sents]

  # Pre-processing followed by extractive selection.
  cleaned_sentences = preprocess_en(text)
  selected = extractive_sum_en(cleaned_sentences, original_sentences)

  # Chunk the selected sentences and summarise them abstractively.
  abstract = abstractive_sum_en(chunks_en(selected))

  # Strip leftover special markers; "<n>" after a period becomes a line break.
  for marker, replacement in (("<pad>", ""), ("</s>", ""), (".<n>", ".\n")):
    abstract = abstract.replace(marker, replacement)

  return abstract


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pdfplumber

# Read every page of the PDF and keep the text of ALL pages.
# Previously `text` was overwritten on each iteration, so only the last page
# was passed on to the summariser.
page_texts = []
with pdfplumber.open('/content/gdrive/MyDrive/NLP/f_zero/artikel/Deep learning.pdf') as pdf:
    for page in pdf.pages:
        # Guard against a page that yields no extractable text.
        page_text = page.extract_text() or ""
        page_texts.append(page_text)

        # Show the text of the page.
        print(page_text)

# Full document text, pages separated by a line break.
text = "\n".join(page_texts)

Deep learning is a subset of machine learning, which is essentially a neural network with
three or more layers. These neural networks attempt to simulate the behavior of the
human brain—albeit far from matching its ability—allowing it to “learn” from large
amounts of data. While a neural network with a single layer can still make approximate
predictions, additional hidden layers can help to optimize and refine for accuracy.
Deep learning drives many artificial intelligence (AI) applications and services that
improve automation, performing analytical and physical tasks without human
intervention. Deep learning technology lies behind everyday products and services
(such as digital assistants, voice-enabled TV remotes, and credit card fraud detection)
as well as emerging technologies (such as self-driving cars).
If deep learning is a subset of machine learning, how do they differ? Deep learning
distinguishes itself from classical machine learning by the type of data that it works with
and

In [None]:
# Summarise the text held in `text` with the English pipeline and print it.
result_en = summy_en(text)
print(result_en)

Financial services use predictive analytics to drive algorithmic trading of stocks, assess business risks for loan approvals, detect fraud, and help manage credit and investment portfolios for clients.
Healthcare has benefited greatly from deep learning capabilities ever since the digitization of hospital records and images.
High performance graphical processing units (GPUs) are ideal because they can handle a large volume of calculations in multiple cores with copious memory available.
Management of multiple GPU on-premises can create a large demand on internal resources and be incredibly costly to scale.


PERINGKASAN BAHASA INDONESIA

In [None]:
#1. PRE-PROCESSING
def preprocess_id(text):
  """Split Indonesian text into sentences and normalise each sentence.

  Each sentence is lower-cased and tokenised; tokens that are not purely
  alphanumeric or that are Indonesian stop words are dropped, and the
  remaining tokens are passed through the WordNet lemmatizer.

  Args:
    text: Raw text.

  Returns:
    A list with one cleaned, space-joined string per detected sentence.
  """
  # Built once up front: both objects are loop-invariant and were previously
  # re-created for every sentence.
  stop_words = set(stopwords.words("indonesian"))
  lemmatizer = WordNetLemmatizer()

  filtered_sentences = []
  # Segmentation: one entry per sentence.
  for sentence in sent_tokenize(text):
    # Tokenisation on the lower-cased sentence.
    words = word_tokenize(sentence.lower())

    # Drop punctuation-like tokens and stop words.
    kept_words = [
        word for word in words
        if word.isalnum() and word not in stop_words
    ]

    # NOTE(review): WordNetLemmatizer is presumably English-oriented, so most
    # Indonesian words likely pass through unchanged - confirm whether an
    # Indonesian stemmer was intended here.
    lemmatized_words = [lemmatizer.lemmatize(word) for word in kept_words]

    filtered_sentences.append(" ".join(lemmatized_words))
  return filtered_sentences

In [None]:
# Load the pretrained 'bert-base-multilingual-cased' tokenizer and encoder once
# at module level; vector_id reads both as globals to embed sentences.
tokenizer_bert_id = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model_bert_id = BertModel.from_pretrained('bert-base-multilingual-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
def vector_id(list_sentences):
  """Embed every sentence with the module-level multilingual BERT model.

  Args:
    list_sentences: Iterable of sentence strings.

  Returns:
    A list of 1-D numpy arrays, one per sentence: the last hidden state at
    position 0 (the special token added in front of each encoded sentence).
  """
  embeddings = []
  for sentence in list_sentences:
    token_ids = tokenizer_bert_id.encode(
        sentence, add_special_tokens=True, padding=True, truncation=True
    )
    # Add the batch dimension expected by the model.
    batch = torch.tensor(token_ids).unsqueeze(0)

    # Inference only: no gradients are needed.
    with torch.no_grad():
      model_output = model_bert_id(batch)
      first_token = model_output.last_hidden_state[:, 0, :][0]

    embeddings.append(first_token.numpy())

  return embeddings

In [None]:
# 2. PERINGKASAN EKSTRAKTIF

def extractive_sum_id(filtered_sentences, sentences):
  """Select the most central sentences with PageRank over a similarity graph.

  Args:
    filtered_sentences: Pre-processed sentences; only used to compute the
      sentence embeddings (via vector_id).
    sentences: Original sentences, index-aligned with filtered_sentences.

  Returns:
    A list holding round((len(sentences) + 1) / 2) of the original sentences,
    ordered from the highest to the lowest PageRank score. An empty list is
    returned when there is nothing to rank.
  """
  # Turn every sentence into a vector.
  vectors = vector_id(filtered_sentences)
  total = len(vectors)
  if total == 0:
    # Nothing to rank; building a graph from an empty matrix would fail.
    return []

  # Build the symmetric similarity matrix. The diagonal stays 0 so the graph
  # has no self-loops.
  similarity_matrix = np.zeros((total, total))
  for i in range(total):
    for j in range(i + 1, total):
      # spatial.distance.cosine returns the cosine *distance*
      # (1 - similarity). Using it directly as an edge weight would make
      # PageRank reward the most dissimilar sentences, so convert it back.
      similarity = 1.0 - spatial.distance.cosine(vectors[i], vectors[j])
      # PageRank expects non-negative edge weights.
      weight = max(similarity, 0.0)
      similarity_matrix[i, j] = weight
      similarity_matrix[j, i] = weight

  # Convert the matrix into a weighted graph and rank its nodes.
  graph = nx.from_numpy_array(similarity_matrix)
  scores = nx.pagerank(graph)

  # Keep roughly the top half of the sentences (highest score first).
  num_sentences = round((len(sentences) + 1) / 2)
  ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
  return [sentences[i] for i in ranked_indices[:num_sentences]]

In [None]:
# 3. PERINGKASAN ABSTRAKTIF

# membersihkan teks
def text_cleaning(input_string):
    """Normalise raw text before it is handed to the Indonesian summariser.

    The steps run in this order: lower-casing, removal of URL/domain-like
    tokens, removal of bullet lines and "baca juga:" lines, stripping of
    non-ASCII characters (after NFKD decomposition), emptying of bracketed
    spans, replacement of remaining punctuation by spaces, removal of dots
    adjacent to digits, whitespace collapsing, and finally dropping every
    dot-separated segment of 10 characters or fewer.

    Args:
        input_string: Raw text.

    Returns:
        The cleaned text; an empty string if no segment survives.
    """
    lowercase = input_string.lower()
    # Remove links and decode the HTML ampersand entity.
    remove_link = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)', '', lowercase).replace("&amp;", "&")
    # Drop whole lines that contain a bullet or a "baca juga:" teaser.
    remove_bullet = "\n".join([T for T in remove_link.split('\n') if '•' not in T and "baca juga:" not in T])
    # Decompose accented characters and discard anything outside ASCII.
    remove_accented = unicodedata.normalize('NFKD', remove_bullet).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Empty the content between "(" / "|" delimiters, keeping the delimiters.
    # Raw strings: "\(" , "\|" and "\g" are invalid escapes in a normal string
    # literal and trigger a SyntaxWarning on recent Python versions.
    remove_parentheses = re.sub(r"([\(\|]).*?([\)\|])", r"\g<1>\g<2>", remove_accented)
    # Replace every run of punctuation (except ".") by a single space.
    remove_punc = re.sub(r"[^\w\d.\s]+", ' ', remove_parentheses)
    # Remove dots that touch a digit (or follow "#") so they are not treated
    # as sentence boundaries by the split below.
    remove_num_dot = re.sub(r"(?<=\d)\.|\.(?=\d)|(?<=#)\.", "", remove_punc)
    # Collapse whitespace runs to one space.
    remove_extra_whitespace = re.sub(r'^\s*|\s\s*', ' ', remove_num_dot).strip()
    # Keep only dot-separated segments longer than 10 characters.
    kept_segments = [s for s in remove_extra_whitespace.strip().split('.') if len(s.strip()) > 10]
    return ".".join(kept_segments).replace("_", "")

In [None]:
# Load the "thonyyy/pegasus_indonesian_base-finetune" checkpoint (TensorFlow
# model class) and its fast tokenizer once at module level; abstractive_sum_id
# and chunks_id read them as globals.
model_name2 = "thonyyy/pegasus_indonesian_base-finetune"
model2 = TFPegasusForConditionalGeneration.from_pretrained(model_name2)
tokenizer2 = PegasusTokenizerFast.from_pretrained(model_name2)

Downloading (…)lve/main/config.json:   0%|          | 0.00/764 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/525M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

All the layers of TFPegasusForConditionalGeneration were initialized from the model checkpoint at thonyyy/pegasus_indonesian_base-finetune.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFPegasusForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/822k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.43M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def abstractive_sum_id(chunks):
  """Summarise each text chunk with the Indonesian Pegasus model.

  Every chunk is first normalised with text_cleaning, then tokenised and
  summarised on its own.

  Args:
    chunks: Iterable of text chunks (strings).

  Returns:
    The decoded summaries of all chunks, joined by single spaces.
  """
  generation_options = dict(
      max_new_tokens=500,
      min_new_tokens=300,
      length_penalty=2.0,
      num_beams=5,
      early_stopping=True,
  )

  # Clean all chunks, then tokenise them as TensorFlow tensors.
  cleaned_chunks = [text_cleaning(piece) for piece in chunks]
  encoded_chunks = [
      tokenizer2(piece, return_tensors="tf", truncation=True)
      for piece in cleaned_chunks
  ]

  pieces = []
  for encoded in encoded_chunks:
    generated = model2.generate(**encoded, **generation_options)
    decoded_batch = tokenizer2.batch_decode(generated, skip_special_tokens=True)
    # Only the first decoded sequence of the batch is kept.
    pieces.append(decoded_batch[0])

  return " ".join(pieces)

In [None]:
# 4. FUNGSI CHUNKS
def chunks_id(sentences, tokenizer=None):
  """Greedily pack sentences into chunks that fit the summarisation model.

  Sentences are appended to the current chunk until adding the next one would
  exceed the tokenizer's `max_len_single_sentence`; the chunk is then closed
  and a new one is started with that sentence.

  Compared with the previous implementation this version
    * always emits the final chunk (it used to be dropped when the last
      sentence overflowed the chunk before it), and
    * never emits an empty chunk when the very first sentence is already
      longer than the limit.

  Args:
    sentences: Iterable of sentence strings, in the order they should appear.
    tokenizer: Optional object exposing `tokenize(str)` and
      `max_len_single_sentence`. Defaults to the module-level `tokenizer2`.

  Returns:
    A list of chunk strings; each chunk is its sentences joined by a space.
    A single sentence longer than the limit becomes a chunk of its own.
  """
  if tokenizer is None:
    tokenizer = tokenizer2
  limit = tokenizer.max_len_single_sentence

  chunks = []
  current = []        # sentences of the chunk being built
  current_length = 0  # token count of `current`
  for sentence in sentences:
    n_tokens = len(tokenizer.tokenize(sentence))

    # Close the running chunk only if it holds something; otherwise an
    # over-long first sentence would produce an empty chunk.
    if current and current_length + n_tokens > limit:
      chunks.append(" ".join(current).strip())
      current = []
      current_length = 0

    current.append(sentence)
    current_length += n_tokens

  # Flush whatever is left, regardless of which branch handled the last sentence.
  if current:
    chunks.append(" ".join(current).strip())
  return chunks

In [None]:
# MERINGKAS TEKS BAHASA
def summy_id(text):
  """Run the full Indonesian summarisation pipeline on `text`.

  Steps: sentence segmentation, pre-processing, extractive selection,
  chunking, abstractive summarisation, then removal of leftover model markers.

  Args:
    text: Raw text.

  Returns:
    The final summary string.
  """
  # Original (unprocessed) sentences, needed to map ranks back to readable text.
  original_sentences = sent_tokenize(text)

  # 1. Pre-processing, 2. extractive selection.
  cleaned_sentences = preprocess_id(text)
  selected = extractive_sum_id(cleaned_sentences, original_sentences)

  # 3. Chunking, 4. abstractive summarisation.
  abstract = abstractive_sum_id(chunks_id(selected))

  # Strip leftover special markers; "<n>" after a period becomes a line break.
  for marker, replacement in (("<pad>", ""), ("</s>", ""), (".<n>", ".\n")):
    abstract = abstract.replace(marker, replacement)

  return abstract

In [None]:
# Read every page of the PDF and keep the text of ALL pages.
# Previously `text` was overwritten on each iteration, so only the last page
# was passed on to the summariser.
page_texts = []
with pdfplumber.open('/content/gdrive/MyDrive/NLP/f_zero/artikel/machine learning _id.pdf') as pdf:
    for page in pdf.pages:
        # Guard against a page that yields no extractable text.
        page_text = page.extract_text() or ""
        page_texts.append(page_text)

        # Show the text of the page.
        print(page_text)

# Full document text, pages separated by a line break.
text = "\n".join(page_texts)

Di tengah pesatnya perkembangan teknologi kecerdasan buatan atau artificial
intelligence (AI) saat ini. Belum banyak orang yang mengetahui bahwa kecerdasan
buatan itu terdiri dari beberapa cabang, salah satunya adalah machine
learning atau pembelajaran mesin. Teknologi machine learning (ML) ini merupakan
salah satu cabang dari AI yang sangat menarik perhatian, kenapa? Karena machine
learning merupakan mesin yang bisa belajar layaknya manusia.
Kembali pada kecerdasan buatan. Kecerdasan buatan pada pengaplikasiannya
secara garis besar terbagi tujuh cabang, yaitu machine learning, natural language
processing, expert system, vision, speech, planning dan robotics. Percabangan dari
kecerdasan buatan tersebut dimaksudkan untuk mempersempit ruang lingkup
saat pengembangan atau belajar AI, karena pada dasarnya kecerdasan buatan
memiliki ruang lingkup yang sangat luas.
Penjelasan lebih lengkap mengenai AI, kamu bisa membacanya pada artikel
berikut Apa Itu Kecerdasan Buatan? Berikut Pengertian da

In [None]:
# Summarise the text held in `text` with the Indonesian pipeline and print it.
result_id = summy_id(text)
print(result_id)

alphago akan memperbaiki cara bermain mereka dengan menggunakan fitur deteksi wajah yang ada di facebook untuk meningkatkan tingkat akurasi orang yang ada di foto


RUNNING ON FLASK

In [None]:
# validasi data inputan
def count_word(text):
   """Return the number of whitespace-separated words in `text`."""
   words = text.split()
   return len(words)

In [None]:
# Local port shared by the ngrok tunnel and the Flask development server.
port_no = 5000

In [None]:
from flask import Flask
from pyngrok import ngrok

In [None]:
import getpass
import os

from flask import url_for, request
from flask.templating import render_template

template_folder = "/content/gdrive/MyDrive/NLP/f_zero/templates"
static_folder = "/content/gdrive/MyDrive/NLP/f_zero/static"
app = Flask(__name__, template_folder=template_folder, static_folder=static_folder)

# SECURITY: never hard-code the ngrok auth token in the notebook. It is read
# from the NGROK_AUTHTOKEN environment variable, or asked for interactively
# (without echo) when the variable is not set. The token that was previously
# committed here should be revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)
public_url = ngrok.connect(port_no).public_url

# Accepted input size, in whitespace-separated words (both bounds inclusive).
MIN_WORDS = 100
MAX_WORDS = 3000


# "/" is served as well: the public URL printed below points at the root,
# which used to answer 404.
@app.route("/", methods=["POST", "GET"])
@app.route("/index", methods=["POST", "GET"])
def index():
  """Render the summariser page and handle form submissions.

  GET renders an empty form. POST reads the 'Textarea' and 'select1' form
  fields, validates the word count and runs the summariser for the chosen
  language.

  Returns:
    The rendered "index.html" template with `output` (summary or validation
    message) and `v_textarea` (the submitted text, echoed back to the form).
  """
  output = ""
  v_textarea = ""
  if request.method == 'POST':
    input_text = request.form['Textarea']
    language = request.form['select1']

    # Validate the number of words: at least MIN_WORDS, at most MAX_WORDS.
    n_words = count_word(input_text)
    if n_words < MIN_WORDS:
      output = "*Input sentences are too concise"
    elif n_words > MAX_WORDS:
      # The message now reports the limit that is actually enforced.
      output = f"*Input more than {MAX_WORDS} words"
    elif language == "english":
      output = summy_en(input_text)
    elif language == "indonesia":
      output = summy_id(input_text)
    else:
      output = "bahasa tidak terdeteksi"

    # Echo the submitted text back into the form.
    v_textarea = input_text
  return render_template("index.html", output=output, v_textarea=v_textarea)

print(f"To access the global link please click {public_url}")


app.run(port=port_no)





To access the global link please click https://d447-34-132-58-78.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:50:42] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:50:43] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:50:49] "GET /index HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:50:50] "GET /static/people.png HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:50:50] "GET /static/index.css HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:53:07] "POST /index HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:53:08] "[36mGET /static/index.css HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:53:08] "[36mGET /static/people.png HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:57:23] "POST /index HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/Sep/2023 03:57:23] "[36mGET /static/index.css HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 -