<a href="https://colab.research.google.com/github/freamwork97/TIS/blob/main/%EC%A2%8B%EC%9D%80%EA%B8%80%EA%B7%80%EC%A0%9C%EA%B3%B5%ED%95%98%EB%8A%94gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import requests
from bs4 import BeautifulSoup


# 크롤링할 URL
url = 'https://www.goodreads.com/quotes/tag/{}?page={}'

# 크롤링할 태그
tag = 'inspirational'

# 크롤링할 페이지 수
pages = 5

# 가져올 데이터를 저장할 빈 리스트 생성
quotes = []

# 페이지 수만큼 반복하여 데이터 수집
for page in range(1, pages+1):
    # 페이지 URL 생성
    page_url = url.format(tag, page)
    
    # requests 모듈을 사용하여 페이지 요청
    response = requests.get(page_url)
    
    # BeautifulSoup 모듈을 사용하여 HTML 파싱
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # 인용구 데이터 추출
    quote_divs = soup.find_all('div', attrs={'class': 'quote'})
    
    # 각 인용구에서 필요한 데이터 추출하여 리스트에 추가
    for quote_div in quote_divs:
        text = quote_div.find('div', attrs={'class': 'quoteText'}).text.strip()
        author = quote_div.find('span', attrs={'class': 'authorOrTitle'}).text.strip()
        likes_elem = quote_div.find('a', attrs={'class': 'smallText', 'href': '#'})
        if likes_elem is not None:
          likes = likes_elem.text.strip()
        else:
          likes = 0

        quotes.append((text, author, likes))

In [4]:
# 불용어 및 특수문자 제거, 토큰화
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def preprocess(text):
    # 소문자로 변환
    text = text.lower()
    # 특수문자 제거
    text = re.sub(r'[^\w\s]', '', text)
    # 불용어 제거
    text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])
    return text

# 전처리 적용
quotes_clean = [(preprocess(text), author, likes) for text, author, likes in quotes]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# GPT-2 모델 및 토크나이저 불러오기
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# tokenizer 초기화
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# padding token 추가
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# 학습 데이터 토큰화
inputs = tokenizer('\n'.join([text for text, _, _ in quotes_clean]), return_tensors='pt', padding=True, truncation=True)

# 모델 학습
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
for epoch in range(5):
    loss = 0
    for i in range(0, inputs['input_ids'].size(0), 64):
        input_ids = inputs['input_ids'][i:i+64]
        attention_mask = inputs['attention_mask'][i:i+64]
        labels = input_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss += outputs.loss.item()
        optimizer.zero_grad()
        outputs.loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1} loss: {loss / (i+1)}')

Epoch 1 loss: 6.657120704650879
Epoch 2 loss: 6.363989353179932
Epoch 3 loss: 6.093652248382568
Epoch 4 loss: 5.910400867462158
Epoch 5 loss: 5.711087703704834


In [9]:
import gradio as gr

def generate_quote(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, repetition_penalty=1.5, length_penalty=1.0)
    generated_quote = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_quote

input = gr.inputs.Textbox(label="Input text")
output = gr.outputs.Textbox(label="Output text")
title = '영어로 단어나 문장의 일부를 적어주세요 좋은 글귀를 적어줄게요'
Demo = gr.Interface(fn = generate_quote, inputs = input, outputs = output, title = title)
Demo.launch()



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

