<a href="https://colab.research.google.com/github/dau-J/pypi_textminor/blob/master/textminer_pro_oss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# textminer-pro: 텍스트 전처리 분석

## 정현성

NLTK, scikit-learn, Sumy, langdetect

- 불용어 제거 (remove_stopwords)
- 키워드 추출 (extract_keywords)
- 텍스트 요약 (summarize_text)
- 언어 감지 (detect_language)

# 폴더 생성

In [2]:
# Remove any previous copy of the project, then create the package, test,
# and GitHub Actions workflow directories.
!rm -rf textminer_pro
!mkdir -p textminer_pro/textminer
!mkdir -p textminer_pro/tests
!mkdir -p textminer_pro/.github/workflows

# 필요 라이브러리 설치

In [3]:
!pip install langdetect
!pip install sumy
!pip install twine



# cleaner.py

In [4]:
%%writefile textminer_pro/textminer/cleaner.py
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt sentence models. Recent NLTK releases look them
# up under 'punkt_tab' while older ones use 'punkt', so request both, together
# with the stop-word lists.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# ISO 639-1 codes (the form langdetect-style detectors return, e.g. "en")
# mapped to the corpus names NLTK's stop-word lists are stored under.
_ISO_TO_NLTK_LANG = {
    'da': 'danish',
    'de': 'german',
    'en': 'english',
    'es': 'spanish',
    'fi': 'finnish',
    'fr': 'french',
    'hu': 'hungarian',
    'it': 'italian',
    'nl': 'dutch',
    'no': 'norwegian',
    'pt': 'portuguese',
    'ru': 'russian',
    'sv': 'swedish',
    'tr': 'turkish',
}


def remove_stopwords(text: str, lang='english'):
    """Return ``text`` with the stop words of ``lang`` removed.

    Args:
        text: Input text; it is split with NLTK's ``word_tokenize``.
        lang: Either an NLTK stop-word corpus name (``'english'``) or one of
            the ISO 639-1 codes in ``_ISO_TO_NLTK_LANG`` (``'en'``). Matching
            is case-insensitive.

    Returns:
        The remaining tokens joined by single spaces. Punctuation is tokenized
        separately, so it comes back space-separated as well.

    Raises:
        OSError: Propagated from NLTK when no stop-word list exists for ``lang``.
    """
    key = lang.lower()
    # Unknown keys are passed through unchanged so full corpus names keep working.
    corpus_name = _ISO_TO_NLTK_LANG.get(key, key)
    stop_words = set(stopwords.words(corpus_name))
    tokens = word_tokenize(text)
    # Compare case-insensitively: the stop-word lists are lower-case.
    filtered = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered)


Writing textminer_pro/textminer/cleaner.py


# keyword.py

In [5]:
%%writefile textminer_pro/textminer/keyword.py
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords(text: str, top_n=5):
    """Return up to ``top_n`` highest-scoring terms of ``text``.

    The text is vectorised as a single document with scikit-learn's
    ``TfidfVectorizer`` (English stop words removed) and the terms are ranked
    by their score in that one-row matrix. Ties keep the vectorizer's feature
    order because the sort is stable.

    Args:
        text: The document to extract keywords from.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n`` terms, best first. An empty list is
        returned when ``top_n`` is not positive, when ``text`` is blank, or
        when no term survives tokenisation and stop-word removal.
    """
    # Nothing to rank: avoid the vectorizer's "empty vocabulary" error on blank
    # input and the surprising slice a non-positive top_n would produce.
    if top_n <= 0 or not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # Raised when the text contains only stop words / no usable tokens;
        # such a text simply has no keywords.
        return []

    scores = zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0])
    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:top_n]]

Writing textminer_pro/textminer/keyword.py


# summarizer.py

In [6]:
%%writefile textminer_pro/textminer/summarizer.py

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer


def summarize_text(text: str, ratio=0.2, language='english'):
    """Summarise ``text`` with sumy's LSA summarizer.

    The text is parsed using a sumy ``Tokenizer`` for ``language``. The summary
    keeps ``int(number_of_sentences * ratio)`` sentences, but never fewer than
    one, and the chosen sentences are returned joined by single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer(language)).document
    # Always request at least one sentence, even for short texts or tiny ratios.
    keep = max(1, int(len(document.sentences) * ratio))
    lsa = LsaSummarizer()
    picked = [str(sentence) for sentence in lsa(document, keep)]
    return ' '.join(picked)

Writing textminer_pro/textminer/summarizer.py


# detector.py

In [7]:
%%writefile textminer_pro/textminer/detector.py
from langdetect import detect

def detect_language(text: str):
    """Return the language code detected for ``text``, or ``"unknown"``.

    Args:
        text: The text whose language should be identified.

    Returns:
        Whatever ``langdetect.detect`` returns for ``text``; ``"unknown"`` when
        the input is not a string, is blank, or detection fails.
    """
    # Blank or non-string input cannot be classified; skip the detector call.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Detection is best-effort, so failures map to "unknown". Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return "unknown"

Writing textminer_pro/textminer/detector.py


# __init__.py

In [8]:
%%writefile textminer_pro/textminer/__init__.py
from .cleaner import remove_stopwords
from .keyword import extract_keywords
from .summarizer import summarize_text
from .detector import detect_language

Writing textminer_pro/textminer/__init__.py


# README.md

In [9]:
%%writefile textminer_pro/README.md

# textminerpro

A Korean-friendly text preprocessing toolkit that supports:

- remove_stopwords(text, lang='english')
- extract_keywords(text, top_n=5)
- summarize_text(text, ratio=0.2)
- detect_language(text)

Writing textminer_pro/README.md


# setup.py

In [10]:
%%writefile textminer_pro/setup.py
from pathlib import Path

from setuptools import setup, find_packages

# Read the README relative to this file (not the current working directory);
# read_text also closes the file handle, which a bare open(...).read() leaves open.
LONG_DESCRIPTION = (Path(__file__).parent / 'README.md').read_text(encoding='utf-8')

setup(
    name='textminerpro-hyeonsung',
    version='0.0.1',
    author='Ï†ïÌòÑÏÑ±',
    author_email='2254784@donga.ac.kr',
    # Ship only the library package; never pick up a tests package by accident.
    packages=find_packages(exclude=['tests', 'tests.*']),
    install_requires=[
        'nltk',
        # keyword.py calls TfidfVectorizer.get_feature_names_out, added in 1.0.
        'scikit-learn>=1.0',
        'sumy',
        'langdetect'
    ],
    description='Advanced text preprocessing package',
    long_description=LONG_DESCRIPTION,
    long_description_content_type='text/markdown',
    url='https://github.com/dau-J/pypi_textminor.git',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    # scikit-learn>=1.0 does not support Python 3.6.
    python_requires='>=3.7',
)


Writing textminer_pro/setup.py


# test_cleaner.py

In [11]:
%%writefile textminer_pro/tests/test_cleaner.py
from textminer.cleaner import remove_stopwords
from nltk.tokenize import word_tokenize

def test_remove_stopwords():
    """English stop words are dropped while content words are kept."""
    cleaned_tokens = word_tokenize(remove_stopwords("This is a sample sentence"))

    for stop_word in ("is", "a"):
        assert stop_word not in cleaned_tokens
    for content_word in ("sample", "sentence"):
        assert content_word in cleaned_tokens


Writing textminer_pro/tests/test_cleaner.py


# test_detector.py

In [12]:
%%writefile textminer_pro/tests/test_detector.py
from textminer.detector import detect_language

def test_detect_language():
    """A plain English sentence is reported as ``en``."""
    # langdetect's algorithm is randomised by default; pin its seed so this
    # short-sentence check cannot flake between runs.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    text = "This is an English sentence."
    lang = detect_language(text)
    assert lang == "en"


Writing textminer_pro/tests/test_detector.py


# bdist_wheel

In [13]:
# Build from the project root: setup.py opens README.md via a relative path.
%cd /content/textminer_pro
# Build a source distribution and a wheel (the upload step reads them from dist/).
!python setup.py sdist bdist_wheel

/content/textminer_pro
running sdist
running egg_info
creating textminerpro_hyeonsung.egg-info
writing textminerpro_hyeonsung.egg-info/PKG-INFO
writing dependency_links to textminerpro_hyeonsung.egg-info/dependency_links.txt
writing requirements to textminerpro_hyeonsung.egg-info/requires.txt
writing top-level names to textminerpro_hyeonsung.egg-info/top_level.txt
writing manifest file 'textminerpro_hyeonsung.egg-info/SOURCES.txt'
reading manifest file 'textminerpro_hyeonsung.egg-info/SOURCES.txt'
writing manifest file 'textminerpro_hyeonsung.egg-info/SOURCES.txt'
running check
creating textminerpro_hyeonsung-0.0.1
creating textminerpro_hyeonsung-0.0.1/tests
creating textminerpro_hyeonsung-0.0.1/textminer
creating textminerpro_hyeonsung-0.0.1/textminerpro_hyeonsung.egg-info
copying files to textminerpro_hyeonsung-0.0.1...
copying README.md -> textminerpro_hyeonsung-0.0.1
copying setup.py -> textminerpro_hyeonsung-0.0.1
copying tests/test_cleaner.py -> textminerpro_hyeonsung-0.0.1/tests

# token-apiÎ•º ÏÇ¨Ïö©Ìï¥ÏÑú Î∞∞Ìè¨(toekn-apiÏóê api-key)

In [14]:
!pip install twine
!twine upload -u __token__ -p "pypi token ÏûêÎ¶¨ÏûÖÎãàÎã§ " dist/*


Uploading distributions to https://upload.pypi.org/legacy/
Uploading textminerpro_hyeonsung-0.0.1-py3-none-any.whl
[2K[35m100%[0m [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.3/6.3 kB[0m ‚Ä¢ [33m00:00[0m ‚Ä¢ [31m?[0m
[?25hUploading textminerpro_hyeonsung-0.0.1.tar.gz
[2K[35m100%[0m [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m5.6/5.6 kB[0m ‚Ä¢ [33m00:00[0m ‚Ä¢ [31m?[0m
[?25h
[32mView at:[0m
https://pypi.org/project/textminerpro-hyeonsung/0.0.1/


# nltk download

In [15]:
# Download the NLTK tokenizer models and stop-word lists that the package's
# word_tokenize / stopwords calls rely on, before running the tests.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# pytest

In [16]:
# Install the package from the current directory, then run the tests in tests/.
!pip install .
!pytest tests

Processing /content/textminer_pro
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: textminerpro-hyeonsung
  Building wheel for textminerpro-hyeonsung (setup.py) ... [?25l[?25hdone
  Created wheel for textminerpro-hyeonsung: filename=textminerpro_hyeonsung-0.0.1-py3-none-any.whl size=3158 sha256=732b0c3a7dffb4969ccbc1b23a5b3f144a4734f26c055e5d5d52b2a8db183930
  Stored in directory: /tmp/pip-ephem-wheel-cache-1l88noip/wheels/2a/d4/4e/7dd7c42528f76921e6532a65c2244e23707490a7d4ee6e166c
Successfully built textminerpro-hyeonsung
Installing collected packages: textminerpro-hyeonsung
Successfully installed textminerpro-hyeonsung-0.0.1
platform linux -- Python 3.11.13, pytest-8.3.5, pluggy-1.6.0
rootdir: /content/textminer_pro
plugins: langsmith-0.3.44, anyio-4.9.0, typeguard-4.4.2
collected 2 items                                                              [0m

tests/test_cleaner.py [32m.[0m[32m                                                

# 배포한 라이브러리 다운로드

In [32]:
pip install -i https://test.pypi.org/simple/ textminerpro-hyeonsung

Looking in indexes: https://test.pypi.org/simple/


# 프로그램 동작 테스트

In [33]:
from textminer import remove_stopwords, extract_keywords, summarize_text, detect_language

# Sample texts for the smoke test
text_ko = "oss ÏàòÏóÖÏù¥ ÎÑàÎ¨¥ Ï¶êÍ±∞Ïõ†ÏäµÎãàÎã§ ÍµêÏàòÎãò Í∑∏ÎèôÏïà Í≥†ÏÉùÌïòÏÖ®ÏäµÎãàÎã§"
text_en = "The OSS class was really enjoyable. Thank you professor for your efforts."
# Korean (every function is called with its defaults, i.e. English stop words / tokenizer)
print("KOREAN")
print("Î∂àÏö©Ïñ¥ Ï†úÍ±∞:", remove_stopwords(text_ko))
print("ÌÇ§ÏõåÎìú Ï∂îÏ∂ú:", extract_keywords(text_ko))
print("ÏöîÏïΩ:", summarize_text(text_ko))
print("Ïñ∏Ïñ¥ Í∞êÏßÄ:", detect_language(text_ko))

# English
print("\nENGLISH")
print("Î∂àÏö©Ïñ¥ Ï†úÍ±∞:", remove_stopwords(text_en))
print("ÌÇ§ÏõåÎìú Ï∂îÏ∂ú:", extract_keywords(text_en))
print("ÏöîÏïΩ:", summarize_text(text_en))
print("Ïñ∏Ïñ¥ Í∞êÏßÄ:", detect_language(text_en))


KOREAN
Î∂àÏö©Ïñ¥ Ï†úÍ±∞: oss ÏàòÏóÖÏù¥ ÎÑàÎ¨¥ Ï¶êÍ±∞Ïõ†ÏäµÎãàÎã§ ÍµêÏàòÎãò Í∑∏ÎèôÏïà Í≥†ÏÉùÌïòÏÖ®ÏäµÎãàÎã§
ÌÇ§ÏõåÎìú Ï∂îÏ∂ú: ['oss', 'Í≥†ÏÉùÌïòÏÖ®ÏäµÎãàÎã§', 'ÍµêÏàòÎãò', 'Í∑∏ÎèôÏïà', 'ÎÑàÎ¨¥']
ÏöîÏïΩ: oss ÏàòÏóÖÏù¥ ÎÑàÎ¨¥ Ï¶êÍ±∞Ïõ†ÏäµÎãàÎã§ ÍµêÏàòÎãò Í∑∏ÎèôÏïà Í≥†ÏÉùÌïòÏÖ®ÏäµÎãàÎã§
Ïñ∏Ïñ¥ Í∞êÏßÄ: ko

ENGLISH
Î∂àÏö©Ïñ¥ Ï†úÍ±∞: OSS class really enjoyable . Thank professor efforts .
ÌÇ§ÏõåÎìú Ï∂îÏ∂ú: ['class', 'efforts', 'enjoyable', 'oss', 'professor']
ÏöîÏïΩ: Thank you professor for your efforts.
Ïñ∏Ïñ¥ Í∞êÏßÄ: en


# github release 및 pypi release 자동화 워크플로우 설정

In [37]:
%%writefile .github/workflows/release.yml
name: Build and Publish to PyPI + GitHub Release

# Run for every version tag (v0.0.1, v0.1.0, ...), not only the first release.
# The version in setup.py must be bumped to match before a new tag is pushed,
# otherwise the PyPI upload of an already-published version is rejected.
on:
  push:
    tags:
      - 'v*'

jobs:
  deploy:
    runs-on: ubuntu-latest
    # Creating a GitHub Release requires write access to the repository contents.
    permissions:
      contents: write

    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.x'

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install setuptools wheel twine

    # Produces dist/*.tar.gz and dist/*.whl, consumed by the two steps below.
    - name: Build package
      run: |
        python setup.py sdist bdist_wheel

    # The token comes from the repository secret and is passed to twine through
    # its environment variables, so it never appears in the workflow file.
    - name: Publish to PyPI
      env:
        TWINE_USERNAME: __token__
        TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
      run: |
        twine upload dist/*

    - name: Create GitHub Release
      uses: softprops/action-gh-release@v1
      with:
        files: |
          dist/*.whl
          dist/*.tar.gz

Overwriting .github/workflows/release.yml


In [41]:
# List branches, rename gh-pages to master, then push master and set it as upstream.
# CAUTION: --force overwrites the history of the remote master branch.
!git branch
!git branch -m gh-pages master
!git push -u origin master --force

* [32mgh-pages[m
Enumerating objects: 52, done.
Counting objects: 100% (52/52), done.
Delta compression using up to 2 threads
Compressing objects: 100% (40/40), done.
Writing objects: 100% (51/51), 15.93 KiB | 1.99 MiB/s, done.
Total 51 (delta 4), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (4/4), done.[K
To https://github.com/dau-J/pypi_textminor.git
 + 9723352...3b57258 master -> master (forced update)
Branch 'master' set up to track remote branch 'master' from 'origin'.


In [42]:
# Stage everything, commit the release workflow, and push master.
!git add .
!git commit -m "Add GitHub Actions workflow for release"
!git push origin master

On branch master
Your branch is up to date with 'origin/master'.

nothing to commit, working tree clean
Everything up-to-date


In [43]:
# Tag the current commit and push the tag; the release workflow's tag filter
# matches v0.0.1, so this push starts the build-and-publish job.
!git tag v0.0.1
!git push origin v0.0.1

Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/dau-J/pypi_textminor.git
 * [new tag]         v0.0.1 -> v0.0.1


# MKDOCS 문서 자동생성

In [44]:
# Create the docs directory that the MkDocs scaffold is moved into.
# NOTE(review): the path is relative to the current working directory — confirm
# it resolves to the intended project folder after the earlier %cd.
!mkdir -p textminer_pro/docs

In [45]:
!pip install mkdocs

# Generate a throwaway MkDocs project to obtain the default docs and config.
!mkdocs new docs-site

# Move the generated docs and mkdocs.yml into the project, then delete the scaffold.
!mv docs-site/docs/* textminer_pro/docs/
!mv docs-site/mkdocs.yml textminer_pro/
!rm -r docs-site

INFO    -  Creating project directory: docs-site
INFO    -  Writing config file: docs-site/mkdocs.yml
INFO    -  Writing initial docs: docs-site/docs/index.md


In [46]:
%%writefile mkdocs.yml
# MkDocs site configuration.
site_name: textminer-pro
repo_url: https://github.com/dau-J/pypi_textminor

# Navigation entries: a home page (index.md) and a usage page (usage.md).
nav:
  - Ìôà: index.md
  - ÏÇ¨Ïö©Î≤ï: usage.md

# Use the built-in "readthedocs" theme.
theme:
  name: readthedocs

Overwriting mkdocs.yml


In [47]:
# Make sure docs/ exists in the current directory before the pages are written.
!mkdir -p docs

In [48]:
%%writefile docs/index.md

# pypi_textminor
pypi ÎùºÏù¥Î∏åÎü¨Î¶¨ ÌÖçÏä§Ìä∏ÎßàÏù¥Îãù ÌîÑÎ°úÍ∑∏Îû®

textminer-pro
A Korean-friendly text preprocessing toolkit that supports:

- remove_stopwords(text, lang='english')
- extract_keywords(text, top_n=5)
- summarize_text(text, ratio=0.2)
- detect_language(text)

## pypiÌå®ÌÇ§ÏßÄ ÎßÅÌÅ¨ Î∞è ÏÇ¨ÏßÑ
pypiÌå®ÌÇ§ÏßÄ ÎßÅÌÅ¨ [https://pypi.org/project/textminerpro-hyeonsung/]
--------------------------------------------------------------------------------------------
![image](https://github.com/user-attachments/assets/1d0adada-88e6-43da-8bb1-c1be6be1c624)




## üìÅ ÌîÑÎ°úÏ†ùÌä∏ Ìè¥Îçî Íµ¨Ï°∞
------------------------------------------------------------------------------------------------

```
textminer_pro/
‚îú‚îÄ‚îÄ textminer/
‚îÇ   ‚îú‚îÄ‚îÄ __init__.py
‚îÇ   ‚îú‚îÄ‚îÄ cleaner.py
‚îÇ   ‚îú‚îÄ‚îÄ summarizer.py
‚îÇ   ‚îî‚îÄ‚îÄ detector.py
‚îÇ
‚îú‚îÄ‚îÄ tests/
‚îÇ   ‚îú‚îÄ‚îÄ test_cleaner.py
‚îÇ   ‚îî‚îÄ‚îÄ test_detector.py
‚îÇ
‚îú‚îÄ‚îÄ setup.py
‚îú‚îÄ‚îÄ README.md
‚îî‚îÄ‚îÄ .github/
    ‚îî‚îÄ‚îÄ workflows/
        ‚îî‚îÄ‚îÄ pypi.yml
```



### Í∏∞Îä•

* **ÌÖçÏä§Ìä∏ Ï†ïÏ†ú:** Î∂àÏö©Ïñ¥Î•º Ìö®Ïú®Ï†ÅÏúºÎ°ú Ï†úÍ±∞ÌïòÍ≥† Îã§Î•∏ ÌÖçÏä§Ìä∏ Ï†ïÍ∑úÌôî ÏûëÏóÖÏùÑ ÏàòÌñâÌï©ÎãàÎã§.
* **ÌÖçÏä§Ìä∏ ÏöîÏïΩ:** Í∏¥ ÌÖçÏä§Ìä∏Ïùò Í∞ÑÍ≤∞Ìïú ÏöîÏïΩÏùÑ ÏÉùÏÑ±Ìï©ÎãàÎã§.
* **Ïñ∏Ïñ¥ Í∞êÏßÄ:** Ï£ºÏñ¥ÏßÑ ÌÖçÏä§Ìä∏Ïùò Ïñ∏Ïñ¥Î•º ÏûêÎèôÏúºÎ°ú ÏãùÎ≥ÑÌï©ÎãàÎã§.

### ÏÑ§Ïπò

pipÎ•º ÌÜµÌï¥ TextMiner ProÎ•º ÏÑ§ÏπòÌï† Ïàò ÏûàÏäµÎãàÎã§:

```bash
pip install textminerpro-hyeonsung==0.0.1
```

Overwriting docs/index.md


In [49]:
%%writefile docs/usage.md
# 사용예시

```python
from textminer import remove_stopwords, extract_keywords, summarize_text, detect_language

text = "OSS 수업은 정말 즐거웠습니다."

# 불용어 제거
print(remove_stopwords(text, lang="english"))

# 키워드 추출
print(extract_keywords(text))

# 텍스트 요약
print(summarize_text(text))

# 언어 감지
print(detect_language(text))
```



Overwriting docs/usage.md


# Git 연동 및 Github pages 호스팅

In [53]:
# List branches, then rename master to gh-pages.
# NOTE(review): the recorded output shows gh-pages already checked out — confirm
# this rename is still needed.
!git branch
!git branch -m master gh-pages

* [32mgh-pages[m


In [54]:
# Drop the existing 'origin' remote so it can be registered again in the next cell.
!git remote remove origin

In [55]:
# 1. Git Ï¥àÍ∏∞Ìôî
!git init

# 2. ÏÇ¨Ïö©Ïûê Ï†ïÎ≥¥ Îì±Î°ù
!git config --global user.email "2254784@donga.ac.kr"
!git config --global user.name "dau-J"

# 3. remote Îì±Î°ù
!git remote add origin https://dau-J: github token ÏûêÎ¶¨ÏûÖÎãàÎã§ @github.com/dau-J/pypi_textminor.git

# 4. Î≥ÄÍ≤Ω ÌååÏùº add & commit
!git add .
!git commit -m "mkdocs deploy test"

# 5. github-pages Î∏åÎûúÏπòÎ°ú push
!git branch -M gh-pages
!git push -u origin

# 6. mkdocs Î∞∞Ìè¨
!mkdocs gh-deploy --force

Reinitialized existing Git repository in /content/textminer_pro/.git/
[gh-pages 2b1df95] mkdocs deploy test
 1 file changed, 56 insertions(+), 53 deletions(-)
 rewrite docs/index.md (86%)
fatal: The current branch gh-pages has no upstream branch.
To push the current branch and set the remote as upstream, use

    git push --set-upstream origin gh-pages

INFO    -  Cleaning site directory
INFO    -  Building documentation to directory: /content/textminer_pro/site
INFO    -  Documentation built in 0.10 seconds
INFO    -  Copying '/content/textminer_pro/site' to 'gh-pages' branch and pushing to GitHub.
Enumerating objects: 50, done.
Counting objects: 100% (50/50), done.
Delta compression using up to 2 threads
Compressing objects: 100% (44/44), done.
Writing objects: 100% (47/47), 2.67 MiB | 4.14 MiB/s, done.
Total 47 (delta 4), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (4/4), completed with 1 local object.[K
To https://github.com/dau-J/pypi_textminor.git
   e9791fd

In [57]:
# List branches, then try to rename gh-pages back to master.
# NOTE(review): the recorded output shows this rename failing because a branch
# named 'master' already exists.
!git branch
!git branch -m gh-pages master

* [32mmaster[m
fatal: A branch named 'master' already exists.


# 폴더 압축 및 다운로드

In [30]:
# Zip the whole project folder (here: textminer_pro) into /content/textminer_pro.zip
import shutil
shutil.make_archive('/content/textminer_pro', 'zip', '/content/textminer_pro')

'/content/textminer_pro.zip'

In [31]:
# Hand the zip archive to the browser as a download (Colab-only helper).
from google.colab import files
files.download('/content/textminer_pro.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>