In [1]:
text_data = ["   Interrobang. By Aishwarya Henriette       ", "Parking And Going. By Karl Gautier","   Today Is The night. By Jarek Prakash"]

In [2]:
strip_whitespace = [string.strip() for string in text_data]

strip_whitespace

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [3]:
remove_periods = [string.replace(".","") for string in strip_whitespace]

remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [4]:
def capitalizer(string : str) -> str:
    return string.upper()

[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [5]:
import re

def replace_letters_with_X(string : str) -> str:
    return re.sub(r"[a-zA-Z]","X",string)

[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

### HTML 파상과 정제하기

In [7]:
from bs4 import BeautifulSoup

html = """
       <div class = 'full_name'><span style = 'font-weight:bold'>
       Masego</span> Azra</div>"
       """

soup = BeautifulSoup(html,'lxml')
soup.find("div",{"class" : "full_name" }).text

'\n       Masego Azra'

### 구두점 삭제하기

In [8]:
import unicodedata
import sys

text_data = ['Hi!!! I. Love. This. Song....',
            '10000% Agree!!!! #LoveIT',
            'Right?!?!']

punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

### 텍스트 토큰화하기

In [9]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

string = "The science of today is the technology  of tomorrow"

word_tokenize(string)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jlee0\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [10]:
from nltk.tokenize import sent_tokenize

string = "The science of today is the technology of tomorrow. Tomorrow is today."

sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

### 불용어 삭제하기

In [11]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

tokenized_words = ['i','am','going','to','go','to','the','store','and','park']

stop_words = stopwords.words('english')

[word for word in tokenized_words if word not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jlee0\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['going', 'go', 'store', 'park']

In [12]:
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [13]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

len(ENGLISH_STOP_WORDS), len(stop_words)

(318, 179)

In [14]:
list(ENGLISH_STOP_WORDS)[:5]

['indeed', 'neither', 'otherwise', 'five', 'seemed']

### 어간 추출하기

In [15]:
from nltk.stem.porter import PorterStemmer

tokenized_words = ['i','am','humbled','by','this','traditional','meeting']

porter = PorterStemmer()

[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

### 품사 태깅하기

In [17]:
import nltk
nltk.download('averaged_perceptron_tagger')

from nltk import pos_tag
from nltk import word_tokenize

text_data = "Chris loved outdoor running"

text_tagged = pos_tag(word_tokenize(text_data))

text_tagged

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jlee0\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [18]:
[word for word, tag in text_tagged if tag in ['NN','NNS','NNP','NNPS']]

['Chris']

In [19]:
from sklearn.preprocessing import MultiLabelBinarizer

tweets = ['I am eating a burrito for breakfast','Political science is an amazing field','San Francisco is an awesome city']

tagged_tweets = []

for tweet in tweets:
    tweet_tag = nltk.pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])
    
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [20]:
one_hot_multi.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

In [21]:
import nltk
nltk.download('brown')

from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

sentences = brown.tagged_sents(categories = 'news')

train = sentences[:4000]
test = sentences[4000:]

unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff = unigram)
trigram = TrigramTagger(train, backoff = bigram)

trigram.evaluate(test)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\jlee0\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


0.8174734002697437

In [25]:
from konlpy.tag import Okt

okt = Okt()

text = '태양계는 지금으로부터 약 46억 년 전, 거대한 분자 구름의 일부분이 중력붕괴를 일으키면서 형성되었다'

okt.pos(text)

JVMNotFoundException: No JVM shared library file (jvm.dll) found. Try setting up the JAVA_HOME environment variable properly.

### 텍스트를 BoW로 인코딩하기

In [26]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

text_data = np.array(['I love Brazil. Brazil!','Sweden is best','Germany beats both'])

count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

bag_of_words

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [27]:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [28]:
count.get_feature_names()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [29]:
count_2gram = CountVectorizer(ngram_range = (1,2),stop_words = "english",vocabulary = ['brazil'])
bag = count_2gram.fit_transform(text_data)

bag.toarray()

array([[2],
       [0],
       [0]], dtype=int64)

In [30]:
count_2gram.vocabulary_

{'brazil': 0}

### 단어 중요도에 가중치 부여하기

In [31]:
import numpy as np
from sklearn.feature_extraction.text import TfidVectorizer

text_data = np.array(['I love Brazil. Brazil!','Sweden is best','Germany beats both'])

tfidf = TfidVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

feature_matrix

ImportError: cannot import name 'TfidVectorizer' from 'sklearn.feature_extraction.text' (C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py)