## 원-핫 인코딩 (One-Hot Encoding)

In [1]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Sample categorical data that the one-hot encoding example works on.
data = [
    'cold', 'cold', 'warm', 'cold', 'hot',
    'hot', 'warm', 'cold', 'warm', 'hot',
]
values = array(data)  # the same labels as a NumPy array
unique_labels = set(data)  # the distinct categories present in the data
print(unique_labels)
print(values)

{'warm', 'cold', 'hot'}
['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']


In [3]:
# Run integer encoding: fit a LabelEncoder on the labels and replace each
# label with its integer code (the printed result shows cold=0, hot=1, warm=2).
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

[0 0 2 0 1 1 2 0 2 1]


In [4]:
# Run one-hot encoding on the integer codes.
# sparse_output=False makes fit_transform return a dense array instead of a
# SciPy sparse matrix, so the result can be printed directly.
onehot_encoder = OneHotEncoder(sparse_output=False)
# OneHotEncoder works on 2-D input (samples x features), so turn the 1-D code
# vector into a single column; -1 lets NumPy infer the number of rows.
integer_encoded = integer_encoded.reshape(-1, 1)
# The array is already a column here, so it is printed as is (no second reshape).
print(integer_encoded)
print(integer_encoded.shape)
# Learn the categories and produce one indicator column per category.
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0]
 [0]
 [2]
 [0]
 [1]
 [1]
 [2]
 [0]
 [2]
 [1]]
(10, 1)
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


## BoW (Bag of Words)

In [5]:
# Import the required libraries and set up the example data.
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import pandas as pd
# Load spaCy's "en_core_web_sm" pipeline.
# NOTE(review): `nlp` is not referenced in the cells visible here - confirm it is used later.
nlp = spacy.load("en_core_web_sm")

# Declare the text used as the example: ten sentences, one per line, in which
# most words appear twice in a row. The literal starts with a newline and ends
# with a line that contains a single space.
text = """
The the quick brown fox jumped over the the lazy dog.
She she always always enjoys enjoys watching watching romantic romantic movies.
He he sang sang loudly loudly at at the top top of of his his lungs.
They they often often go go swimming swimming in in the the crystal crystal clear clear waters waters.
I I can't can't believe believe I I forgot forgot my my keys keys again again.
The the tall tall mountain mountain stood stood majestically majestically in in the the distance distance.
We we went went shopping shopping and and bought bought many many new new clothes clothes.
The the delicious delicious pizza pizza had had a a perfect perfect combination combination of of flavors flavors.
She she danced danced gracefully gracefully across across the the stage stage during during the the performance performance.
He He repeatedly repeatedly asked asked for for more more time time to to complete complete the the project project.
 """

In [6]:
# Store the sentences as a list by splitting the text on newline characters.
# NOTE(review): `text` starts with a newline and ends with "\n ", so the list
# also contains an empty first entry and a whitespace-only last entry
# (12 items for 10 sentences). Consider filtering blank entries if the row
# count of the resulting matrix should equal the number of sentences.
sentences_lst = text.split('\n')
sentences_lst

['',
 'The the quick brown fox jumped over the the lazy dog.',
 'She she always always enjoys enjoys watching watching romantic romantic movies.',
 'He he sang sang loudly loudly at at the top top of of his his lungs.',
 'They they often often go go swimming swimming in in the the crystal crystal clear clear waters waters.',
 "I I can't can't believe believe I I forgot forgot my my keys keys again again.",
 'The the tall tall mountain mountain stood stood majestically majestically in in the the distance distance.',
 'We we went went shopping shopping and and bought bought many many new new clothes clothes.',
 'The the delicious delicious pizza pizza had had a a perfect perfect combination combination of of flavors flavors.',
 'She she danced danced gracefully gracefully across across the the stage stage during during the the performance performance.',
 'He He repeatedly repeatedly asked asked for for more more time time to to complete complete the the project project.',
 ' ']

In [7]:
# Create a CountVectorizer with its default settings and keep it in a variable.
ctve = CountVectorizer()

# Build the vocabulary (token -> column index) from the sentence list.
ctve.fit(sentences_lst)

In [8]:
# Turn the sentence list into a count matrix using the fitted vocabulary.
dtm_count = ctve.transform(sentences_lst)
# One row per entry of sentences_lst, one column per vocabulary term.
n_documents, n_terms = dtm_count.shape
print((n_documents, n_terms))

(12, 69)


In [11]:
# Inspect the vocabulary: a dict that maps every token to its column index.
ctve.vocabulary_

{'the': 60,
 'quick': 50,
 'brown': 8,
 'fox': 24,
 'jumped': 31,
 'over': 45,
 'lazy': 33,
 'dog': 18,
 'she': 54,
 'always': 2,
 'enjoys': 20,
 'watching': 65,
 'romantic': 52,
 'movies': 40,
 'he': 28,
 'sang': 53,
 'loudly': 34,
 'at': 5,
 'top': 64,
 'of': 43,
 'his': 29,
 'lungs': 35,
 'they': 61,
 'often': 44,
 'go': 25,
 'swimming': 58,
 'in': 30,
 'crystal': 14,
 'clear': 10,
 'waters': 66,
 'can': 9,
 'believe': 6,
 'forgot': 23,
 'my': 41,
 'keys': 32,
 'again': 1,
 'tall': 59,
 'mountain': 39,
 'stood': 57,
 'majestically': 36,
 'distance': 17,
 'we': 67,
 'went': 68,
 'shopping': 55,
 'and': 3,
 'bought': 7,
 'many': 37,
 'new': 42,
 'clothes': 11,
 'delicious': 16,
 'pizza': 48,
 'had': 27,
 'perfect': 46,
 'combination': 12,
 'flavors': 21,
 'danced': 15,
 'gracefully': 26,
 'across': 0,
 'stage': 56,
 'during': 19,
 'performance': 47,
 'repeatedly': 51,
 'asked': 4,
 'for': 22,
 'more': 38,
 'time': 62,
 'to': 63,
 'complete': 13,
 'project': 49}

In [12]:
# Print the vocabulary terms in column order, followed by the full count matrix.
print(ctve.get_feature_names_out())
print(dtm_count.todense()) # Returns the data as a numpy.matrix.

['across' 'again' 'always' 'and' 'asked' 'at' 'believe' 'bought' 'brown'
 'can' 'clear' 'clothes' 'combination' 'complete' 'crystal' 'danced'
 'delicious' 'distance' 'dog' 'during' 'enjoys' 'flavors' 'for' 'forgot'
 'fox' 'go' 'gracefully' 'had' 'he' 'his' 'in' 'jumped' 'keys' 'lazy'
 'loudly' 'lungs' 'majestically' 'many' 'more' 'mountain' 'movies' 'my'
 'new' 'of' 'often' 'over' 'perfect' 'performance' 'pizza' 'project'
 'quick' 'repeatedly' 'romantic' 'sang' 'she' 'shopping' 'stage' 'stood'
 'swimming' 'tall' 'the' 'they' 'time' 'to' 'top' 'watching' 'waters' 'we'
 'went']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0
  0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0]
 [0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0

In [13]:
# Load the count matrix into a DataFrame with one column per vocabulary term.
# toarray() yields a plain ndarray; todense() would return the legacy
# numpy.matrix type, which NumPy no longer recommends using.
dtm_count_df = pd.DataFrame(dtm_count.toarray(), columns=ctve.get_feature_names_out())
dtm_count_df

Unnamed: 0,across,again,always,and,asked,at,believe,bought,brown,can,...,tall,the,they,time,to,top,watching,waters,we,went
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,4,0,0,0,0,0,0,0,0
2,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
3,0,0,0,0,0,2,0,0,0,0,...,0,1,0,0,0,2,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,2,2,0,0,0,0,2,0,0
5,0,2,0,0,0,0,2,0,0,2,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,2,4,0,0,0,0,0,0,0,0
7,0,0,0,2,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,2,2
8,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
9,2,0,0,0,0,0,0,0,0,0,...,0,4,0,0,0,0,0,0,0,0


## TF-IDF

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Read the IMDB review CSV from GitHub, using its first column as the index.
IMDB_CSV_URL = 'https://raw.githubusercontent.com/jin0choi1216/dataset/main/NLP/ENG/IMDB_Top10000.csv'
IMDB = pd.read_csv(IMDB_CSV_URL, index_col=0)
# Preview the first five rows.
IMDB.head()

Unnamed: 0,label,review
0,1,I rented I AM CURIOUS-YELLOW from my video sto...
1,1,"""I Am Curious: Yellow"" is a risible and preten..."
2,1,If only to avoid making this type of film in t...
3,1,This film was probably inspired by Godard's Ma...
4,1,"Oh, brother...after hearing about this ridicul..."


In [15]:
# Configure the TF-IDF vectorizer: drop English stop words and keep only the
# 200 terms with the highest frequency across the corpus.
tfidf = TfidfVectorizer(stop_words='english', max_features=200)

# Learn the vocabulary and IDF weights from the reviews and compute the sparse
# TF-IDF matrix. It gets its own name so that `dtm_tfidf` only ever refers to
# the DataFrame built below.
tfidf_matrix = tfidf.fit_transform(IMDB['review'])

# Wrap the weights in a DataFrame with one column per term. toarray() yields a
# plain ndarray; todense() would return the legacy numpy.matrix type.
dtm_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
dtm_tfidf

Unnamed: 0,10,acting,action,actor,actors,actually,american,audience,away,awful,...,women,work,world,worse,worst,worth,wrong,year,years,young
0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.113648,0.124645
1,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.263183,0.000000,0.000000,0.0,...,0.532178,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.211109,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
3,0.209794,0.162316,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.236374,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
4,0.000000,0.000000,0.0,0.0,0.084072,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.280076,0.102392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.141467,0.0,...,0.000000,0.000000,0.000000,0.0,0.121372,0.153471,0.0,0.0,0.000000,0.000000
9996,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.168923,0.000000,0.0,0.0,0.000000,0.000000
9997,0.000000,0.127514,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.163831,0.000000,0.0,0.000000,0.193990,0.0,0.0,0.000000,0.000000
9998,0.147932,0.114454,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
