In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# 말뭉치 => 사전 생성
corpus = ["you know I want your love", 
          "I like you",
          "what should I do", 
          "I hate you", 
          "smile face"]

In [3]:
cv = CountVectorizer()

In [4]:
# 1. 사전 만들기 => fit()
# 2. 사전에 들어 있는 id로 인코딩 => transform()
# 3. 1, 2를 함께 처리하고자 하는 경우 => fir_transform()

In [5]:
cv.fit_transform(corpus) # sparse matrix(희소행렬) 만들기

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [6]:
cv.fit_transform(corpus).toarray()

array([[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]])

In [7]:
cv.vocabulary_

{'do': 0,
 'face': 1,
 'hate': 2,
 'know': 3,
 'like': 4,
 'love': 5,
 'should': 6,
 'smile': 7,
 'want': 8,
 'what': 9,
 'you': 10,
 'your': 11}

In [8]:
tfidf = TfidfVectorizer()

In [9]:
tfidf.fit(corpus)

TfidfVectorizer()

In [10]:
tfidf = tfidf.transform(corpus)
tfidf

<5x12 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [11]:
tfidf.toarray()

array([[0.        , 0.        , 0.        , 0.47412465, 0.        ,
        0.47412465, 0.        , 0.        , 0.47412465, 0.        ,
        0.3175268 , 0.47412465],
       [0.        , 0.        , 0.        , 0.        , 0.83088075,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.55645052, 0.        ],
       [0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.        , 0.57735027,
        0.        , 0.        ],
       [0.        , 0.        , 0.83088075, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.55645052, 0.        ],
       [0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        ]])

In [12]:
sentence = ["i like like like hate smile"]

In [13]:
nd2array = cv.transform(sentence).toarray()
nd2array # 하나여도 2차원

array([[0, 0, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0]])

In [14]:
nd2array[0] # 꺼내려면 인덱싱 / 슬라이싱 필요

array([0, 0, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0])

In [15]:
tfidf = TfidfVectorizer()
nd2array = tfidf.fit_transform(sentence).toarray()
nd2array # 하나여도 2차원

array([[0.30151134, 0.90453403, 0.30151134]])

In [16]:
cv.inverse_transform(nd2array)

[array(['do', 'face', 'hate'], dtype='<U6')]

In [17]:
tfidf.inverse_transform(nd2array)

[array(['hate', 'like', 'smile'], dtype='<U5')]

In [18]:
feature_list = cv.get_feature_names() # index없이 column 값만 추출 => "사전에 무엇이 들어있는지 확인"
feature_list



['do',
 'face',
 'hate',
 'know',
 'like',
 'love',
 'should',
 'smile',
 'want',
 'what',
 'you',
 'your']

In [19]:
# dic <--- list
dictionary = dict()
for index, element in enumerate(feature_list):
  dictionary[index] = element
dictionary

{0: 'do',
 1: 'face',
 2: 'hate',
 3: 'know',
 4: 'like',
 5: 'love',
 6: 'should',
 7: 'smile',
 8: 'want',
 9: 'what',
 10: 'you',
 11: 'your'}

In [20]:
## 스팸을 나타내는 사전 만들기 (10개)
## 스팸이 아닌 사전 만들기 (10개)
## 광고!! 짱 좋은 판매 홍보전략입니다.!! => 스펨입니다.!!

In [39]:
spam_dict = ["advertise", "promotion", "sales", "hu", "special", "sale","member", "company"] #스팸차단 목록
ham_dict = ["order", "confirm", "check", "customer", "payment", "send", "general", "club"] # 스팸 차단 목록 아님

In [40]:
email = ['promotion !! hu good sales sale check payment']

In [None]:
#스팸인것처리

In [41]:
cv.fit(spam_dict)

CountVectorizer()

In [42]:
cv.vocabulary_

{'advertise': 0,
 'company': 1,
 'hu': 2,
 'member': 3,
 'promotion': 4,
 'sale': 5,
 'sales': 6,
 'special': 7}

In [43]:
result1=cv.transform(email).toarray()

In [None]:
#  스팸아닌거처리

In [44]:
cv.fit(ham_dict)

CountVectorizer()

In [45]:
cv.vocabulary_

{'check': 0,
 'club': 1,
 'confirm': 2,
 'customer': 3,
 'general': 4,
 'order': 5,
 'payment': 6,
 'send': 7}

In [46]:
result2=cv.transform(email).toarray()

In [47]:
## result1의 0이 아닌 갯수의합 , result2의 0이 아닌 갯수의합
result1_count =0
result2_count =0

In [48]:
for item in result1[0]:
  if(item !=0):
    result1_count = result1_count +1
result1_count


4

In [49]:
for item in result2[0]:
  if(item !=0):
    result2_count = result2_count +1
result2_count

2

In [50]:
spam_dict = ["광고", "홍보", "판매", "허경영", "특가", "세일","회원", "회사"]
ham_dict = ["주문", "확인", "검토", "고객", "결제", "배송", "일반", "동호회"]

In [51]:
email = ['광고 !! 홍보 판매 허경영 결제 주문 동호회']

In [52]:
cv.fit(spam_dict)
cv.vocabulary_
result3=cv.transform(email).toarray()

In [53]:
cv.fit(ham_dict)
cv.vocabulary_
result4=cv.transform(email).toarray()

In [54]:
result3_count =0
result4_count =0

In [55]:
for item in result3[0]:
  if(item !=0):
    result3_count = result3_count +1
result3_count


4

In [56]:
for item in result4[0]:
  if(item !=0):
    result4_count = result4_count +1
result4_count

3