# NLP Basic Assignment
## 과제 : spam.csv를 활용하여 유의미한 해석을 도출해주세요!

In [1]:
import pandas as pd

## Load Data
- 보시면 아시다시피 spam.csv는 라벨이 있는 데이터입니다.
- 7주차 주제가 텍스트 기초인만큼 텍스트만 활용하셔도 되고 라벨까지 활용하셔서 모델을 돌려보셔도 좋습니다.

In [2]:
# Mount Google Drive at /content/drive so the dataset stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the dataset from the mounted Drive folder.
# Later cells use column 'v1' as the spam/ham label and 'v2' as the message text.
spam = pd.read_csv('/content/drive/MyDrive/대외활동/동아리/투빅스/7주차/spam.csv')

In [4]:
# Peek at the raw text of the message at row position 5, before any cleaning.
spam['v2'].iloc[5]

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"

## Tokenizing


In [5]:
import nltk

In [6]:
import re
def remove(x) :
    """Return *x* with non-letter characters blanked out and letters lowercased.

    Every character outside ASCII ``a-z`` / ``A-Z`` (digits, punctuation,
    whitespace, non-ASCII symbols) is replaced by a single space; the
    remaining letters are then converted to lowercase.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', x)  # blank out everything that is not a letter
    return letters_only.lower()

# Clean every message text in place with remove().
spam['v2'] = spam['v2'].apply(remove)

알파벳이 아닌 문자(특수문자, 숫자 등)는 공백으로 바꾸고, 대문자는 소문자로 변환

In [7]:
# Example code: tokenizer and stop-word resources used by the tokenizing step.
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
# Recent NLTK releases load word_tokenize's Punkt data from the 'punkt_tab'
# package rather than 'punkt'; fetch both so the notebook runs on either.
nltk.download('punkt_tab')
nltk.download('stopwords')
# English stop words, kept as a list for the filtering step.
english_stops = list(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Build the lookup set once: membership tests on a set are O(1), whereas
# testing against the english_stops list scans it for every single token.
stop_set = set(english_stops)

# Tokenize each cleaned message and drop English stop words.
token = [
    [word for word in word_tokenize(sentence) if word not in stop_set]
    for sentence in spam.v2
]

# Replace the text column with the per-message token lists.
spam.v2 = token

In [9]:
# Inspect the tokenized messages.
spam['v2']

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, wkly, comp, win, fa, cup, final,...
3           [u, dun, say, early, hor, u, c, already, say]
4          [nah, think, goes, usf, lives, around, though]
                              ...                        
5567    [nd, time, tried, contact, u, u, pound, prize,...
5568                      [b, going, esplanade, fr, home]
5569                            [pity, mood, suggestions]
5570    [guy, bitching, acted, like, interested, buyin...
5571                                   [rofl, true, name]
Name: v2, Length: 5572, dtype: object

## Embedding

- 수업에서 다룬 임베딩 방법에는 One-hot encoding, CBOW, Skip-gram 등이 있었습니다. 다양한 시도와 '비교' 결과를 함께 적어주세요! 파라미터를 조정해가는 과정도 해석에 도움이 될 수 있겠죠 :)

In [10]:
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer

cv2 = CountVectorizer(max_features= 500, stop_words = 'english', lowercase= False)
# Flatten the token lists of all spam messages into one list of words.
# chain.from_iterable is linear; sum(lists, []) re-copies the growing list
# on every addition and is quadratic in the number of tokens.
corpus_spam = list(chain.from_iterable(spam[spam.v1 == 'spam']['v2']))
# Each word is passed as its own one-token document, so the column sums
# below are corpus-wide word frequencies.
cv_spam = cv2.fit_transform(corpus_spam)

count_spam = pd.DataFrame( {'word' : cv2.get_feature_names_out(),
                           'count' : cv_spam.sum(axis = 0).flat})
# Show the ten most frequent words in spam messages.
count_spam.sort_values('count', ascending = False).head(10)

Unnamed: 0,word,count
147,free,228
436,txt,170
446,ur,144
252,mobile,129
411,text,126
392,stop,126
65,claim,113
343,reply,104
486,www,98
320,prize,93


In [16]:
import gensim
from gensim.models import Word2Vec

# CBOW (sg=0) embedding trained only on the tokenized spam messages.
model = Word2Vec(spam[spam['v1'] == 'spam']['v2'], min_count=5, vector_size=100, window=10, epochs=200, sg=0)
# Query the ten nearest neighbours of "free" once and reuse the result for
# both the printout and x1 instead of running the same query twice.
x1 = model.wv.most_similar(positive=["free"], topn=10)
print(x1)

[('latest', 0.4819677770137787), ('gr', 0.40249165892601013), ('keep', 0.36312538385391235), ('subscription', 0.3610702157020569), ('plus', 0.3576658368110657), ('sport', 0.33628034591674805), ('mths', 0.33177369832992554), ('get', 0.3299316465854645), ('minutes', 0.3289753794670105), ('ringtone', 0.3228863477706909)]


In [17]:
# CBOW (sg=0) embedding trained only on the tokenized ham messages.
model = Word2Vec(spam[spam['v1'] == 'ham']['v2'], min_count=5, vector_size=100, window=10, epochs=200, sg=0)
# Query the ten nearest neighbours of "free" once and reuse the result for
# both the printout and x2 instead of running the same query twice.
x2 = model.wv.most_similar(positive=["free"], topn=10)
print(x2)

[('asks', 0.3223883807659149), ('coz', 0.30209606885910034), ('customer', 0.2760140597820282), ('ready', 0.26069870591163635), ('yahoo', 0.24567151069641113), ('lovable', 0.24316731095314026), ('reach', 0.24032795429229736), ('yoga', 0.23669008910655975), ('com', 0.23275741934776306), ('simple', 0.22920528054237366)]


In [18]:
# Skip-gram (sg=1) embedding trained only on the tokenized spam messages.
model = Word2Vec(spam[spam['v1'] == 'spam']['v2'], min_count=5, vector_size=100, window=10, epochs=200, sg=1)
# Query the ten nearest neighbours of "free" once and reuse the result for
# both the printout and x3 instead of running the same query twice.
x3 = model.wv.most_similar(positive=["free"], topn=10)
print(x3)

[('get', 0.3978957235813141), ('plus', 0.3810008466243744), ('price', 0.3728260397911072), ('tariffs', 0.36339619755744934), ('stoptxt', 0.3602801561355591), ('december', 0.3463535010814667), ('update', 0.3440200090408325), ('mths', 0.337628036737442), ('texts', 0.32771432399749756), ('half', 0.3209421932697296)]


In [19]:
# Skip-gram (sg=1) embedding trained only on the tokenized ham messages.
model = Word2Vec(spam[spam['v1'] == 'ham']['v2'], min_count=5, vector_size=100, window=10, epochs=200, sg=1)
# Query the ten nearest neighbours of "free" once and reuse the result for
# both the printout and x4 instead of running the same query twice.
x4 = model.wv.most_similar(positive=["free"], topn=10)
print(x4)

[('boost', 0.42181962728500366), ('places', 0.39310282468795776), ('hv', 0.3930509686470032), ('fren', 0.38112595677375793), ('workin', 0.36623895168304443), ('pilates', 0.36611199378967285), ('discuss', 0.3642219305038452), ('spoke', 0.3578820526599884), ('kb', 0.3569021224975586), ('power', 0.3523258566856384)]


## 본인이 도출해낸 해석을 적어주세요!

- 유사도, Wordcloud, 이진 분류 모델, Plot 뭐든 상관없으니 분명하고 인상적인 해석을 적어주시면 됩니다.

In [20]:
# Side-by-side table of the "free" neighbours from the two spam models.
data = {'cbow_spam' : x1, 'skipgram_spam' : x3}
# orient='index' builds one row per model; transposing turns the models into columns.
result = pd.DataFrame.from_dict(data, orient='index').T
result

Unnamed: 0,cbow_spam,skipgram_spam
0,"(latest, 0.4819677770137787)","(get, 0.3978957235813141)"
1,"(gr, 0.40249165892601013)","(plus, 0.3810008466243744)"
2,"(keep, 0.36312538385391235)","(price, 0.3728260397911072)"
3,"(subscription, 0.3610702157020569)","(tariffs, 0.36339619755744934)"
4,"(plus, 0.3576658368110657)","(stoptxt, 0.3602801561355591)"
5,"(sport, 0.33628034591674805)","(december, 0.3463535010814667)"
6,"(mths, 0.33177369832992554)","(update, 0.3440200090408325)"
7,"(get, 0.3299316465854645)","(mths, 0.337628036737442)"
8,"(minutes, 0.3289753794670105)","(texts, 0.32771432399749756)"
9,"(ringtone, 0.3228863477706909)","(half, 0.3209421932697296)"


In [21]:
# Side-by-side table of the "free" neighbours from the two ham models.
data = {'cbow_ham' : x2, 'skipgram_ham' : x4}
# orient='index' builds one row per model; transposing turns the models into columns.
result = pd.DataFrame.from_dict(data, orient='index').T
result

Unnamed: 0,cbow_ham,skipgram_ham
0,"(asks, 0.3223883807659149)","(boost, 0.42181962728500366)"
1,"(coz, 0.30209606885910034)","(places, 0.39310282468795776)"
2,"(customer, 0.2760140597820282)","(hv, 0.3930509686470032)"
3,"(ready, 0.26069870591163635)","(fren, 0.38112595677375793)"
4,"(yahoo, 0.24567151069641113)","(workin, 0.36623895168304443)"
5,"(lovable, 0.24316731095314026)","(pilates, 0.36611199378967285)"
6,"(reach, 0.24032795429229736)","(discuss, 0.3642219305038452)"
7,"(yoga, 0.23669008910655975)","(spoke, 0.3578820526599884)"
8,"(com, 0.23275741934776306)","(kb, 0.3569021224975586)"
9,"(simple, 0.22920528054237366)","(power, 0.3523258566856384)"


- spam에서 가장 빈도수가 높은 단어인 free에 대해 단어들의 유사도를 파악한 결과, 가장 높은 값도 약 0.48에 그쳐 생각보다 유사도가 높지 않음
- spam의 경우에는 시간, 가격, 혜택과 관련된 얘기가 많다면, ham의 경우에는 홍보보다는 설명하는 느낌의 단어들이 많음