In [1]:
import pandas as pd

In [3]:
# 데이터 읽기
df = pd.read_csv('data-files/SMSSpamCollection', 
                 sep='\t', header=None, names=['label', 'message'])

In [4]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# 타겟 문자열을 숫자 데이터로 변환
df['label'] = df.label.map( {'ham': 0, 'spam': 1} )

In [14]:
# (문제 1-1) 모든 문자를 소문자로 전환
df['message']=df['message'].str.lower()
df

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,1,this is the 2nd time we have tried 2 contact u...
5568,0,will ü b going to esplanade fr home?
5569,0,"pity, * was in mood for that. so...any other s..."
5570,0,the guy did some bitching but i acted like i'd...


In [31]:
# (문제 1-2) 특수 문자 제거
df['message']=df['message'].str.replace('[^A-Za-z0-9\s]+','')
df

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,1,this is the 2nd time we have tried 2 contact u...
5568,0,will b going to esplanade fr home
5569,0,pity was in mood for that soany other suggest...
5570,0,the guy did some bitching but i acted like id ...


In [32]:
df.head()

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [33]:
# !pip install nltk

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hoseo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [34]:
df['message'] = df['message'].apply(nltk.word_tokenize)

In [35]:
df.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,"[dun, say, so, early, hor, u, c, already, then..."
4,0,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [125]:
# (문제 1-3) 영문자 기준 불용어 제거

from nltk.corpus import stopwords


nltk.download('stopwords')

# 여기에 코드 작성

stopwords.words('english')[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hoseo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [42]:
df.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,"[dun, say, so, early, hor, u, c, already, then..."
4,0,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [61]:
# (문제 1-4) 어근 추출

from nltk.stem import PorterStemmer

# 여기에 코드 작성

stemmer = PorterStemmer()
df['message']=df['message'].apply(lambda words: [stemmer.stem(word) for word in words])
df['message']

0       [go, until, jurong, point, crazi, avail, onli,...
1                            [ok, lar, joke, wif, u, oni]
2       [free, entri, in, 2, a, wkli, comp, to, win, f...
3       [dun, say, so, earli, hor, u, c, alreadi, then...
4       [nah, i, dont, think, he, goe, to, usf, he, li...
                              ...                        
5567    [thi, is, the, 2nd, time, we, have, tri, 2, co...
5568                [will, b, go, to, esplanad, fr, home]
5569    [piti, wa, in, mood, for, that, soani, other, ...
5570    [the, guy, did, some, bitch, but, i, act, like...
5571                       [rofl, it, true, to, it, name]
Name: message, Length: 5572, dtype: object

In [62]:
df.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazi, avail, onli,..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,0,"[dun, say, so, earli, hor, u, c, alreadi, then..."
4,0,"[nah, i, dont, think, he, goe, to, usf, he, li..."


In [63]:
df['message'] = df['message'].apply(lambda x: ' '.join(x))
df.head()

Unnamed: 0,label,message
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
2,1,free entri in 2 a wkli comp to win fa cup fina...
3,0,dun say so earli hor u c alreadi then say
4,0,nah i dont think he goe to usf he live around ...


In [76]:
# (문제 2-1) 단순 빈도 기반 인코딩

from sklearn.feature_extraction.text import CountVectorizer

# 여기에 코드 작성

cv=CountVectorizer()
cv.fit(df['message'])
counts = cv.transform(df['message'])
print(counts.shape)    #5572행의 총 단어가 8169개 
# counts.toarray()     
print(counts)

(5572, 8150)
  (0, 1127)	1
  (0, 1322)	1
  (0, 1730)	1
  (0, 1732)	1
  (0, 2011)	1
  (0, 2230)	1
  (0, 3318)	1
  (0, 3370)	1
  (0, 3407)	1
  (0, 3854)	1
  (0, 4110)	1
  (0, 4255)	1
  (0, 5275)	1
  (0, 5618)	1
  (0, 7112)	1
  (0, 7479)	1
  (0, 7697)	1
  (0, 7907)	1
  (1, 4076)	1
  (1, 4290)	1
  (1, 5240)	1
  (1, 5272)	1
  (1, 7817)	1
  (2, 70)	1
  (2, 427)	1
  :	:
  (5570, 1759)	1
  (5570, 2474)	1
  (5570, 2742)	1
  (5570, 3087)	1
  (5570, 3130)	1
  (5570, 3237)	1
  (5570, 3459)	1
  (5570, 3541)	1
  (5570, 3805)	1
  (5570, 3854)	1
  (5570, 3922)	1
  (5570, 3969)	1
  (5570, 4378)	1
  (5570, 5032)	1
  (5570, 6569)	1
  (5570, 6578)	1
  (5570, 7091)	1
  (5570, 7218)	1
  (5570, 7516)	1
  (5570, 7736)	1
  (5571, 3969)	2
  (5571, 4954)	1
  (5571, 6097)	1
  (5571, 7218)	1
  (5571, 7348)	1


In [77]:
# print(counts)
counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [79]:
# (문제 2-2) 문서 빈도를 고려한 인코딩

from sklearn.feature_extraction.text import TfidfTransformer

# 여기에 코드 작성

tf=TfidfTransformer()
tf

tf.fit_transform(counts).toarray()
tf.idf_

array([8.5270765 , 8.93254161, 8.93254161, ..., 8.93254161, 8.93254161,
       8.93254161])

In [80]:
# 훈련 데이터 / 테스트 데이터 분할
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1, random_state=69) 

In [89]:
# (문제 3-1) NaiveBayes 알고리즘을 적용해서 문서 분류 예측 모델 만들기

from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

# 여기에 코드 작성

nb = MultinomialNB()
nb.fit(X_train,y_train)

predicted = nb.predict(X_test)

In [90]:
# (문제 3-2) 훈련 데이터와 테스트 데이터에 대해 모델의 예측 정확도를 평가하세요
import numpy as np

print(nb.score(X_train, y_train), nb.score(X_test, y_test))

0.9898284802552852 0.982078853046595


In [91]:
# 혼돈 행렬 표시

from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, predicted))

[[478   4]
 [  6  70]]


In [99]:
import pandas as pd
import numpy as np

In [100]:
metadata_clean = pd.read_csv('data-files/metadata_clean.csv')
metadata_clean.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995


In [102]:
movies_metadata = pd.read_csv('data-files/movies_metadata.csv', low_memory=False)

movies_metadata.head()
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [103]:
# (문제 4) metadata_clean의 모든 컬럼과 movies_metadata의 id, overview 컬럼을 결합하세요

metadata_clean['overview'] = movies_metadata['overview']
metadata_clean['id'] = movies_metadata['id']
metadata_clean['overview'] = metadata_clean['overview'].fillna('')

In [104]:
metadata_clean

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862
...,...,...,...,...,...,...,...,...
45461,Subdue,"['drama', 'family']",90.0,4.0,1.0,0,Rising and falling between a man and woman.,439050
45462,Century of Birthing,['drama'],360.0,9.0,3.0,2011,An artist struggles to finish his work while a...,111109
45463,Betrayal,"['action', 'drama', 'thriller']",90.0,3.8,6.0,2003,"When one of her hits goes wrong, a professiona...",67758
45464,Satan Triumphant,[],87.0,0.0,0.0,1917,"In a small town live two brothers, one a minis...",227506
