In [None]:
# 50,000개의 영화 평론 데이터를 불러옴.
# https://ai.stanford.edu/~amaas/data/sentiment/에서 Internet movie database를 받은 후
# 아래의 프로그램을 수행함.
import tarfile
tar = tarfile.open('C://Users//pupub//Desktop//python prog//aclImdb_v1.tar.gz','r:gz') # gz파일이 저장된 경로로 변경
tar.extractall()

In [None]:
# 자료를 행렬 형태로 바꿈.
import pyprind
import pandas as pd
import os
basepath = 'C://Users//pupub//Desktop//aclImdb'
labels = {'pos':1, 'neg':0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']    

In [None]:
# 자료의 순서를 임의로 뒤섞어 csv 파일로 저장. 
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('C://Users//pupub//Desktop//movie_data.csv', index = False, encoding='utf-8')

In [1]:
# 저장된 파일 불러와 확인함.
import pyprind
import pandas as pd
import os
df = pd.read_csv('C://Users//pupub//Desktop//movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [2]:
# 정보를 가지지 않은 것으로 판단 되는 것을 사전에 정리
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # text에서 <[^>]*>과 일치하는 데이터를 공백으로 바꾸는 명령어
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
    text)
    text = (re.sub('[\W]+', ' ', text.lower()) # 단어가 아닌 모든 기호는 공백으로 대체
    +' '.join(emoticons).replace('-', '')) # emoticons을 빈공간 뒤에 배치
    return text
df['review'] = df['review'].apply(preprocessor)
df['review']

0        in 1974 the teenager martha moxley maggie grac...
1        ok so i really like kris kristofferson and his...
2         spoiler do not read this if you think about w...
3        hi for all the people who have seen this wonde...
4        i recently bought the dvd forgetting just how ...
5        leave it to braik to put on a good show finall...
6        nathan detroit frank sinatra is the manager of...
7        to understand crash course in the right contex...
8        i ve been impressed with chavez s stance again...
9        this movie is directed by renny harlin the fin...
10       i once lived in the u p and let me tell you wh...
11       hidden frontier is notable for being the longe...
12       it s a while ago that i have seen sleuth 1972 ...
13       what is it about the french first they apparen...
14       this very strange movie is unlike anything mad...
15       i saw this movie on the strength of the single...
16       there are some great philosophical questions w.

In [3]:
# 단어를 분류
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [4]:
# 단어의 원뿌리로 재표현
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [5]:
# 정보가 없는 단어 제거
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs alot')[-10:] if w not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pupub\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'alot']

In [None]:
# 영화에 대한 평가를 positive or negative로 분류
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values 
y_test = df.loc[25000:, 'sentiment'].values

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)],'vect__stop_words': [stop, None],'vect__tokenizer': [tokenizer,tokenizer_porter],'clf__penalty':['l1', 'l2'],'clf__C': [1.0, 10.0, 100.0]},
               {'vect__ngram_range': [(1,1)],'vect__stop_words': [stop, None],'vect__tokenizer': [tokenizer, tokenizer_porter],'vect__use_idf':[False],
                'vect__norm':[None], 'clf__penalty': ['l1', 'l2'],'clf__C': [1.0, 10.0, 100.0]}]
lr_tfidf = Pipeline([('vect', tfidf),('clf', LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,scoring='accuracy',cv=5, verbose=1,n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train) 

In [None]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

In [None]:
# 검증데이터의 정밀도
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

In [None]:
# 초모수 선택 후 시험데이터에 적용한 결과
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
# out-of-core learning을 영화리뷰 데이터에 적용

# csv 형태인 자료가 online 자료라고 가정한 후 
# 자료의 cleaning과 bag-of-words 모형을 조합해 문서를 특성변수화함.
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [None]:
# 
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
           text, label = line[:-3], int(line[-2])
           yield text, label

In [None]:
#
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [None]:
#
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',n_features=2**21,preprocessor=None,tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='C://Users//pupub//Desktop//movie_data.csv')

In [None]:
#
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
#
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
#
clf = clf.partial_fit(X_test, y_test)

In [6]:
# Latent Dirichlet Allocation
from sklearn.feature_extraction.text import CountVectorizer
count=CountVectorizer(stop_words='english', max_df=.1, max_features=5000)#전체 단어중 비중이10%이상이면 제거
X=count.fit_transform(df['review'].values)

from sklearn.decomposition import LatentDirichletAllocation
lda=LatentDirichletAllocation(n_components=10, random_state=123, learning_method='batch')
X_topics=lda.fit_transform(X)
lda.components_.shape

(10, 5000)

In [7]:
n_top_words=5
feature_names=count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx+1))
    print(" ".join([feature_names[i]
                  for i in topic.argsort()[:-n_top_words -1:-1]]))

Topic 1:
horror original comedy black house
Topic 2:
worst minutes guy script money
Topic 3:
book dvd read version watched
Topic 4:
family performance father beautiful mother
Topic 5:
series episode tv kids comedy
Topic 6:
murder police wife john plays
Topic 7:
documentary camera effects audience sense
Topic 8:
music song songs musical role
Topic 9:
horror effects guy dead budget
Topic 10:
action war game fight american


In [8]:
horror=X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx+1))
    print(df['review'][movie_idx][:300],'...') # df에서 열이름이 review인 자료의 movie_idx 행에서 0~299 문자. 


Horror movie #1:
 spoilers extremely brutal police drama set in san francisco involving a sting operation that goes terribly wrong a cop det falon sam elliott mistakenly and savagely beats to death an undercover policeman winch mike watson thinking that he murdered his partner det sam levinson mike burstyn a partner ...

Horror movie #2:
this first rate western tale of the gold rush brings great excitement romance and james stewart to the screen the far country is the only one out of all five stewart mann westerns that is often overlooked stewart yet again puts a new look on the ever present personalities he had in the five stewart  ...

Horror movie #3:
the fourth of five westerns anthony mann did with james stewart this one involves a hard bitten cattleman named jeff webster who takes a cattle drive from wyoming to alaska via seattle he hooks up in seattle with his partners ben tatum walter brennan and rube morris jay c flippen that he has sent ah ...
