# IMDB 영화평 감성분석

In [3]:
import numpy as np
import pandas as pd

In [4]:
import csv

# Load the labeled review data (tab-separated file).
# csv.QUOTE_NONE (numeric value 3) makes the parser treat double quotes as
# ordinary characters, so the quotes inside the review text are kept verbatim
# rather than being interpreted as field delimiters.
df = pd.read_csv("labeledTrainData.tsv", sep='\t', quoting=csv.QUOTE_NONE)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
# Summarize the frame: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [10]:
# Look at the first raw review (row label 0) before any cleaning.
df.loc[0, 'review']

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

### 전처리

In [5]:
# Replace the HTML line-break tag "<br />" with a single space.
# The .str accessor is applied directly to the object-dtype column.
# regex=False makes the pattern an explicit literal match, so the result does
# not depend on which default pandas uses for the `regex` argument.
df['review'] = df.review.str.replace('<br />', ' ', regex=False)

In [6]:
df.review[0]            # check that the <br /> tags have been removed

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.  Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.  The actual feature film bit when it finally starts is only on f

In [7]:
import re


def _letters_only(text):
    """Return `text` with every character outside A-Z / a-z replaced by a space."""
    return re.sub('[^A-Za-z]', ' ', text)


# Remove punctuation and digits: anything that is not an ASCII letter becomes
# a space, applied review by review.
df['review'] = df.review.apply(_letters_only)

In [8]:
df.review[0]   # punctuation and digits have all been removed

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for 

### 트레인 / 테스트 데이터셋 분리 및 텍스트 변환 

In [9]:
from sklearn.model_selection import train_test_split

# 75/25 hold-out split of the cleaned reviews, stratified on the sentiment
# label and seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.25,
    stratify=df['sentiment'],
    random_state=2021,
)

In [10]:
# Shapes of the four split pieces, in (X_train, X_test, y_train, y_test) order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((18750,), (6250,), (18750,), (6250,))

- CountVectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Count features over unigrams and bigrams with English stop words dropped.
# The vocabulary is fitted on the training reviews only; both splits are then
# transformed with that same fitted vectorizer.
cvect = CountVectorizer(ngram_range=(1, 2), stop_words='english').fit(X_train)
X_train_cv, X_test_cv = (cvect.transform(split) for split in (X_train, X_test))

In [14]:
# Shapes of the train and test count matrices.
tuple(matrix.shape for matrix in (X_train_cv, X_test_cv))

((18750, 1384106), (6250, 1384106))

In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
%time knn.fit(X_train_cv,y_train)

Wall time: 10.9 ms


KNeighborsClassifier()

In [16]:
from sklearn.metrics import accuracy_score

# Accuracy of the k-NN model on the held-out count features.
pred = knn.predict(X_test_cv)
accuracy_score(y_true=y_test, y_pred=pred)

0.54736

In [17]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=300)
%time lr.fit(X_train_cv, y_train)

Wall time: 36 s


LogisticRegression(max_iter=300)

In [18]:
# Accuracy of logistic regression on the held-out count features.
pred = lr.predict(X_test_cv)
accuracy_score(y_true=y_test, y_pred=pred)

0.8864

- TfidfVectorizer

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams with English stop words dropped.
# Fitted on the training reviews only, then used to transform both splits.
tvect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english').fit(X_train)
X_train_tf, X_test_tf = (tvect.transform(split) for split in (X_train, X_test))

In [20]:
lr = LogisticRegression(max_iter=300)
%time lr.fit(X_train_tf, y_train)

pred_tf = lr.predict(X_test_tf)
accuracy_score(y_test, pred_tf)

Wall time: 15.3 s


0.87904

### 모델 저장하고 불러오기

In [21]:
import joblib

In [22]:
import os

# joblib.dump writes to the given path but does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('model', exist_ok=True)

# Persist the fitted vectorizer and classifier as pickle (binary) files.
# The files are large: do not commit them to git, list them in .gitignore.
joblib.dump(tvect, 'model/imdb_tvect.pkl')
joblib.dump(lr, 'model/imdb_lr.pkl')

['model/imdb_lr.pkl']

In [23]:
# Remove the in-memory vectorizer and classifier names before reloading them
# from disk.
del tvect, lr

In [24]:
# Reload the vectorizer and the classifier from the pickle files written above.
# NOTE: unpickling executes code from the file; only load files you created.
new_tvect, new_lr = map(
    joblib.load,
    ('model/imdb_tvect.pkl', 'model/imdb_lr.pkl'),
)

In [25]:
# Check the reloaded objects: transform the held-out reviews with the reloaded
# vectorizer and score the reloaded classifier on them.
# The matrix holds TEST features, so it is named accordingly.
new_X_test_tf = new_tvect.transform(X_test)
new_pred = new_lr.predict(new_X_test_tf)
accuracy_score(y_test, new_pred)

0.87904

- Pipeline과 GridSearchCV를 이용한 하이퍼 파라미터 튜닝

In [26]:
from sklearn.pipeline import Pipeline

# Pipeline definition: TF-IDF features (unigrams + bigrams, English stop words
# removed) followed by a default logistic regression.
pipeline = Pipeline([
    ('tvect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr', LogisticRegression())
])

# Parameter grid, addressed as <step name>__<parameter>.
# The upper max_df candidate is 700: the recorded grid-search output in this
# notebook echoes param_grid with [100, 300, 700] and reports max_df=700 as the
# best value, so 700 is required for a fresh run to reproduce those results.
params = {
    'tvect__max_df': [100, 300, 700],
    'lr__C': [1, 10]
}

In [27]:
from sklearn.model_selection import GridSearchCV

grid_pipe = GridSearchCV(
    pipeline, param_grid=params, cv = 3,
    scoring='accuracy', verbose=1, n_jobs=-1 
)

%time grid_pipe.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Wall time: 2min 4s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tvect',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('lr', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'lr__C': [1, 10], 'tvect__max_df': [100, 300, 700]},
             scoring='accuracy', verbose=1)

In [30]:
# Best cross-validated score and the parameter combination that achieved it.
print(grid_pipe.best_score_, grid_pipe.best_params_, sep='\n')

# Hold-out accuracy of the best pipeline found by the search.
pred = grid_pipe.best_estimator_.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=pred))

0.8835733333333334
{'lr__C': 10, 'tvect__max_df': 700}
0.88112


In [32]:
joblib.dump(grid_pipe,'model/imdb_pipe.pkl') # save the model (the whole fitted GridSearchCV object)

['model/imdb_pipe.pkl']

- SVM

In [12]:
from sklearn.svm import SVC

In [22]:
# TF-IDF features (unigrams + bigrams, English stop words removed) feeding a
# support vector classifier with a fixed random seed.
pipeline_svm = Pipeline(steps=[
    ('tvect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('svm', SVC(random_state=2021)),
])

# Parameter grid: max_df candidates for the vectorizer, C candidates for the SVM.
params_svm = {
    'tvect__max_df': [100, 300, 500],
    'svm__C': [2, 6, 10],
}

In [23]:
grid_pipe_svm = GridSearchCV(
    pipeline_svm, param_grid=params_svm, cv = 3,
    scoring='accuracy', verbose=1, n_jobs=-1 
)

%time grid_pipe_svm.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [None]:
# A notebook cell only displays its LAST bare expression, so two bare
# expressions in a row would silently drop the first one. Print both values
# explicitly so the best score and the best parameters are both shown.
print(grid_pipe_svm.best_score_)
print(grid_pipe_svm.best_params_)

In [None]:
# Hold-out accuracy of the best SVM pipeline found by the search.
pred_svm = grid_pipe_svm.best_estimator_.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred_svm)