# IMDB 영화평 감성분석(이진 분류)

In [1]:
import csv
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# First attempt: load the tab-separated training file with pandas' default
# quote handling and preview the first three rows.
df = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
df.head(3)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...


In [3]:
# Re-read the file with quote processing disabled so that double quotes inside
# the fields are kept as ordinary characters. csv.QUOTE_NONE is the named
# constant for the value 3 that was previously hard-coded.
df = pd.read_csv('data/labeledTrainData.tsv', sep='\t', quoting=csv.QUOTE_NONE)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [4]:
# Summarize the columns, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [5]:
# Preview the first 1000 characters of the review stored under row label 0.
df.loc[0, 'review'][:1000]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

- 텍스트 전처리

In [7]:
# Replace the literal HTML line-break tag <br /> with a space.
# regex=False states explicitly that the pattern is plain text, so the result
# does not depend on the pandas version's default for `regex`.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [8]:
# Remove punctuation and digits: every character that is not an English
# letter becomes a space, then leading/trailing whitespace is trimmed.
# regex=True is required because the pattern is a character class; pandas
# versions whose default is regex=False would match it literally and leave
# the text unchanged (older versions emit a FutureWarning about this default).
df['review'] = df['review'].str.replace('[^A-Za-z]', ' ', regex=True).str.strip()
df.review[0][:1000]

  df.review = df.review.str.replace('[^A-Za-z]',' ').str.strip()


'With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for  

- Train/Test dataset 분리

In [9]:
from sklearn.model_selection import train_test_split

# Split reviews/labels, stratifying on the sentiment label; the fixed
# random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    stratify=df['sentiment'],
    random_state=2022,
)
y_train.value_counts()

0    9375
1    9375
Name: sentiment, dtype: int64

In [10]:
# Class counts of the test split.
y_test.value_counts()

0    3125
1    3125
Name: sentiment, dtype: int64

- CountVectorizer로 변환

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based vectorizer configured with stop_words='english'.
cvect = CountVectorizer(stop_words='english')

In [12]:
# Learn the vocabulary from the training reviews and build their
# document-term matrix in one pass. Calling fit() and then transform() on the
# same data processes the corpus twice to produce the same matrix.
X_train_cv = cvect.fit_transform(X_train)
X_train_cv.shape

(18750, 65213)

In [13]:
# Doing it this way is WRONG (kept as a counter-example): a second vectorizer
# fitted on the test set learns its own vocabulary, so the resulting matrix
# has a different number of columns than the training matrix.
cvect2 = CountVectorizer(stop_words='english')
X_test_cv = cvect2.fit_transform(X_test)
X_test_cv.shape

(6250, 41343)

In [14]:
def get_word(index, voca):
    """Return the first key of ``voca`` whose value equals ``index``.

    Args:
        index: Value to search for among the mapping's values.
        voca: Mapping from word to index.

    Returns:
        The first matching key in the mapping's iteration order, or ``None``
        when no value equals ``index``.
    """
    matches = (word for word, position in voca.items() if position == index)
    return next(matches, None)

# Look up index 100 in both vocabularies to compare which word each one
# assigns to the same column.
get_word(100, cvect.vocabulary_), get_word(100, cvect2.vocabulary_)

('abilityof', 'abos')

In [15]:
# The test dataset must be transformed with the same vectorizer that was
# fitted on the training dataset.
X_test_cv = cvect.transform(X_test)
X_test_cv.shape

(6250, 65213)

- 분류기 - LogisticRegression

In [18]:
from sklearn.linear_model import LogisticRegression

# Train the classifier on the training counts; fit() returns the estimator
# itself, so it can be bound directly and echoed as the cell output.
lr = LogisticRegression(max_iter=500, random_state=2022).fit(X_train_cv, y_train)
lr

LogisticRegression(max_iter=500, random_state=2022)

In [19]:
# Score the fitted classifier on the held-out test split.
lr.score(X_test_cv, y_test)

0.86576

- CountVectorizer + ngram_range=(1,2)

In [21]:
# Same vectorizer settings, but with ngram_range=(1,2) so that n-grams of
# length 1 and 2 are counted.
cvect12 = CountVectorizer(stop_words='english', ngram_range=(1,2))

In [22]:
# Fit the vocabulary first; %time then reports the wall time of the
# transform step only.
cvect12.fit(X_train)
%time X_train_cv12 = cvect12.transform(X_train)

Wall time: 13.7 s


In [23]:
# Transform the test reviews with the vectorizer fitted on the training set
# and show both matrix shapes side by side.
X_test_cv12 = cvect12.transform(X_test)
X_train_cv12.shape, X_test_cv12.shape

((18750, 1386558), (6250, 1386558))

In [24]:
# Train on the n-gram features and score on the test split; fit() returns the
# estimator itself, so construction and training are chained.
lr12 = LogisticRegression(max_iter=500, random_state=2022).fit(X_train_cv12, y_train)
lr12.score(X_test_cv12, y_test)

0.87968

- 모델 저장하고 불러오기

In [26]:
import joblib

In [27]:
# Save the fitted vectorizer and classifier.
# Create the output directory first so the dump calls do not fail when
# 'model/' is missing (e.g. on a fresh checkout).
Path('model').mkdir(parents=True, exist_ok=True)
joblib.dump(cvect12, 'model/imdb_cvect12.pkl')
joblib.dump(lr12, 'model/imdb_cvect_lr12.pkl')

['model/imdb_cvect_lr12.pkl']

In [28]:
# Remove the in-memory objects; they are loaded back from disk afterwards.
del cvect12, lr12

In [29]:
# Load the saved vectorizer and classifier back from disk.
# NOTE(review): joblib files are pickle-based — only load files from a
# trusted source.
cvect12 = joblib.load('model/imdb_cvect12.pkl')
lr12 = joblib.load('model/imdb_cvect_lr12.pkl')

- 실제 데이터로 검증

In [31]:
# Sample review text used to try out the reloaded vectorizer and classifier;
# the trailing bare expression echoes the raw string.
review = '''
\nWhat cinemas were made for. I wasn't expecting something quite as amazing as this, this was two and a half hours of incredible entertainment, drama, laughs, tears and action galore, there truly was something for everyone here.\nThis movie really has ignited my love for the franchise once again, and I am truly excited for what comes next.\n\nNostalgia, there seems to be a real thirst for it theses days, and there is a lot of it to enjoy here. There are plenty of moments throughout that will have you speechless, they really do break all of the rules here.\n\nThis is one incredibly good looking movie, dazzling special effects, dizzying action scenes, two and a half hours will literally fly by.\n\nIncredible, 10/10.\n
'''
review

"\n\nWhat cinemas were made for. I wasn't expecting something quite as amazing as this, this was two and a half hours of incredible entertainment, drama, laughs, tears and action galore, there truly was something for everyone here.\nThis movie really has ignited my love for the franchise once again, and I am truly excited for what comes next.\n\nNostalgia, there seems to be a real thirst for it theses days, and there is a lot of it to enjoy here. There are plenty of moments throughout that will have you speechless, they really do break all of the rules here.\n\nThis is one incredibly good looking movie, dazzling special effects, dizzying action scenes, two and a half hours will literally fly by.\n\nIncredible, 10/10.\n\n"

In [34]:
# Python's built-in str.replace does not support regular expressions: the
# pattern is matched literally, so the text comes back unchanged here.
review.replace('[^A-Za-z]',' ')

"\n\nWhat cinemas were made for. I wasn't expecting something quite as amazing as this, this was two and a half hours of incredible entertainment, drama, laughs, tears and action galore, there truly was something for everyone here.\nThis movie really has ignited my love for the franchise once again, and I am truly excited for what comes next.\n\nNostalgia, there seems to be a real thirst for it theses days, and there is a lot of it to enjoy here. There are plenty of moments throughout that will have you speechless, they really do break all of the rules here.\n\nThis is one incredibly good looking movie, dazzling special effects, dizzying action scenes, two and a half hours will literally fly by.\n\nIncredible, 10/10.\n\n"

In [32]:
# Text preprocessing: every character other than an English letter is turned
# into a space, then the ends are trimmed.
import re               # regular expressions
clean_review = re.compile('[^A-Za-z]').sub(' ', review).strip()

In [33]:
# Show the cleaned review text.
clean_review

'What cinemas were made for  I wasn t expecting something quite as amazing as this  this was two and a half hours of incredible entertainment  drama  laughs  tears and action galore  there truly was something for everyone here  This movie really has ignited my love for the franchise once again  and I am truly excited for what comes next   Nostalgia  there seems to be a real thirst for it theses days  and there is a lot of it to enjoy here  There are plenty of moments throughout that will have you speechless  they really do break all of the rules here   This is one incredibly good looking movie  dazzling special effects  dizzying action scenes  two and a half hours will literally fly by   Incredible'

In [35]:
# Vectorize the single cleaned review; transform() is given a list holding
# the one document.
review_cv = cvect12.transform([clean_review])

In [36]:
# One row, with the same number of columns as the training matrix.
review_cv.shape

(1, 1386558)

In [37]:
# Predict the sentiment label for the review.
# NOTE(review): presumably 1 = positive and 0 = negative — confirm against
# the dataset description.
lr12.predict(review_cv)

array([1], dtype=int64)