# IMDB 영화평 감성 분석

In [1]:
import re
import joblib
import numpy as np 
import pandas as pd 

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [4]:
df = pd.read_csv('../../../Machine-Learning/00.data/IMDB/labeledTrainData.tsv',
                 header=0, sep='\t', quoting=3)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [6]:
# <br /> 태그는 공백으로 변환
df['review'] = df.review.str.replace('<br />', ' ')

In [7]:
# 영어 이외의 문자는 공백으로 변환
df['review'] = df.review.apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

In [8]:
df.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [9]:
df[df.review == ''].count()

id           0
sentiment    0
review       0
dtype: int64

In [10]:
feature_df = df.drop(['id', 'sentiment'], axis=1, inplace=False)
X_train, X_test, y_train, y_test = train_test_split(
    feature_df, df.sentiment, test_size=0.25, random_state=2021
)
X_train.shape, X_test.shape

((18750, 1), (6250, 1))

In [12]:
X_train[:3]

Unnamed: 0,review
14475,There s no shortage of bad dialogue in David ...
22605,This film takes what could have been a good i...
17673,Bob Clampett s Porky s Poor Fish is a so so...


In [13]:
X_test[:3]

Unnamed: 0,review
13895,I was expecting a B Movie French musical Aft...
20903,Disappearance is about a couple who take thei...
8539,I noticed at once that this movie really wasn...


In [14]:
df_test = pd.DataFrame(X_test, columns=['review'])
df_test['sentiment'] = y_test
df_test.to_csv('../static/data/IMDB_test.csv', index=False)

In [15]:
df_test = pd.read_csv('../static/data/IMDB_test.csv')
df_test.head(3)

Unnamed: 0,review,sentiment
0,I was expecting a B Movie French musical Aft...,0
1,Disappearance is about a couple who take thei...,0
2,I noticed at once that this movie really wasn...,1


### Case 1. CountVectorizer + LogisticRegression

In [15]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_clf', LogisticRegression())
])
params = ({
    'count_vect__max_df': [100, 300, 500],
    'lr_clf__C': [1, 5, 10]
})

In [18]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  5.2min finished
{'count_vect__max_df': 500, 'lr_clf__C': 1} 0.8667199999999999


In [21]:
best_count_lr = grid_pipe.best_estimator_
pred_count_lr = best_count_lr.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_count_lr)

0.87344

In [22]:
joblib.dump(best_count_lr, '../static/model/imdb_count_lr.pkl')

['../static/model/imdb_count_lr.pkl']

### Case 2. CountVectorizer + SupportVectorMachine

In [13]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('sv_clf', SVC())
])
params = ({
    'count_vect__max_df': [100, 300, 500],
    'sv_clf__C': [0.1, 1, 10]
})

In [14]:
%time pipeline.fit(X_train.review, y_train)

Wall time: 20min 22s


Pipeline(steps=[('count_vect',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('sv_clf', SVC())])

In [16]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [17]:
best_count_sv = pipeline
pred_count_sv = best_count_sv.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_count_sv)

0.87312

In [26]:
joblib.dump(best_count_sv, '../static/model/imdb_count_sv.pkl')

['../static/model/imdb_count_sv.pkl']

### Case 3. TfidfVectorizer + LogisticRegression

In [23]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_clf', LogisticRegression())
])
params = ({
    'tfidf_vect__max_df': [100, 300, 500],
    'lr_clf__C': [0.1, 1, 10]
})

In [20]:
%time pipeline.fit(X_train.review, y_train)

Wall time: 40.4 s


Pipeline(steps=[('tfidf_vect',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('lr_clf', LogisticRegression())])

In [24]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  9.5min finished
{'lr_clf__C': 10, 'tfidf_vect__max_df': 500} 0.8776533333333333


In [25]:
best_tfidf_lr = grid_pipe.best_estimator_
pred_tfidf_lr = best_tfidf_lr.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_tfidf_lr)

0.88144

In [27]:
joblib.dump(best_tfidf_lr, '../static/model/imdb_tfidf_lr.pkl')

['../static/model/imdb_tfidf_lr.pkl']

### Case 4. TfidfVectorizer + SupportVectorMachine

In [28]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('sv_clf', SVC())
])
params = ({
    'tfidf_vect__max_df': [100, 300, 500],
    'sv_clf__C': [0.1, 1, 10]
})

In [29]:
%time pipeline.fit(X_train.review, y_train)

Wall time: 25min 8s


Pipeline(steps=[('tfidf_vect',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('sv_clf', SVC())])

In [None]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

In [30]:
best_tfidf_sv = pipeline
pred_tfidf_sv = best_tfidf_sv.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_tfidf_sv)

0.8832

In [31]:
joblib.dump(best_tfidf_sv, '../static/model/imdb_tfidf_sv.pkl')

['../static/model/imdb_tfidf_sv.pkl']

### 테스트

In [16]:
index = 1000

In [23]:
test_data = []
test_data.append(df_test.iloc[index, 0])

In [19]:
review_string = '''Really enjoyed this series. One reviewer gave a low rating mentioning how the first episode showed her using pills in the orphanage....yes they had to as it shapes the rest of her future. The characters are great and the acting on is superb. Kept me hooked!'''

In [29]:
test_data = []
test_data.append(review_string)

In [30]:
test_data

['Really enjoyed this series. One reviewer gave a low rating mentioning how the first episode showed her using pills in the orphanage....yes they had to as it shapes the rest of her future. The characters are great and the acting on is superb. Kept me hooked!']

In [22]:
label = df_test.sentiment[index]
label

1

In [24]:
model_cl = joblib.load('../static/model/imdb_count_lr.pkl')
model_cs = joblib.load('../static/model/imdb_count_sv.pkl')
model_tl = joblib.load('../static/model/imdb_tfidf_lr.pkl')
model_ts = joblib.load('../static/model/imdb_tfidf_sv.pkl')

In [31]:
pred_cl = model_cl.predict(test_data)

In [32]:
pred_cl[0]

1

In [33]:
pred_cl = model_cl.predict(test_data)
pred_cs = model_cs.predict(test_data)
pred_tl = model_tl.predict(test_data)
pred_ts = model_ts.predict(test_data)

In [34]:
pred_cl[0], pred_cs[0], pred_tl[0], pred_ts[0]

(1, 1, 1, 1)