In [1]:
import re
import joblib
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [4]:
df = pd.read_csv('../../../machine-Learning/00.data/IMDB/labeledTrainData.tsv',
                 header=0, sep='\t', quoting=3)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [5]:
# <br /> 태그는 공백으로 변환
df['review'] = df.review.str.replace('<br />', ' ')
# 영어 이외의 문자는 공백으로 변환
df['review'] = df.review.apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
df.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [6]:
feature_df = df.drop(['id', 'sentiment'], axis=1, inplace=False)
X_train, X_test, y_train, y_test = train_test_split(
    feature_df, df.sentiment, test_size=0.3, random_state=2021
)
X_train.shape, X_test.shape

((17500, 1), (7500, 1))

In [7]:
X_train[:3]

Unnamed: 0,review
1156,This film is deeply disappointing Not only t...
14991,As a fan of Notorious B I G I was looking f...
8157,This cult movie has two crazed lesbians ...


In [8]:
X_test[:3]

Unnamed: 0,review
13895,I was expecting a B Movie French musical Aft...
20903,Disappearance is about a couple who take thei...
8539,I noticed at once that this movie really wasn...


In [2]:
df_test = pd.read_csv('../static/data/imdb_test1.csv')
df_test.head(10)

Unnamed: 0,review,sentiment
0,My girlfriend and I were stunned by how bad t...,0
1,What do you expect when there is no script to...,0
2,This is a German film from that is somet...,0
3,Richard Tyler is a little boy who is scared o...,0
4,I run a group to stop comedian exploitation a...,0
5,I can watch a good gory film now and then I ...,0
6,I should admit first I am a huge fan of The D...,1
7,I got to see this film at a preview and was d...,1
8,Homeward Bound The Incredible Journey is...,1
9,Still Crazy has been compared to the Spinal T...,1


Case 1. CountVectorizer + LogisticRegression

In [9]:
pipeline1 = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_clf', LogisticRegression())
])
params = ({
    'count_vect__max_df': [100, 300, 500],
    'lr_clf__C': [1, 5, 10]
})

In [11]:
grid_pipe = GridSearchCV(pipeline1, param_grid=params, cv=3,
                         scoring='accuracy')
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

{'count_vect__max_df': 500, 'lr_clf__C': 1} 0.865257298702979


In [12]:
best_count_lr = grid_pipe.best_estimator_
pred_count_lr = best_count_lr.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_count_lr)

0.9654666666666667

In [13]:
joblib.dump(best_count_lr, '../static/model/pipeline_cl.pkl')

['../static/model/pipeline_cl.pkl']

Case 2. TfidfVectorizer + LogisticRegression

In [14]:
pipeline2 = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english',ngram_range=(1,2))),
    ('lr_clf', LogisticRegression(C=10))
])

params = {
    'tfidf_vect__ngram_range': [(1,1), (1,2)],
    'tfidf_vect__max_df': [300, 700],
    'lr_clf__C': [1, 10]
}

In [15]:

%time pipeline2.fit(X_train.review, y_train)

Wall time: 36 s


Pipeline(steps=[('tfidf_vect',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('lr_clf', LogisticRegression(C=10))])

In [16]:
grid_pipe = GridSearchCV(pipeline2, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  5.6min finished
{'lr_clf__C': 10, 'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)} 0.8814858082002551


In [17]:
best_tfidf_lr = grid_pipe.best_estimator_
pred_tfidf_lr = best_tfidf_lr.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_tfidf_lr)

0.9692

In [18]:
joblib.dump(best_tfidf_lr, '../static/model/pipeline_tl.pkl')

['../static/model/pipeline_tl.pkl']

Case 3. CountVectorizer + SupportVectorMachine

In [19]:

pipeline3= Pipeline([
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('sv_clf', SVC())
])
params = ({
    'count_vect__max_df': [100, 300, 500],
    'sv_clf__C': [0.1, 1, 10]
})

In [21]:
%time pipeline3.fit(X_train.review, y_train)

Wall time: 14min 49s


Pipeline(steps=[('count_vect',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('sv_clf', SVC())])

In [23]:
grid_pipe = GridSearchCV(pipeline3, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 177.7min finished
{'count_vect__max_df': 500, 'sv_clf__C': 10} 0.8576573933418166


In [24]:
best_count_sv = pipeline3
pred_count_sv = best_count_sv.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_count_sv)

0.9476

In [25]:
joblib.dump(best_count_sv, '../static/model/imdb_count_sv.pkl')

['../static/model/imdb_count_sv.pkl']

Case 4. TfidfVectorizer + SupportVectorMachine

In [None]:
pipeline4 = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('sv_clf', SVC())
])
params = ({
    'tfidf_vect__max_df': [100, 300, 500],
    'sv_clf__C': [0.1, 1, 10]
})

In [None]:

%time pipeline.fit(X_train.review, y_train)

In [None]:
grid_pipe = GridSearchCV(pipeline4, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1) #, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

In [None]:
best_tfidf_sv = pipeline4
pred_tfidf_sv = best_tfidf_sv.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_tfidf_sv)

In [None]:
#joblib.dump(best_tfidf_sv, '../static/model/imdb_tfidf_sv.pkl')

테스트

In [26]:
index = 4

In [27]:
test_data = []
test_data.append(df_test.iloc[index, 0])

In [35]:
review_string = '오늘도 잠이 오지만 달립니다'

In [36]:
test_data = []
test_data.append(review_string)

In [37]:

test_data  #=>타입 리스트

['오늘도 잠이 오지만 달립니다']

In [38]:
label = df_test.sentiment[index]
label

0

In [39]:
model_cl = joblib.load('../static/model/pipeline_cl.pkl')
model_cs = joblib.load('../static/model/imdb_count_sv.pkl')
model_tl = joblib.load('../static/model/pipeline_tl.pkl')
#model_ts = joblib.load('../static/model/imdb_tfidf_sv.pkl')

In [47]:
index = 2
test_data = []
test_data.append(df_test.iloc[index, 0])

In [48]:
pred_cl = model_cl.predict(test_data)
pred_cl[0]

0

In [49]:
pred_cl = model_cl.predict(test_data)
pred_cs = model_cs.predict(test_data)
pred_tl = model_tl.predict(test_data)
#pred_ts = model_ts.predict(test_data)

In [50]:
pred_cl[0], pred_cs[0], pred_tl[0]  #, pred_ts[0]  # => 모델 테스트 결과가 실제값과 동일하게 나오므로, 모델 성공!

(0, 0, 0)