In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
df.head(3)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...


In [3]:
# 결측치 체크
df.isna().sum().sum()

0

In [4]:
# 중복여부 체크
df.review.nunique()

24904

In [5]:
# 중복 데이터 제거
df.drop_duplicates(subset=['review'], inplace=True)
df.shape

(24904, 3)

In [6]:
# <br /> 태그는 공백으로 변환
df.review = df.review.str.replace('<br />', ' ')

In [7]:
# 구둣점, 숫자 제거 - 영문자 이외의 문자는 공백으로 변환
df.review = df.review.str.replace('[^A-Za-z]', ' ', regex=True)

In [8]:
# 데이터셋 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.review.values, df.sentiment.values, stratify=df.sentiment.values, test_size=0.2, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((19923,), (4981,), (19923,), (4981,))

In [24]:
# 텍스트 인코딩
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer(stop_words='english')
train_data_features = cvect.fit_transform(df['review'])

In [25]:
train_data_features

<24904x72935 sparse matrix of type '<class 'numpy.int64'>'
	with 2199396 stored elements in Compressed Sparse Row format>

In [10]:
# train 과 test dataset의 변환후 사이즈가 동일해야 함
cvect.fit(X_train)
X_train_cv = cvect.transform(X_train)
X_test_cv = cvect.transform(X_test)
X_train_cv.shape, X_test_cv.shape

((19923, 66641), (4981, 66641))

In [11]:
# 학습 및 평가
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=2023, max_iter=500)

In [13]:
lrc.fit(X_train_cv, y_train)

In [14]:
lrc.score(X_test_cv, y_test)

0.8813491266813893

In [15]:
# TfidVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tvect = TfidfVectorizer(stop_words='english')
tvect.fit(X_train)
X_train_tv = tvect.transform(X_train)
X_test_tv = tvect.transform(X_test)
X_train_tv.shape, X_test_tv.shape

((19923, 66641), (4981, 66641))

In [16]:
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train_tv, y_train)
lrc.score(X_test_tv, y_test)

0.8964063441076089

In [29]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_cv, y_train)

In [30]:
rfc.score(X_test_cv, y_test)

0.8630797028709094

In [32]:
# TfidVectorizer + Random Forest
tvect2 = TfidfVectorizer(stop_words='english')
tvect2.fit(X_train)
X_train_tv2 = tvect2.transform(X_train)
X_test_tv2 = tvect2.transform(X_test)
X_train_tv2.shape, X_test_tv2.shape         # TfidVectorizer로 먼저 사용하고 
rfc = RandomForestClassifier()
rfc.fit(X_train_tv2, y_train)               # Random Forest를 사용한다

In [33]:
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train_tv2, y_train)
lrc.score(X_test_tv2, y_test)

0.8964063441076089