In [1]:
#2021.06.24. THU
#Hankyeong

#00. 패키지 호출
import warnings
import numpy as np 
import pandas as pd 
from sklearn.datasets import fetch_20newsgroups 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

#00-1. warning message ignore
warnings.filterwarnings(action='ignore')

#07. Classify 20 Newsgroups posts by topic (multi-class, see target_names).
#(1) Load the dataset: subset='all' requests every post, random_state is
#    fixed so repeated runs receive the same ordering.
news_raw = fetch_20newsgroups(
    subset='all',
    random_state=156,
)

#PLUS. Show the description text bundled with the dataset.
print(news_raw.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [2]:
#PLUS. List the keys available on the news dataset object.
news_raw.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
#PLUS. Print the first raw post to inspect how the text is laid out
#      (headers, quoted reply and body are all present at this stage).
print(news_raw.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

In [4]:
#(2) Pair every raw post with its integer label in a DataFrame.
df_news_raw = pd.DataFrame(
    {
        'text': news_raw.data,
        'target': news_raw.target,
    }
)
df_news_raw

Unnamed: 0,text,target
0,From: egreen@east.sun.com (Ed Green - Pixel Cr...,8
1,From: jlevine@rd.hydro.on.ca (Jody Levine)\nSu...,8
2,From: u95_dgold@vaxc.stevens-tech.edu\nSubject...,12
3,From: jca2@cec1.wustl.edu (Joseph Charles Achk...,10
4,From: jonathan@comp.lancs.ac.uk (Mr J J Trevor...,6
...,...,...
18841,From: brian@lpl.arizona.edu (Brian Ceccarelli ...,19
18842,From: d12751@tanus.oz.au (Jason Bordujenko)\nS...,3
18843,From: rwf2@ns1.cc.lehigh.edu (ROBERT WILLIAM F...,7
18844,From: bc@idx.com\nSubject: Request info on a m...,3


In [5]:
#MEMO. target is a categorical class id, not a continuous value;
#      target_names lists the newsgroup name for each id.
news_raw.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
#(3) Check the dataset for missing values via the non-null counts per column.
df_news_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18846 entries, 0 to 18845
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    18846 non-null  object
 1   target  18846 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 221.0+ KB


In [7]:
#(4) Count the posts in each target class and list them in class-id order.
target_counts = pd.Series(news_raw.target).value_counts()
target_counts.sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [8]:
#(5) Reload the dataset with headers, footers and quoted replies removed.
news = fetch_20newsgroups(
    subset='all',
    random_state=156,
    remove=('headers', 'footers', 'quotes'),
)

#(6) Put the cleaned posts and their labels into a DataFrame.
df_news = pd.DataFrame(
    {
        'text': news.data,
        'target': news.target,
    }
)
df_news

Unnamed: 0,text,target
0,\nIf your primary concern is protecting the pa...,8
1,I feel childish.\n\n\nWho mentioned dirtbikes?...,8
2,,12
3,\nGretzky averaged 2.69 pts/game\n\n\nCheck yo...,10
4,If anyone would like to get rid of their SegaC...,6
...,...,...
18841,\nThat's right. Everyone. Even infants who c...,19
18842,"G'day All,\n\nI was looking to build a Paralle...",3
18843,"ites:\n Yeah, and the cop couldn't catch me.....",7
18844,While rummaging through a box of old PC (5150)...,3


In [9]:
#PLUS. Print the first cleaned post to inspect the text after the removal.
print(news.data[0])


If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, but not in a
small one).



In [10]:
#(7) Hold out 20% of the posts as a test set, stratified on the target label.
X_train, X_test, y_train, y_test = train_test_split(
    df_news['text'],
    df_news['target'],
    test_size=0.2,
    stratify=df_news['target'],
    random_state=2021,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15076,), (3770,), (15076,), (3770,))

In [11]:
#08. Vectorize the text, then train, predict and evaluate a model.
#    Case I. Count Vectorizer + Logistic Regression
#(1) Create the CountVectorizer.
count_vect = CountVectorizer()

#(2) Learn the vocabulary from the training text only and vectorize it in the
#    same pass. fit_transform yields the same matrix as fit() followed by
#    transform(), without tokenizing the training set twice.
X_train_cv = count_vect.fit_transform(X_train)

#(3) Vectorize the test text with the vocabulary learned from the training set.
X_test_cv  = count_vect.transform(X_test)

#(4) Check the shapes of the resulting matrices.
X_train_cv.shape, X_test_cv.shape

((15076, 122346), (3770, 122346))

In [12]:
#(5) Create a Logistic Regression model with default hyperparameters.
# NOTE(review): the default max_iter is 100, which may be too low for the
# ~122k count features used here -- confirm that the solver converges.
lr = LogisticRegression()

#(6) Fit the model on the count-vectorized training data.
lr.fit(X_train_cv, y_train)

LogisticRegression()

In [13]:
#(7) Predict the test labels and report the accuracy.
lr_pred = lr.predict(X_test_cv)
accuracy_score(y_true=y_test, y_pred=lr_pred)

0.676657824933687

In [14]:
#PLUS. Show the hyperparameters the fitted model is using.
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [15]:
#09. Vectorize the text, then train, predict and evaluate a model.
#    Case II. TF-IDF Vectorizer + Logistic Regression
#(1) Create the TfidfVectorizer.
tfid_vect = TfidfVectorizer()

#(2) Learn the vocabulary and IDF weights from the training text only and
#    vectorize it in the same pass. fit_transform yields the same matrix as
#    fit() followed by transform(), without tokenizing the training set twice.
X_train_tf = tfid_vect.fit_transform(X_train)

#(3) Vectorize the test text with the statistics learned from the training set.
X_test_tf  = tfid_vect.transform(X_test)

#(4) Check the shapes of the resulting matrices.
X_train_tf.shape, X_test_tf.shape

((15076, 122346), (3770, 122346))

In [16]:
#(5) Create a Logistic Regression model with default hyperparameters
#    (this rebinds the name `lr` used by the previous case).
lr = LogisticRegression()

#(6) Fit the model on the TF-IDF training matrix.
lr.fit(X_train_tf, y_train)

LogisticRegression()

In [17]:
#(7) Predict the test labels and report the accuracy.
lr_pred = lr.predict(X_test_tf)
accuracy_score(y_true=y_test, y_pred=lr_pred)

0.7209549071618037

In [18]:
#09. Vectorize the text, then train, predict and evaluate a model.
#    Case III. TF-IDF Vectorizer (with parameters) + Logistic Regression
#(1) Create the TfidfVectorizer: drop English stop words and use both
#    unigrams and bigrams as features.
tfid_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))

#(2) Learn the vocabulary and IDF weights from the training text only and
#    vectorize it in the same pass. fit_transform yields the same matrix as
#    fit() followed by transform(), without tokenizing the training set twice.
X_train_tf = tfid_vect.fit_transform(X_train)

#(3) Vectorize the test text with the statistics learned from the training set.
X_test_tf  = tfid_vect.transform(X_test)

#(4) Check the shapes of the resulting matrices.
X_train_tf.shape, X_test_tf.shape

((15076, 1181620), (3770, 1181620))

In [19]:
#(5) Create a Logistic Regression model with default hyperparameters
#    (this rebinds the name `lr` used by the previous case).
# NOTE(review): the default max_iter is 100 -- confirm that the solver
# converges on the ~1.18M unigram+bigram features.
lr = LogisticRegression()

#(6) Fit the model on the TF-IDF training matrix.
lr.fit(X_train_tf, y_train)

LogisticRegression()

In [20]:
#(7) Predict the test labels and report the accuracy.
lr_pred = lr.predict(X_test_tf)
accuracy_score(y_true=y_test, y_pred=lr_pred)

0.7371352785145888

In [21]:
#PLUS. Same TF-IDF features, but with a Logistic Regression hyperparameter
#      set explicitly (C=10) instead of left at its default.
lr = LogisticRegression(C=10).fit(X_train_tf, y_train)  # fit() returns the estimator
lr_pred = lr.predict(X_test_tf)
accuracy_score(y_true=y_test, y_pred=lr_pred)

0.7615384615384615

In [22]:
#10. Vectorize the text, then train, predict and evaluate a model.
#    Case IV. TF-IDF Vectorizer + Logistic Regression + GridSearchCV + Pipeline
#(1) Define the Pipeline. The step names ('tfid_vect', 'lr') are the prefixes
#    used by the '<step>__<param>' keys of the hyperparameter grid.
pipeline = Pipeline(
    steps=[
        ('tfid_vect', TfidfVectorizer(stop_words='english')),
        ('lr', LogisticRegression()),
    ]
)

In [23]:
#(2) Define the hyperparameter grid, keyed as '<pipeline step>__<parameter>'.
#    2 ngram ranges x 2 max_df values x 4 C values = 16 candidates.
params = dict(
    tfid_vect__ngram_range=[(1, 1), (1, 2)],
    tfid_vect__max_df=[300, 700],
    lr__C=[1, 10, 50, 100],
)

In [24]:
#(3) Wrap the pipeline in a GridSearchCV: 3-fold cross-validation scored on
#    accuracy, with n_jobs=-1 and progress messages enabled.
gscv_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [25]:
#(4) Fit the grid search on the raw training text (the pipeline vectorizes
#    it itself); %time reports how long the whole search takes.
%time gscv_pipe.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Wall time: 17min 11s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tfid_vect',
                                        TfidfVectorizer(stop_words='english')),
                                       ('lr', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'lr__C': [1, 10, 50, 100],
                         'tfid_vect__max_df': [300, 700],
                         'tfid_vect__ngram_range': [(1, 1), (1, 2)]},
             scoring='accuracy', verbose=1)

In [33]:
#(5) Take the best pipeline found by the search, predict the raw test text
#    with it and report the accuracy.
gscv_pipe_fit = gscv_pipe.best_estimator_
gscv_pipe_pred = gscv_pipe_fit.predict(X_test)
accuracy_score(y_true=y_test, y_pred=gscv_pipe_pred)

0.7474801061007957