# 20 Newsgroups 분류

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.datasets import fetch_20newsgroups
# Fetch the complete 20 Newsgroups corpus (subset='all'); later cells read
# news_data.target_names to translate class ids into newsgroup names.
news_data = fetch_20newsgroups(subset='all', random_state=156)

In [2]:
# Load the training split with headers, footers and quoted replies removed,
# leaving only the message text for the classifiers.
train_news = fetch_20newsgroups(subset='train', random_state=156,
                                remove=('headers', 'footers', 'quotes'))

In [4]:
# Put the training texts and their class ids side by side in one frame,
# then report the number of missing values per column.
df_train = pd.DataFrame({
    'data': train_news.data,
    'target': train_news.target,
})
df_train.isna().sum()

data      0
target    0
dtype: int64

In [5]:
# Count training rows whose text is exactly the empty string.
# NOTE(review): `== ''` does not match whitespace-only documents — confirm
# whether those should be treated as empty too.
df_train[df_train.data == ''].count()

data      218
target    218
dtype: int64

In [6]:
# Keep only the rows whose text is not the empty string (original row labels
# are preserved), then re-count the empties to confirm none remain.
df_train = df_train[df_train['data'] != '']
df_train.loc[df_train['data'] == ''].count()

data      0
target    0
dtype: int64

In [7]:
# Load the test split with the same header/footer/quote removal as the
# training split so both are preprocessed identically.
test_news = fetch_20newsgroups(subset='test', random_state=156,
                                remove=('headers', 'footers', 'quotes'))

In [8]:
# Put the test texts and their class ids side by side in one frame,
# then report the number of missing values per column.
df_test = pd.DataFrame({
    'data': test_news.data,
    'target': test_news.target,
})
df_test.isna().sum()

data      0
target    0
dtype: int64

In [9]:
# Count test rows whose text is exactly the empty string.
# NOTE(review): `== ''` does not match whitespace-only documents — confirm
# whether those should be treated as empty too.
df_test[df_test.data == ''].count()

data      162
target    162
dtype: int64

In [10]:
# Keep only the rows whose text is not the empty string (original row labels
# are preserved), then re-count the empties to confirm none remain.
df_test = df_test[df_test['data'] != '']
df_test.loc[df_test['data'] == ''].count()

data      0
target    0
dtype: int64

In [11]:
# Extract the text and label columns as NumPy arrays for scikit-learn.
X_train, y_train = df_train['data'].values, df_train['target'].values
X_test, y_test = df_test['data'].values, df_test['target'].values

In [13]:
X_train.shape, X_test.shape     # arrays of strings

((11096,), (7370,))

In [14]:
import os

# Persist the cleaned test set (text + label); a later cell reads this file
# back to pick single examples for prediction.
# The parent directory is created first because to_csv does not create missing
# directories and would raise instead of writing the file.
test_csv_path = '../static/data/news/test.csv'
os.makedirs(os.path.dirname(test_csv_path), exist_ok=True)
df_test.to_csv(test_csv_path, index=False)  # index=False: row labels are not written

### 피처 벡터화 변환

- Case 1. CountVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features using the vectorizer's default settings.
count_vect = CountVectorizer()
# fit_transform learns the vocabulary and builds the training matrix in a
# single pass; a separate fit() + transform() tokenizes the corpus twice.
X_train_count = count_vect.fit_transform(X_train)
# The test texts are encoded with the vocabulary learned from the training
# texts, so both matrices share the same columns.
X_test_count = count_vect.transform(X_test)

In [16]:
# Shapes of the count matrices for the training and test texts.
X_train_count.shape, X_test_count.shape

((11096, 101631), (7370, 101631))

In [19]:
# Show the settings the CountVectorizer was created with.
count_vect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

- Case 2. TfidfVectorizer

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted features using the vectorizer's default settings.
tfidf_vect = TfidfVectorizer()
# fit_transform learns the vocabulary/IDF weights and builds the training
# matrix in a single pass; a separate fit() + transform() tokenizes the
# corpus twice.
X_train_tfidf = tfidf_vect.fit_transform(X_train)
# The test texts are encoded with the vocabulary and IDF weights learned from
# the training texts.
X_test_tfidf = tfidf_vect.transform(X_test)

In [18]:
# Shapes of the TF-IDF matrices for the training and test texts.
X_train_tfidf.shape, X_test_tfidf.shape

((11096, 101631), (7370, 101631))

In [20]:
# Show the settings the TfidfVectorizer was created with.
tfidf_vect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

### Pipeline과 GridSearchCV를 통한 하이퍼파라미터 튜닝

- Case A. CountVectorizer + Logistic Regression Classifier

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Case A: raw texts -> word counts (English stop words removed) -> logistic
# regression. The step names are the prefixes used in the search grid.
# NOTE(review): LogisticRegression runs with its default iteration limit —
# confirm it converges on these features.
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression()),
])

In [22]:
# Search grid for Case A; each key is "<pipeline step>__<parameter>".
# Integer max_df values are absolute document counts.
params = dict(
    count_vect__ngram_range=[(1, 1), (1, 2)],
    count_vect__max_df=[300, 700],
    lr_clf__C=[1, 10],
)

In [23]:
from sklearn.model_selection import GridSearchCV

# 3-fold cross-validated search over the Case A grid, scored by accuracy and
# run on all available cores.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 43.0min finished
{'count_vect__max_df': 300, 'count_vect__ngram_range': (1, 2), 'lr_clf__C': 1} 0.7059305880934986


In [24]:
# Keep the best pipeline found by the Case A search (grid_pipe is reused by
# later searches, so capture it now) and predict on the raw test texts.
best_count_lr = grid_pipe.best_estimator_
pred_count_lr = best_count_lr.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of the tuned CountVectorizer + LogisticRegression pipeline.
accuracy_score(y_test, pred_count_lr)

0.662415196743555

- Case B. TfidfVectorizer + Logistic Regression Classifier

In [26]:
# Case B: raw texts -> TF-IDF features (English stop words removed) ->
# logistic regression. This rebinds `pipeline` and `params` from Case A.
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression()),
])
# Search grid; each key is "<pipeline step>__<parameter>".
params = dict(
    tfidf_vect__ngram_range=[(1, 1), (1, 2)],
    tfidf_vect__max_df=[300, 700],
    lr_clf__C=[1, 10],
)

In [27]:
# 3-fold cross-validated search over the current `pipeline`/`params`, scored
# by accuracy and run on all available cores.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 32.4min finished
{'lr_clf__C': 10, 'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)} 0.766132057480442


In [28]:
# grid_pipe holds whichever search was fitted last; capture its best pipeline
# as the TF-IDF + LogisticRegression model and score it on the test texts.
best_tfidf_lr = grid_pipe.best_estimator_
pred_tfidf_lr = best_tfidf_lr.predict(X_test)
accuracy_score(y_test, pred_tfidf_lr)

0.7150610583446404

- Case C. TfidfVectorizer + Support Vector Classifier

In [29]:
from sklearn.svm import SVC

# Case C: raw texts -> TF-IDF features (English stop words removed) ->
# support vector classifier. This rebinds `pipeline` and `params` again.
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('sv_clf', SVC()),
])
# Search grid; each key is "<pipeline step>__<parameter>".
params = dict(
    tfidf_vect__ngram_range=[(1, 1), (1, 2)],
    tfidf_vect__max_df=[300, 700],
    sv_clf__C=[1, 10],
)

In [30]:
# 3-fold cross-validated search over the current `pipeline`/`params`, scored
# by accuracy and run on all available cores.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 12.6min finished
{'sv_clf__C': 10, 'tfidf_vect__max_df': 300, 'tfidf_vect__ngram_range': (1, 1)} 0.7586518518323572


In [31]:
# grid_pipe holds whichever search was fitted last; capture its best pipeline
# as the TF-IDF + SVC model and score it on the test texts.
best_tfidf_sv = grid_pipe.best_estimator_
pred_tfidf_sv = best_tfidf_sv.predict(X_test)
accuracy_score(y_test, pred_tfidf_sv)

0.6957937584803257

### 모델 저장

In [32]:
import os
import joblib

# Persist the three tuned pipelines (vectorizer + classifier) to disk.
# The output directory is created first: joblib.dump cannot write into a
# directory that does not exist, and the models are expensive to refit.
os.makedirs('../static/model', exist_ok=True)
joblib.dump(best_count_lr, '../static/model/news_count_lr.pkl')
joblib.dump(best_tfidf_lr, '../static/model/news_tfidf_lr.pkl')
# Left as the last expression so the cell still displays the written file name.
joblib.dump(best_tfidf_sv, '../static/model/news_tfidf_sv.pkl')

['../static/model/news_tfidf_sv.pkl']

### Test

In [31]:
import joblib
# Reload the three pipelines from the files written by the save cell.
# NOTE(review): joblib.load unpickles the file contents — only load model
# files that come from a trusted source.
best_count_lr = joblib.load('../static/model/news_count_lr.pkl')
best_tfidf_lr = joblib.load('../static/model/news_tfidf_lr.pkl')
best_tfidf_sv = joblib.load('../static/model/news_tfidf_sv.pkl')

In [23]:
# Row of the saved test set used as the single example in the cells below.
index = 7300
# Read back the test set written earlier with to_csv (columns: data, target).
df = pd.read_csv('../static/data/news/test.csv')
df.tail()

Unnamed: 0,data,target
7365,"\nOil Pressure, Oil Temperature\nCoolant Tempe...",7
7366,"\nOh dear, time for me to try to remember my c...",12
7367,\n\n\n-- That means that there cannot be any a...,13
7368,s:\n I have a 1991 Toyota Camry Deluxe for sa...,6
7369,"May 13, 1993 _Five Russian soldiers sentenced...",17


In [24]:
# True class id of the chosen example row.
label = df.target[index]
label

17

In [26]:
# Check that a text value read back from the CSV is a Python str.
type(df.data[7000])

str

In [27]:
# list() on a string splits it into single characters; it does not wrap the
# string as a one-element list.
list('i am a boy')

['i', ' ', 'a', 'm', ' ', 'a', ' ', 'b', 'o', 'y']

In [37]:
# test data 만드는 방법 1
test_data = []
test_data.append(df.data[index])
test_data

['\n\nThe borders of the Jewish state as drawn by the U.N. included the areas which\ncontained mostly Jews,  that\'s what the surveys and the numerous commitees\nwhere after when they visited here.\n\n\nI never touched an Arab during my army service and never voted for anyone more\nright than the Green party.  Will I be spared by these "humanist standards"?\n(or will anyone stop to consider this before sloughtering me?)\n\nI doubt it.  And not only because of the past record of murdering helpless\nwomen and children since the turn of the century up to these days.\n']

In [30]:
# Way 2 to build test_data: take a one-row slice of column 0 (the text
# column) and convert it to a NumPy array.
test_data = df.iloc[index:index+1, 0].values
test_data

array(['\n\nThe borders of the Jewish state as drawn by the U.N. included the areas which\ncontained mostly Jews,  that\'s what the surveys and the numerous commitees\nwhere after when they visited here.\n\n\nI never touched an Arab during my army service and never voted for anyone more\nright than the Green party.  Will I be spared by these "humanist standards"?\n(or will anyone stop to consider this before sloughtering me?)\n\nI doubt it.  And not only because of the past record of murdering helpless\nwomen and children since the turn of the century up to these days.\n'],
      dtype=object)

In [32]:
# Predict the class id of the example with the TF-IDF + SVC pipeline.
pred = best_tfidf_sv.predict(test_data)

In [33]:
# Predicted class id next to the true class id.
pred[0], label

(17, 17)

In [41]:
# Newsgroup names; the next cell pairs each list position with a class id.
news_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [42]:
# Map class id -> newsgroup name. enumerate() takes the ids from the list
# itself instead of a hard-coded range(20), so the mapping always covers every
# name even if the number of classes differs.
target_names = dict(enumerate(news_data.target_names))
target_names

{0: 'alt.atheism',
 1: 'comp.graphics',
 2: 'comp.os.ms-windows.misc',
 3: 'comp.sys.ibm.pc.hardware',
 4: 'comp.sys.mac.hardware',
 5: 'comp.windows.x',
 6: 'misc.forsale',
 7: 'rec.autos',
 8: 'rec.motorcycles',
 9: 'rec.sport.baseball',
 10: 'rec.sport.hockey',
 11: 'sci.crypt',
 12: 'sci.electronics',
 13: 'sci.med',
 14: 'sci.space',
 15: 'soc.religion.christian',
 16: 'talk.politics.guns',
 17: 'talk.politics.mideast',
 18: 'talk.politics.misc',
 19: 'talk.religion.misc'}