In [0]:
# -*- coding: utf-8 -*-

In [0]:
import re
import pandas as pd
import numpy as np
import spacy
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

In [4]:
news = fetch_20newsgroups(subset='all')
news_df = pd.DataFrame({'News' : news.data, 'Target' : news.target})

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
def data_cleansing(df):
    delete_email = re.sub(r'\b[\w\+]+@[\w]+.[\w]+.[\w]+.[\w]+\b', ' ', df)
    delete_number = re.sub(r'\b|\d+|\b', ' ',delete_email)
    delete_non_word = re.sub(r'\b[\W]+\b', ' ', delete_number)
    cleaning_result = ' '.join(delete_non_word.split())
    return cleaning_result

news_df.loc[:, 'News'] = news_df['News'].apply(data_cleansing)

In [0]:
from nltk.corpus import stopwords
import nltk

In [7]:
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [0]:
stop_words = stopwords.words('english')
stop_words.extend(['ms','mr','http','www','co','html','goo_gl','blog','rt','https','bit','goo','gl','ly','com','nytimes','ow','new','york','times',
                   'news','also','even','still','much','day','could','nytime','washington','photo','section','\'s','inc','washpost',
                'thing','something','percent','und','literature', 'may', 'paper', 'der','die','eine','von','however','elsevier',
                'author','well','rights','reserve','_reserve','reserved','be','que','fur','das','den','auf','ein','des','would','latime','nyt',
                'say','org','uk','eu','fb','do','govt','pic_twitter','pic','twitter','site','pm','website','twitt','net','ca',
                'web','cc','lnkd','linkedin','away','soon','maybe','bn','pdf','et','al','wsj','report','bloomberg','tinyurl','From',',The']) #불용어 확장 필요시 추가
stop_words=set(stop_words)

In [0]:
news_df['News'] = news_df['News'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [11]:
####TF###
tf_vectorizer = CountVectorizer(analyzer='word',
                             lowercase=False,
                             tokenizer=None,
                             preprocessor=None,
                             min_df=2,
                             ngram_range=(1,1) #한국어 : ngram_range=(1,2)
                             #max_features=1000 #max_feature는?
                             )

tf_vector = tf_vectorizer.fit_transform(news_df['News'].astype(str))

tf_vector


<18846x74568 sparse matrix of type '<class 'numpy.int64'>'
	with 2104915 stored elements in Compressed Sparse Row format>

In [12]:
tf_scores = tf_vector.toarray().sum(axis=0)
tf_idx = np.argsort(-tf_scores)
tf_scores = tf_scores[tf_idx]
tf_vocab = np.array(tf_vectorizer.get_feature_names())[tf_idx]
#plt.bar(range(len(tf_scores)), tf_scores)
#plt.show()
print(list(zip(tf_vocab, tf_scores))[:100])

[('AX', 62484), ('The', 25641), ('Subject', 19497), ('Lines', 18929), ('Organization', 18281), ('In', 16718), ('Re', 13247), ('one', 12901), ('writes', 12485), ('article', 11346), ('It', 9985), ('like', 9312), ('University', 9250), ('people', 8986), ('If', 8940), ('know', 8525), ('Host', 8126), ('Posting', 8118), ('get', 7984), ('think', 7360), ('This', 7302), ('edu', 6539), ('time', 6430), ('use', 6410), ('Apr', 5684), ('You', 5383), ('good', 5259), ('To', 5058), ('way', 4873), ('What', 4794), ('see', 4759), ('And', 4644), ('make', 4574), ('MAX', 4499), ('two', 4492), ('God', 4347), ('But', 4289), ('Distribution', 4228), ('many', 4163), ('right', 4147), ('Nntp', 4130), ('want', 4108), ('first', 4016), ('They', 3983), ('NNTP', 3971), ('said', 3955), ('used', 3945), ('There', 3816), ('system', 3672), ('anyone', 3670), ('work', 3650), ('need', 3641), ('world', 3582), ('He', 3502), ('us', 3477), ('We', 3452), ('problem', 3439), ('really', 3394), ('believe', 3369), ('Reply', 3239), ('back'