## ucl 머신러닝 레파지토리

머신러닝 실습해볼 수 있는 데이터 제공
<a href = https://archive.ics.uci.edu/ml/machine-learning-databases>링크</a>

In [1]:
import requests

## 파일 다운로드 및 오픈

In [2]:
res = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip')

In [3]:
with open('sentiment.zip','wb') as f:
    f.write(res.content)

In [4]:
from zipfile import ZipFile

In [5]:
z = ZipFile('sentiment.zip')

In [6]:
data = z.open('sentiment labelled sentences/imdb_labelled.txt')

## 데이터 불러오기

In [7]:
import pandas as pd

In [8]:
df = pd.read_csv(data, sep='\t', header = None)

In [10]:
df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [11]:
df.shape

(748, 2)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
df.columns = ['review','sentiment']

## TDM 만들기

   - CountVectorizer 는 토큰이 문서별로 몇 번 등장했는지 행렬로 정리해준다.

    [옵션]
    
   - max_features: 단어문서행렬에 포함시킬 최대의 단어수를 말한다. (빈도순으로 짜름)
    
   - stop_words: 분석에서 제외할 불용어를 설정하는 옵션이며, english로 설정하면 관사, 전치사 등을 제외한다. 다른 언어는 리스트등의 형태로 불용어 목록을 넘겨주어 사용한다.

In [15]:
cv = CountVectorizer(max_features=500, stop_words='english')

In [16]:
tdm = cv.fit_transform(df['review'])

In [19]:
# 압축된 상태
tdm

<748x500 sparse matrix of type '<class 'numpy.int64'>'
	with 3433 stored elements in Compressed Sparse Row format>

In [32]:
3433/(748*500) *100

0.9179144385026738

## 단어목록

In [22]:
cv.get_feature_names()[:50]

['10',
 '20',
 '90',
 'ability',
 'absolutely',
 'acting',
 'action',
 'actor',
 'actors',
 'actress',
 'actresses',
 'actually',
 'addition',
 'adorable',
 'aerial',
 'age',
 'amazing',
 'angel',
 'animation',
 'anne',
 'annoying',
 'appearance',
 'appreciate',
 'art',
 'attempt',
 'attention',
 'audience',
 'avoid',
 'away',
 'awesome',
 'awful',
 'bad',
 'barely',
 'beautiful',
 'beginning',
 'believable',
 'believe',
 'best',
 'better',
 'big',
 'billy',
 'bit',
 'black',
 'book',
 'bore',
 'bored',
 'boring',
 'bought',
 'brain',
 'brilliant']

## 단어별 총빈도

In [27]:
word_count=pd.DataFrame({
    '단어': cv.get_feature_names(),
    '빈도': tdm.sum(axis=0).flat
})

In [25]:
tdm.sum(axis=0).flat

<numpy.flatiter at 0x7f8254760e00>

In [28]:
word_count

Unnamed: 0,단어,빈도
0,10,29
1,20,3
2,90,6
3,ability,3
4,absolutely,9
...,...,...
495,wrong,3
496,year,5
497,years,14
498,yes,3


In [30]:
word_count.sort_values('빈도',ascending=False).head(20)

Unnamed: 0,단어,빈도
281,movie,182
155,film,163
31,bad,71
227,just,63
181,good,58
240,like,48
5,acting,43
437,time,43
344,really,41
185,great,41


In [34]:
word_count.sort_values('빈도',ascending=False).to_csv('word_cout.csv',index=False)