In [5]:
import nltk

In [6]:
with open('Obama_2016.txt', 'r') as f:
    doc = f.read()
    
# 읽고자하는 txt파일이 정확히 Jupyter File이 있는 폴더에 "함께" 들어있어야 한다.

In [7]:
doc[0:500]

"anuary 12, 2016\nThank you. Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans: Tonight marks the eighth year that I've come here to report on the State of the Union. And for this final one, I'm going to try to make it a little shorter. I know some of you are antsy to get back to Iowa. [Laughter] I've been there. I'll be shaking hands afterwards if you want some tips. [Laughter]\n\nNow, I understand that because it's an election season, expectations for what we will achieve t"

In [19]:
# Tokenizing

doc_tokens = nltk.word_tokenize(doc)
doc_tokens[0:50]

['anuary',
 '12',
 ',',
 '2016',
 'Thank',
 'you',
 '.',
 'Mr.',
 'Speaker',
 ',',
 'Mr.',
 'Vice',
 'President',
 ',',
 'Members',
 'of',
 'Congress',
 ',',
 'my',
 'fellow',
 'Americans',
 ':',
 'Tonight',
 'marks',
 'the',
 'eighth',
 'year',
 'that',
 'I',
 "'ve",
 'come',
 'here',
 'to',
 'report',
 'on',
 'the',
 'State',
 'of',
 'the',
 'Union',
 '.',
 'And',
 'for',
 'this',
 'final',
 'one',
 ',',
 'I',
 "'m",
 'going']

In [20]:
# Lemmatizing

doc_lemma = []
lemma = nltk.wordnet.WordNetLemmatizer()
for token in doc_tokens:
    doc_lemma.append(lemma.lemmatize(token))
doc_lemma[0:50]

['anuary',
 '12',
 ',',
 '2016',
 'Thank',
 'you',
 '.',
 'Mr.',
 'Speaker',
 ',',
 'Mr.',
 'Vice',
 'President',
 ',',
 'Members',
 'of',
 'Congress',
 ',',
 'my',
 'fellow',
 'Americans',
 ':',
 'Tonight',
 'mark',
 'the',
 'eighth',
 'year',
 'that',
 'I',
 "'ve",
 'come',
 'here',
 'to',
 'report',
 'on',
 'the',
 'State',
 'of',
 'the',
 'Union',
 '.',
 'And',
 'for',
 'this',
 'final',
 'one',
 ',',
 'I',
 "'m",
 'going']

In [22]:
doc_lemma_tagged = nltk.pos_tag(doc_lemma)
doc_lemma_tagged[0:20]

[('anuary', 'JJ'),
 ('12', 'CD'),
 (',', ','),
 ('2016', 'CD'),
 ('Thank', 'NNP'),
 ('you', 'PRP'),
 ('.', '.'),
 ('Mr.', 'NNP'),
 ('Speaker', 'NNP'),
 (',', ','),
 ('Mr.', 'NNP'),
 ('Vice', 'NNP'),
 ('President', 'NNP'),
 (',', ','),
 ('Members', 'NNP'),
 ('of', 'IN'),
 ('Congress', 'NNP'),
 (',', ','),
 ('my', 'PRP$'),
 ('fellow', 'JJ')]

In [27]:
# Removing Stop Words

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words.update([',', '.', '?', '!', ':', ';', '[', ']'])
doc_filtered = [word for word in doc_lemma if word not in stop_words and len(word) > 3]
doc_filtered[0:20]

['anuary',
 '2016',
 'Thank',
 'Speaker',
 'Vice',
 'President',
 'Members',
 'Congress',
 'fellow',
 'Americans',
 'Tonight',
 'mark',
 'eighth',
 'year',
 'come',
 'report',
 'State',
 'Union',
 'final',
 'going']

In [44]:
## Restart

# Building a custom corpus

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
import nltk

import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
corpusdir = 'APPLE INC_2011-2016/'
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)
    
corpus = ['APPLE INC_2011.txt', 'APPLE INC_2012.txt', 'APPLE INC_2013.txt', 'APPLE INC_2014.txt', 'APPLE INC_2015.txt', 'APPLE INC_2016.txt' ]

In [37]:
for doc in corpus:
    doc_tokens = nltk.word_tokenize(doc)
    
doc_lemma = []
lemma = nltk.wordnet.WordNetLemmatizer()
for token in doc_tokens:
    doc_lemma.append(lemma.lemmatize(token))

In [38]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words.update([',', '.', '?', '!', ':', ';', '[', ']'])
filtered_doc = [word for word in doc_lemma if word not in stop_words]

In [45]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim

from gensim.utils import simple_preprocess, tokenize
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

corpus = [tokenize(doc) for doc in corpus]
lexicon = gensim.corpora.Dictionary(corpus)
tfidf = gensim.models.TfidfModel(dictionary = lexicon, normalize = True)
vectors = [tfidf[lexicon.doc2bow(doc)] for doc in corpus]

print(corpus)
print(lexicon)
print(tfidf)
print(vectors)

[<generator object simple_tokenize at 0x000002AB46892678>, <generator object simple_tokenize at 0x000002AB468C6780>, <generator object simple_tokenize at 0x000002AB468C67D8>, <generator object simple_tokenize at 0x000002AB468C6888>, <generator object simple_tokenize at 0x000002AB468C6308>, <generator object simple_tokenize at 0x000002AB468C6678>]
Dictionary(3 unique tokens: ['APPLE', 'INC_', 'txt'])
TfidfModel(num_docs=6, num_nnz=18)
[[], [], [], [], [], []]


In [87]:
## Restart 2

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim

from gensim.utils import simple_preprocess, tokenize
from gensim.parsing.preprocessing import STOPWORDS
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

import os

import numpy as np

In [90]:
import sys
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def make_corpus():
    corpus = [x for x in os.listdir('C:/Users/안신혜/★ Text Analysis Study/Jupyter Notebooks for Exercise/APPLE INC_2011-2016') if x.endswith("*.txt")]
    for doc in corpus:
        filePath= 'C:/Users/안신혜/★ Text Analysis Study/Jupyter Notebooks for Exercise/APPLE INC_2011-2016' + os.path.splitext(file)[0] + ".txt"
        with open(filePath, 'r') as infile:
            content = infile.read()
            yield content 


In [83]:

# vectorizer = TfidfVectorizer(stop_words='english',use_idf=True, max_df=0.7, smooth_idf=True)
# vectorizer.fit_transform(corpus)

In [91]:
corpus = make_corpus()

corpus = [tokenize(doc) for doc in corpus]
lexicon = gensim.corpora.Dictionary(corpus)
tfidf = gensim.models.TfidfModel(dictionary = lexicon, normalize = True)
vectors = [tfidf[lexicon.doc2bow(doc)] for doc in corpus]

print(corpus)
print(lexicon)
print(tfidf)
print(vectors)


[]
Dictionary(0 unique tokens: [])
TfidfModel(num_docs=0, num_nnz=0)
[]


In [95]:
## Restart 3

from nltk.corpus.reader.plaintext import PlaintextCorpusReader

DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)\.*'

corpus = PlaintextCorpusReader('C:/Users/안신혜/★ Text Analysis Study\Jupyter Notebooks for Exercise/APPLE INC_2011-2016', DOC_PATTERN, CAT_PATTERN)

In [96]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim

In [97]:
from gensim.utils import simple_preprocess, tokenize
from gensim.parsing.preprocessing import STOPWORDS
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

corpus = [tokenize(doc) for doc in corpus]
lexicon = gensim.corpora.Dictionary(corpus)
tfidf = gensim.models.TfidfModel(dictionary = lexicon, normalize = True)
vectors = [tfidf[lexicon.doc2bow(doc)] for doc in corpus]

print(corpus)
print(lexicon)
print(tfidf)
print(vectors)

TypeError: 'PlaintextCorpusReader' object is not iterable

In [127]:
## Restart 4

from nltk.corpus import PlaintextCorpusReader

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim import corpora
import gensim
from gensim.utils import simple_preprocess, tokenize
from gensim.parsing.preprocessing import STOPWORDS
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary


filepath = 'C:/Users/안신혜/★ Text Analysis Study\Jupyter Notebooks for Exercise/APPLE INC_2011-2016'
corpus = PlaintextCorpusReader(filepath,'.*', encoding='utf-8')
fids = corpus.fileids()
docs = [corpus.words(f) for f in fids]


In [128]:
corpus = [tokenize(doc) for doc in corpus.words(fids)]
lexicon = gensim.corpora.Dictionary(corpus)
tfidf = gensim.models.TfidfModel(dictionary = lexicon, normalize = True)
vectors = [tfidf[lexicon.doc2bow(doc)] for doc in corpus]

print(corpus)
print(lexicon)
print(tfidf)
print(vectors)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 18: invalid start byte