In [64]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import re 
from bs4 import BeautifulSoup 
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Labeled training reviews; the `review` and `sentiment` columns are used below.
train_url = '/Users/frhyme/Downloads/labeledTrainData.tsv'
train_df = pd.read_csv(
    train_url,
    header=0,
    delimiter='\t',
    quoting=3,  # 3 == csv.QUOTE_NONE: double quotes are ordinary characters
)
# The full run is slower than expected; uncomment to iterate on a 1000-row subset.
#train_df = train_df[:1000]

_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def cleaning_each_review(raw_review):
    """Turn one raw review into a space-joined string of lowercase content words.

    Steps: strip markup with BeautifulSoup, replace every non-letter with a
    space, lowercase, split on whitespace and drop NLTK English stop words.

    Args:
        raw_review: review text, possibly containing HTML tags.

    Returns:
        The remaining words joined by single spaces ("" when nothing is left).
    """
    text = BeautifulSoup(raw_review, 'lxml').get_text()  # remove tags
    text = re.sub("[^a-zA-Z]", " ", text).lower()
    # After the substitution only letters and spaces remain, so str.split()
    # both collapses repeated spaces and trims the ends in one step.
    stop_words = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stop_words)
# Clean every training review once and keep the result next to the raw text.
train_df['cleaned_movie_review'] = train_df['review'].apply(cleaning_each_review)
print("----data cleaning complete----")

# Plain word counts; cleaning already happened above, so the vectorizer's own
# tokenizer / preprocessor / stop-word hooks stay disabled.
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None,
                             preprocessor = None, stop_words = None,
                             max_features = 5000) # tune max_features as needed

# CountVectorizer returns a sparse matrix, so convert it to a dense array
# before wrapping it in a DataFrame.
train_word_counts = vectorizer.fit_transform(train_df['cleaned_movie_review']).toarray()
# The vocabulary is fixed after fitting: compute the column names once and reuse them.
feature_names = vectorizer.get_feature_names()
train_word_count_df = pd.DataFrame(train_word_counts, columns=feature_names)
print("----word count vectorization complete----")

# The counts are not normalised and no n-grams are used, but every review now
# has a feature vector, which is enough to train a first random forest.
from sklearn.ensemble import RandomForestClassifier
# random_state makes the submission reproducible across runs; n_jobs=-1 only
# parallelises the fit. n_estimators is worth increasing when experimenting.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(train_word_count_df.values, train_df['sentiment'])
print("----fitting complete----")
########

# Apply the same cleaning and the already-fitted vectorizer to the test data.
test_url = '/Users/frhyme/Downloads/testData.tsv'
test_df = pd.read_csv(test_url, delimiter='\t', header=0, quoting=3)
test_df['cleaned_movie_review'] = test_df['review'].apply(cleaning_each_review)

print("----test data count vectorization----")
# transform (not fit_transform): reuse the vocabulary learned from the training set.
test_word_counts = vectorizer.transform(test_df['cleaned_movie_review']).toarray()
test_word_count_df = pd.DataFrame(test_word_counts, columns=feature_names)
output = pd.DataFrame( data={"id":test_df["id"], "sentiment":rf.predict(test_word_count_df.values)} )
# Use pandas to write the comma-separated output file
output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )
print("----all complete upload it on kaggle----")

----data cleaning complete----
----word count vectorization complete----
----fitting complete----
----test data count vectorization----
----all complete upload it on kaggle----


In [101]:
## part 2 
import warnings
warnings.filterwarnings('ignore')
"""
- 여기서는 word-embedding을 이용합니다. 
"""
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import re 
from bs4 import BeautifulSoup 
import nltk
import nltk.data
from sklearn.feature_extraction.text import CountVectorizer

import itertools

# Labeled reviews.
train_url = '/Users/frhyme/Downloads/labeledTrainData.tsv'
train_df = pd.read_csv(
    train_url,
    header=0,
    delimiter='\t',
    quoting=3,  # 3 == csv.QUOTE_NONE
)

# Unlabeled reviews; they are added to the labeled ones when building the
# sentence corpus below.
unlabeled_train_url = '/Users/frhyme/Downloads/unlabeledTrainData.tsv'
unlabeled_train_df = pd.read_csv(
    unlabeled_train_url,
    header=0,
    delimiter='\t',
    quoting=3,
)

def review_to_wordlist( review, remove_stopwords=False):
    """Convert one review (or one sentence of it) into a list of lowercase words.

    Markup is stripped with BeautifulSoup, every non-letter character becomes
    a space, and the text is lowercased and split on whitespace.

    Args:
        review: raw text, possibly containing HTML tags.
        remove_stopwords: when True, drop NLTK's English stop words.

    Returns:
        list of str: the remaining words, in their original order.
    """
    # Name the parser explicitly, matching cleaning_each_review; this also
    # avoids bs4's "no parser was explicitly specified" warning.
    review_text = BeautifulSoup(review, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]"," ", review_text)
    words = letters_only.lower().split()
    if remove_stopwords:
        # The original referenced a bare `stopwords` name that is never
        # imported; go through the `nltk` module that is imported instead.
        # The set is cached on the function so the corpus is read only once.
        stops = getattr(review_to_wordlist, "_stops", None)
        if stops is None:
            stops = frozenset(nltk.corpus.stopwords.words("english"))
            review_to_wordlist._stops = stops
        words = [w for w in words if w not in stops]
    return words

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: raw review text.
        tokenizer: object exposing ``tokenize(text)`` that returns the
            sentences of ``text`` (the notebook passes NLTK's Punkt tokenizer).
        remove_stopwords: forwarded to ``review_to_wordlist``; when True,
            English stop words are dropped from every sentence.

    Returns:
        list of list of str: one word list per non-empty sentence.
    """
    # Note: the tokenizer is created by the caller further down; building a
    # default one here might be more convenient.
    sentences = []
    for raw_sentence in tokenizer.tokenize(review.strip()):
        # Skip empty sentences.
        if len(raw_sentence) > 0:
            # Previously remove_stopwords was accepted but silently ignored;
            # pass it on so the flag actually takes effect.
            sentences.append( review_to_wordlist(raw_sentence, remove_stopwords) )
    return sentences

# Load the Punkt sentence tokenizer once; it is the same for every review, so
# there is no reason to request it again on each loop iteration.
punkt_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Build the Word2Vec training corpus from the labeled and unlabeled reviews.
# itertools.chain walks both columns without copying them into one big list.
sentences = []
for i, r in enumerate(itertools.chain(train_df['review'], unlabeled_train_df['review'])):
    sentences.extend(review_to_sentences(r, tokenizer=punkt_tokenizer))
    if i % 5000==0:
        print("{} complete".format(i))

len(sentences)

0 complete
5000 complete
10000 complete
15000 complete
20000 complete
25000 complete
30000 complete
35000 complete
40000 complete
45000 complete
50000 complete
55000 complete
60000 complete
65000 complete
70000 complete


795538

In [102]:
# Word2Vec reports its progress through the standard logging module.
import logging
# Logging is left switched off here; training takes roughly 3-4 minutes.
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.models import word2vec

# Hyper-parameters of the embedding model, gathered in one place.
word2vec_params = dict(
    size = 300,      # Word vector dimensionality
    window = 10,     # Context window size
    min_count = 40,  # Minimum word count
    sample = 1e-3,   # Downsample setting for frequent words
    workers = 4,     # Number of threads to run in parallel
)

# Initialize and train the model (this will take some time)
print("Training model...")
model = word2vec.Word2Vec(sentences, **word2vec_params)

# No further training is planned, so call init_sims to make the model
# much more memory-efficient.
model.init_sims(replace=True)

# Save under a descriptive name; it can be restored later with Word2Vec.load().
model_name = "300features_40minwords_10context"
model.save(model_name)
print("training complete")

2018-06-09 13:48:39,310 : INFO : 'pattern' package not found; tag filters are not available for English
2018-06-09 13:48:39,357 : INFO : collecting all words and their counts
2018-06-09 13:48:39,361 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-09 13:48:39,506 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types


Training model...


2018-06-09 13:48:39,709 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2018-06-09 13:48:39,871 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2018-06-09 13:48:39,990 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2018-06-09 13:48:40,098 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2018-06-09 13:48:40,207 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2018-06-09 13:48:40,323 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2018-06-09 13:48:40,433 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2018-06-09 13:48:40,539 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2018-06-09 13:48:40,667 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50

2018-06-09 13:48:50,589 : INFO : PROGRESS: at sentence #740000, processed 16552903 words, keeping 119668 word types
2018-06-09 13:48:50,728 : INFO : PROGRESS: at sentence #750000, processed 16771230 words, keeping 120295 word types
2018-06-09 13:48:50,870 : INFO : PROGRESS: at sentence #760000, processed 16990622 words, keeping 120930 word types
2018-06-09 13:48:51,000 : INFO : PROGRESS: at sentence #770000, processed 17217759 words, keeping 121703 word types
2018-06-09 13:48:51,240 : INFO : PROGRESS: at sentence #780000, processed 17447905 words, keeping 122402 word types
2018-06-09 13:48:51,411 : INFO : PROGRESS: at sentence #790000, processed 17674981 words, keeping 123066 word types
2018-06-09 13:48:51,505 : INFO : collected 123504 word types from a corpus of 17798082 raw words and 795538 sentences
2018-06-09 13:48:51,509 : INFO : Loading a fresh vocabulary
2018-06-09 13:48:52,484 : INFO : min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)
2018-06-09 13:

2018-06-09 13:49:49,864 : INFO : EPOCH 2 - PROGRESS: at 75.76% examples, 434396 words/s, in_qsize 6, out_qsize 1
2018-06-09 13:49:50,896 : INFO : EPOCH 2 - PROGRESS: at 78.61% examples, 430818 words/s, in_qsize 6, out_qsize 1
2018-06-09 13:49:51,928 : INFO : EPOCH 2 - PROGRESS: at 82.15% examples, 431075 words/s, in_qsize 7, out_qsize 1
2018-06-09 13:49:52,929 : INFO : EPOCH 2 - PROGRESS: at 86.25% examples, 434668 words/s, in_qsize 7, out_qsize 0
2018-06-09 13:49:53,942 : INFO : EPOCH 2 - PROGRESS: at 90.59% examples, 439122 words/s, in_qsize 7, out_qsize 1
2018-06-09 13:49:54,960 : INFO : EPOCH 2 - PROGRESS: at 95.17% examples, 443950 words/s, in_qsize 7, out_qsize 0
2018-06-09 13:49:55,987 : INFO : EPOCH 2 - PROGRESS: at 99.66% examples, 448311 words/s, in_qsize 5, out_qsize 1
2018-06-09 13:49:56,007 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-09 13:49:56,034 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-09 13:49:56,038 : I

2018-06-09 13:50:50,980 : INFO : EPOCH 5 - PROGRESS: at 12.12% examples, 506854 words/s, in_qsize 7, out_qsize 0
2018-06-09 13:50:51,983 : INFO : EPOCH 5 - PROGRESS: at 16.35% examples, 513948 words/s, in_qsize 7, out_qsize 0
2018-06-09 13:50:53,006 : INFO : EPOCH 5 - PROGRESS: at 20.51% examples, 513177 words/s, in_qsize 7, out_qsize 0
2018-06-09 13:50:54,070 : INFO : EPOCH 5 - PROGRESS: at 23.70% examples, 490699 words/s, in_qsize 7, out_qsize 0
2018-06-09 13:50:55,092 : INFO : EPOCH 5 - PROGRESS: at 25.45% examples, 451469 words/s, in_qsize 7, out_qsize 0
2018-06-09 13:50:56,139 : INFO : EPOCH 5 - PROGRESS: at 28.26% examples, 437371 words/s, in_qsize 7, out_qsize 0
2018-06-09 13:50:57,172 : INFO : EPOCH 5 - PROGRESS: at 31.32% examples, 430219 words/s, in_qsize 8, out_qsize 0
2018-06-09 13:50:58,177 : INFO : EPOCH 5 - PROGRESS: at 34.39% examples, 425658 words/s, in_qsize 7, out_qsize 0
2018-06-09 13:50:59,189 : INFO : EPOCH 5 - PROGRESS: at 37.68% examples, 424884 words/s, in_qsiz

In [126]:
# Quick qualitative checks of the embedding: odd-one-out and nearest neighbours.
doesnt_match_sentences = [
    "man woman child kitchen", 
    "france england germany berlin", 
    "paris berlin london austria",
    'movie actor actress director', 
    'movie actor actress', 
    'movie cinema film art',
    'actor actress director', 
]
# Query through model.wv: the model-level doesnt_match / most_similar wrappers
# are deprecated in gensim 3.x, and the vectors live on the KeyedVectors object.
for s in doesnt_match_sentences:
    print("{} => {}".format(s, model.wv.doesnt_match(s.split())))
print("---doesnt match over---")
for w in ['man', 'movie', 'soldier']:
    # Request exactly the five neighbours that are printed instead of
    # fetching the default ten and slicing.
    similar_words = ", ".join(word for word, _score in model.wv.most_similar(w, topn=5))
    print("'{}' is most similar with ({})".format(w, similar_words))
print("---most similar over---")

man woman child kitchen => kitchen
france england germany berlin => berlin
paris berlin london austria => paris
movie actor actress director => movie
movie actor actress => movie
movie cinema film art => art
actor actress director => director
---doesnt match over---
'man' is most similar with (woman, lady, monk, lad, farmer)
'movie' is most similar with (film, flick, movies, it, sequel)
'soldier' is most similar with (army, warrior, navy, marine, dictator)
---most similar over---


In [135]:
# The model trained earlier was saved to disk, so it can simply be reloaded
# here instead of being trained again.
# Side note: Keras models could probably be saved and reused the same way,
# since training is the slow part and prediction is not.
from gensim.models import Word2Vec
model = Word2Vec.load("300features_40minwords_10context")
# Look the vector up through model.wv; indexing the model object directly is
# deprecated in gensim 3.x.
print("the shape of each word vector: {}".format(model.wv['man'].shape))

2018-06-09 14:08:06,792 : INFO : loading Word2Vec object from 300features_40minwords_10context
2018-06-09 14:08:07,570 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2018-06-09 14:08:07,574 : INFO : setting ignored attribute vectors_norm to None
2018-06-09 14:08:07,576 : INFO : loading vocabulary recursively from 300features_40minwords_10context.vocabulary.* with mmap=None
2018-06-09 14:08:07,578 : INFO : loading trainables recursively from 300features_40minwords_10context.trainables.* with mmap=None
2018-06-09 14:08:07,580 : INFO : setting ignored attribute cum_table to None
2018-06-09 14:08:07,583 : INFO : loaded 300features_40minwords_10context


the shape of each word vector: (300,)


In [174]:
## after making train_df and test_df 
def makeFeatureVec(words, model, num_features):
    """Average the embedding vectors of the in-vocabulary words of one review.

    Args:
        words: iterable of word strings.
        model: trained embedding model; ``model.wv.index2word`` lists the
            vocabulary and ``model.wv[word]`` returns a word's vector.
        num_features: length of each word vector.

    Returns:
        numpy.ndarray of shape ``(num_features,)`` and dtype float32: the mean
        vector of the known words, or all zeros when no word of ``words`` is
        in the vocabulary.
    """
    # Running sum of the word vectors.
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0
    # The model's vocabulary, converted to a set for fast membership tests.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model.wv[word])
    if nwords == 0:
        # Dividing by zero here used to produce an all-NaN vector, which the
        # downstream classifier rejects; return the zero vector instead.
        return featureVec
    # Divide the sum by the number of words to get the average.
    return np.divide(featureVec, float(nwords))

def getAvgFeatureVecs(reviews, model, num_features):
    """Build one averaged word-vector row per review.

    Each review (a list of words) is passed to ``makeFeatureVec`` and the
    result is stored as a row of the returned matrix. A progress line is
    printed for every 5000th review.

    Args:
        reviews: sequence of word lists.
        model: embedding model passed through to ``makeFeatureVec``.
        num_features: length of each word vector (300 was used when the
            Word2Vec model in this notebook was trained).

    Returns:
        numpy.ndarray of shape ``(len(reviews), num_features)``, dtype float32.
    """
    # Open question from the author: num_features could be read from the
    # model instead of being passed in.
    total = len(reviews)
    # Preallocate the result matrix, for speed.
    averaged = np.zeros((total, num_features), dtype="float32")
    for row, tokens in enumerate(reviews):
        at_checkpoint = (row % 5000. == 0.)
        if at_checkpoint:
            print("Review {:.0f} of {:.0f}".format(row, total))
        averaged[row] = makeFeatureVec(tokens, model, num_features)
    return averaged

# Tokenise every labeled review with stop words removed, then average its word vectors.
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train_df["review"]
]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features=300)
print("---train data over---")

# Same treatment for the test reviews.
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test_df["review"]
]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features=300)
print("---test data over---")

#######
### random forest fitting
#######
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500)
print("Fitting a random forest to labeled training data...")
rf.fit(trainDataVecs, train_df["sentiment"])

# Predict the test sentiment and write the submission file.
test_predictions = rf.predict(testDataVecs)
output = pd.DataFrame(data={"id": test_df["id"], "sentiment": test_predictions})
output.to_csv("Word2Vec_AverageVectors.csv", index=False, quoting=3)
print('complete')


Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
---train data over---
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 o

In [212]:
# using clustering after word-embedding 
from sklearn.cluster import KMeans
import time

start = time.time()  # start of the timed section
word_vectors = model.wv.syn0
# One cluster per five vocabulary words.
n_clusters = word_vectors.shape[0] // 5
kmeans_clustering = KMeans(n_clusters=n_clusters)
# idx[i] is the cluster label assigned to the i-th row of word_vectors.
idx = kmeans_clustering.fit_predict(word_vectors)
elapsed = time.time() - start
print("Time taken for K Means clustering: {} seconds".format(elapsed))
##### clustering over

# word -> cluster label
word_centroid_map = {word: label for word, label in zip(model.wv.index2word, idx)}

# cluster label -> list of the words assigned to it, with one (possibly
# empty) entry for every label between the smallest and largest one seen.
assigned_labels = word_centroid_map.values()
lowest, highest = min(assigned_labels), max(assigned_labels)
word_centroid_map_grouped_dict = {}
for cluster in range(lowest, highest + 1):
    word_centroid_map_grouped_dict[cluster] = []
for w, k in word_centroid_map.items():
    word_centroid_map_grouped_dict[k].append(w)

# Print the first few clusters (labels up to and including 10) for inspection.
for k, members in word_centroid_map_grouped_dict.items():
    if k > 10:
        break
    print("cluster {}:".format(k))
    print(members)
    print("-------")
### 
def create_bag_of_centroids( wordlist, word_centroid_map, num_centroids=None ):
    """Count how many words of a review fall into each word cluster.

    Args:
        wordlist: iterable of word strings.
        word_centroid_map: dict mapping a word to its integer cluster index.
        num_centroids: optional length of the returned vector. When omitted
            it is derived as ``max(word_centroid_map.values()) + 1``, which
            scans the whole map; callers that process many reviews can pass
            the value once to avoid repeating that scan.

    Returns:
        numpy.ndarray of dtype float32 whose k-th entry is the number of
        words in ``wordlist`` mapped to cluster k. Words missing from the map
        are ignored. An empty map with no ``num_centroids`` yields an empty
        array instead of raising.
    """
    if num_centroids is None:
        # default=-1 turns an empty map into a zero-length vector.
        num_centroids = max(word_centroid_map.values(), default=-1) + 1
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    for word in wordlist:
        # A single lookup replaces the membership test plus indexing.
        centroid = word_centroid_map.get(word)
        if centroid is not None:
            bag_of_centroids[centroid] += 1
    return bag_of_centroids
# Every review becomes a vector with one slot per word cluster.
n_centroid_features = word_vectors.shape[0] // 5

# Training set: preallocate the matrix, then fill it one review at a time.
train_shape = (train_df["review"].size, n_centroid_features)
train_centroids = np.zeros(train_shape, dtype="float32")
for row, tokens in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(tokens, word_centroid_map)

# Test set: same procedure.
test_shape = (test_df["review"].size, n_centroid_features)
test_centroids = np.zeros(test_shape, dtype="float32")
for row, tokens in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(tokens, word_centroid_map)

# Fit a random forest on the centroid counts (this may take a few minutes).
rf = RandomForestClassifier(n_estimators=200)
print("Fitting a random forest to labeled training data...")
rf.fit(train_centroids, train_df["sentiment"])

# Predict the test sentiment and write the submission file.
centroid_predictions = rf.predict(test_centroids)
output = pd.DataFrame(data={"id": test_df["id"], "sentiment": centroid_predictions})
output.to_csv("BagOfCentroids.csv", index=False, quoting=3)
print('complete')

Fitting a random forest to labeled training data...
complete
