In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import re 
from bs4 import BeautifulSoup 
import nltk
import itertools

# Folder that holds the three review TSV files. Change this single constant
# to run the notebook on another machine.
DATA_DIR = '/Users/frhyme/Downloads'

# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as
# literal text instead of being interpreted as field delimiters.
train_url = DATA_DIR + '/labeledTrainData.tsv'
train_df = pd.read_csv(train_url, delimiter='\t', header=0, quoting=3)

unlabeled_train_url = DATA_DIR + '/unlabeledTrainData.tsv'
unlabeled_train_df = pd.read_csv(unlabeled_train_url, delimiter='\t', header=0, quoting=3)

test_url = DATA_DIR + '/testData.tsv'
test_df = pd.read_csv(test_url, delimiter='\t', header=0, quoting=3)
print("----reading tsv over----")

In [12]:
"""
"""
def review_to_wordlist(review, remove_stopwords=False):
    """Turn one raw review string into a list of lowercase words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lowercase, split on whitespace, and optionally drop English
    stop words.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML.
    remove_stopwords : bool, default False
        When True, words found in NLTK's English stop-word list are removed.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser (lxml / html5lib) happens to be installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Digits and punctuation become spaces, so "it's" splits into "it", "s".
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        # `stopwords` is not imported at the top of the notebook; import it
        # here so this branch no longer raises NameError.
        from nltk.corpus import stopwords
        stops = set(stopwords.words("english"))  # set gives O(1) membership tests
        words = [w for w in words if w not in stops]
    return words
def review_to_sentences(review, tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'), remove_stopwords=False):
    """Split a review into sentences and return each sentence as a word list.

    Parameters
    ----------
    review : str
        Raw review text.
    tokenizer : object with a ``tokenize(str)`` method
        Sentence splitter. The default is loaded once, when this ``def`` is
        executed, and then shared by every call; creating it here as the
        default avoids declaring a separate tokenizer further down.
    remove_stopwords : bool, default False
        Forwarded to ``review_to_wordlist`` for every sentence.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence.
    """
    # Author's note: the tokenizer splits surprisingly well even where the
    # text has no period.
    raw_sentences = tokenizer.tokenize(review.strip())
    return [
        # Pass the flag through; previously it was accepted but ignored.
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0  # skip empty sentences
    ]



----cleaning sentence over----


KeyboardInterrupt: 

In [None]:
"""
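- To train the word embedding, first use every available review (labeled train, test, and unlabeled train) to build the cleaned word lists and sentence lists.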
"""
# Whole-review word lists: one list of words per review.
for df in [train_df, test_df, unlabeled_train_df]:
    df['cleaned_review'] = df['review'].apply(review_to_wordlist)
print("----cleaning sentence over----")
# Per-sentence word lists: one list of sentences per review, each sentence
# itself a list of words.
for df in [train_df, test_df, unlabeled_train_df]:
    df['sentences'] = df['review'].apply(review_to_sentences)
print("----make sentence over----")

# Flatten one level so `sentences` is a list of tokenized sentences.
# Flattening 'cleaned_review' here instead would yield a flat list of single
# words, because each of its entries is already a plain word list.
sentences = []
for df in [train_df, test_df, unlabeled_train_df]:
    sentences.extend(itertools.chain.from_iterable(df['sentences']))
len(sentences)

In [6]:
from gensim.models import word2vec
print("----word embedding model Training----")
# NOTE(review): this rebinds `sentences` to one word list per labeled training
# review, replacing any value the name held before this cell ran.
sentences = list(train_df['cleaned_review'])  # closing parenthesis was missing (SyntaxError)
# NOTE(review): `size` and `init_sims` are the pre-4.0 gensim spellings; newer
# releases use `vector_size` and deprecate `init_sims` - confirm the installed version.
model = word2vec.Word2Vec(sentences,
                          workers = 4,    # Number of threads to run in parallel
                          size = 500,     # Word vector dimensionality
                          min_count = 40, # Minimum word count
                          window = 10,    # Context window size
                          sample = 1e-3,  # Downsample setting for frequent words
                         )
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)
# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "500features_40minwords_10context"
# Because the model is saved, it can be reloaded later with
# model = Word2Vec.load("500features_40minwords_10context").
model.save(model_name)
print("----word embedding model complete----")

0        [watching, time, chasers, it, obvious, that, i...
1        [i, saw, this, film, about, years, ago, and, r...
2        [minor, spoilersin, new, york, joan, barnard, ...
3        [i, went, to, see, this, film, with, a, great,...
4        [yes, i, agree, with, everyone, on, this, site...
5        [jennifer, ehle, was, sparkling, in, pride, an...
6        [amy, poehler, is, a, terrific, comedian, on, ...
7        [a, plane, carrying, employees, of, a, large, ...
8        [a, well, made, gritty, science, fiction, movi...
9        [incredibly, dumb, and, utterly, predictable, ...
10       [after, reading, the, comments, for, this, mov...
11       [it, s, hard, to, describe, elfen, lied, to, s...
12       [of, all, the, bile, inducing, vomitoriums, to...
13       [this, is, quite, an, underrated, hitchcock, m...
14       [being, a, huge, gary, oldman, fan, i, had, hi...
15       [for, the, most, part, the, acting, was, poorl...
16       [ram, gopal, varma, does, it, again, yet, anot.

In [7]:
# Shape of the cleaned-review column: a 1-tuple holding the number of reviews.
train_df.loc[:, 'cleaned_review'].shape

(25000,)

In [10]:
# Preview only the first few entries; displaying the whole object array prints
# every word of every review and floods the notebook output.
train_df['cleaned_review'].values[:3]

array([ list(['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again', 'maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent', 'moonwalker', 'is', 'part', 'biography', 'part', 'feature', 'film', 'which', 'i', 'remember', 'going', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'was', 'originally', 'released', 'some', 'of', 'it', 'has', 'subtle', 'messages', 'about', 'mj', 's', 'feeling', 'towards', 'the', 'press', 'and', 'also', 'the', 'obvious', 'message', 'of', 'drugs', 'are', 'bad', 'm', 'kay', 'visually', 'impressive', 'but', 'of', 'course', 'this', 'is', 'al