In [147]:
from gensim.models import Word2Vec
import json
import pandas as pd
import numpy as np
import nltk
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from nltk.corpus import stopwords

In [148]:
def get_auc(Y_test, predicted):
    """Return the area under the ROC curve.

    Y_test    -- true labels, passed straight to ``metrics.roc_curve``.
    predicted -- 2-D array of scores; only column 1 (``predicted[:, 1]``)
                 is used as the score of the positive class.
    """
    positive_scores = predicted[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(Y_test, positive_scores)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [149]:
# Load the positive and negative review collections. Only the dictionary
# keys are used below, i.e. the keys are treated as the review texts.
with open('pos_reviews.json') as data_file:
    data_pos = json.load(data_file)
with open('neg_reviews.json') as data_file:
    data_neg = json.load(data_file)

# Materialize the keys as a list before slicing: on Python 3 dict.keys()
# returns a view that cannot be sliced; on Python 2 this is equivalent to
# the original expression.
pos_texts = list(data_pos.keys())[:1000]
neg_texts = list(data_neg.keys())[:1000]
all_texts = pos_texts + neg_texts

N_POS_TEXTS = len(pos_texts)
N_NEG_TEXTS = len(neg_texts)
# Labels in the same order as all_texts: 1 for positive, 0 for negative
target = [1]*N_POS_TEXTS + [0]*N_NEG_TEXTS

# Hold out 30% of the documents for evaluation.
# NOTE(review): no random_state is passed, so the split (and every score
# computed from it) differs between runs - consider fixing a seed.
train_texts, test_texts, train_target, test_target = train_test_split(all_texts, target, test_size=0.3)

In [150]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # needed later to split texts into sentences

In [151]:
# English stop words from the NLTK stopwords corpus, kept in a set;
# review_to_words filters tokens against this module-level name.
stops = set(stopwords.words("english"))

def review_to_sentences(review, tokenizer):
    """Split a review into sentences and return them as lists of words.

    The stripped review text is handed to ``tokenizer.tokenize``; every
    non-empty piece it yields is converted with ``review_to_words``.
    Returns a list of word lists, one per sentence.
    """
    pieces = tokenizer.tokenize(review.strip())
    # Empty pieces are skipped; all others become one word list each
    return [review_to_words(piece) for piece in pieces if len(piece) > 0]

def review_to_words(review):
    """Split text on whitespace and drop tokens present in the global ``stops`` set.

    Tokens are compared as-is (no lower-casing or punctuation stripping).
    """
    return [token for token in review.split() if token not in stops]

In [152]:
%%time
# Flat list that collects the tokenized sentences of every review in all_texts
sentences = []
print("Parsing sentences from training set")
for review in all_texts:
    sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set
CPU times: user 4.56 s, sys: 755 ms, total: 5.32 s
Wall time: 5.34 s


In [153]:
# Word2Vec hyper-parameters (passed to the Word2Vec constructor below)
num_workers = 4        # worker threads used in parallel
num_features = 300     # dimensionality of each word vector
context = 10           # size of the context window
min_word_count = 40    # passed as min_count (minimum word count)
downsampling = 1e-3    # passed as sample (downsampling of frequent words)

In [154]:
%%time
# Build and train the Word2Vec model on the parsed sentences
# (this will take some time)
print("Training model...")
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

Training model...
CPU times: user 12.5 s, sys: 742 ms, total: 13.3 s
Wall time: 4.79 s


In [155]:
# If we are not going to train the model any further, it is better to
# finalize/cache it.
# NOTE(review): per the gensim documentation, replace=True keeps only the
# normalized vectors - confirm no further training is intended.
model.init_sims(replace=True)

In [156]:
# One token list per training document (whitespace split, stop words removed)
train_sentences = [review_to_words(train_text) for train_text in train_texts]

In [157]:
# One token list per held-out document (whitespace split, stop words removed)
test_sentences = [review_to_words(test_text) for test_text in test_texts]

In [158]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one document.

    words        -- iterable of tokens making up the document.
    model        -- object exposing ``index2word`` (the vocabulary) and
                    ``model[word]`` vector lookup.
    num_features -- length of the word vectors.

    Returns an array of length ``num_features``. When none of the words is
    in the vocabulary the all-zero vector is returned; the previous code
    divided by zero in that case and produced a vector of NaNs.
    """
    # Running sum of the word vectors, starting from zeros
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    # index2word holds the vocabulary words; a set gives fast membership tests
    index2word_set = set(model.index2word)
    # Add the vector of every word that is present in the vocabulary
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model[word])
    # Nothing to average: keep the zero vector instead of computing 0/0
    if nwords == 0:
        return featureVec
    # Divide the sum by the number of words that contributed to it
    return np.divide(featureVec, float(nwords))


def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of average word vectors for a collection of documents.

    reviews      -- sequence of documents (``len(reviews)`` is used), each
                    a list of tokens handed to ``makeFeatureVec``.
    model        -- word-vector model forwarded to ``makeFeatureVec``.
    num_features -- length of the word vectors.

    Returns a float32 array of shape ``(len(reviews), num_features)`` whose
    row ``i`` is the average vector of ``reviews[i]``.
    """
    # Preallocate the result (for speed) instead of growing it row by row
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    # The row index must be an integer: the former float counter is rejected
    # as an array index by current NumPy.
    for counter, review in enumerate(reviews):
        # Print a status message every 5000th review
        if counter % 5000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))
        # Store the average vector of this review in its row
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
    return reviewFeatureVecs

In [159]:
# Feature matrix for the training documents: one averaged word vector per row
trainDataVecs = getAvgFeatureVecs(train_sentences, model, num_features )

Review 0 of 1400


In [160]:
# Feature matrix for the held-out documents: one averaged word vector per row
testDataVecs = getAvgFeatureVecs(test_sentences, model, num_features )

Review 0 of 600


In [161]:
# Sanity check: (documents, features) for both feature matrices.
# print() with a single argument behaves the same on Python 2 and 3 and
# matches the print(...) calls used elsewhere in this notebook.
print(trainDataVecs.shape)
print(testDataVecs.shape)

(1400, 300)
(600, 300)


In [162]:
# Sanity check: the label lists must be as long as the feature matrices.
# print() with a single argument behaves the same on Python 2 and 3.
print(len(train_target))
print(len(test_target))

1400
600


In [163]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the averaged word vectors and report the
# ROC AUC on the held-out documents.
model_logistic = LogisticRegression()
model_logistic.fit(trainDataVecs, train_target)
predicted = model_logistic.predict_proba(testDataVecs)
# print() call form works on both Python 2 and Python 3
print(get_auc(test_target, predicted))

0.666251778094


In [164]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, fitted using two parallel jobs
forest = RandomForestClassifier( n_estimators = 100, n_jobs=2)

forest = forest.fit( trainDataVecs, train_target )
# Class probabilities for the held-out documents (column 1 is used by get_auc)
result = forest.predict_proba( testDataVecs )

# print() call form works on both Python 2 and Python 3
print(result)
print(get_auc(test_target, result))

[[ 0.66  0.34]
 [ 0.67  0.33]
 [ 0.65  0.35]
 ..., 
 [ 0.43  0.57]
 [ 0.77  0.23]
 [ 0.2   0.8 ]]
0.723467505334


In [165]:
from sklearn import svm

%time svm_model = svm.SVC(probability=True).fit( trainDataVecs, train_target )
result = svm_model.predict_proba(testDataVecs)

print result
print get_auc(test_target, result)

CPU times: user 3.42 s, sys: 58.4 ms, total: 3.48 s
Wall time: 3.49 s
[[ 0.4868968   0.5131032 ]
 [ 0.4866472   0.5133528 ]
 [ 0.48693244  0.51306756]
 ..., 
 [ 0.48681221  0.51318779]
 [ 0.48670943  0.51329057]
 [ 0.48695707  0.51304293]]
0.349878867354


# Krot data

In [55]:
# Labeled training data: tab-separated, header in the first row.
# quoting=3 is the numeric value of csv.QUOTE_NONE.
train = pd.read_csv(
    "labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [56]:
# Unlabeled training data: tab-separated, header in the first row.
# quoting=3 is the numeric value of csv.QUOTE_NONE.
unlabeled_train = pd.read_csv(
    "unlabeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [77]:
# Test data: tab-separated, header in the first row.
# quoting=3 is the numeric value of csv.QUOTE_NONE.
test = pd.read_csv(
    "testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [57]:
import re 
from bs4 import BeautifulSoup

# Cache for the English stop-word set, so that it is built once instead of
# being reloaded from NLTK on every call with remove_stopwords=True.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = set(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def review_to_wordlist( review_text, remove_stopwords=False ):
    """Convert a raw review into a list of lower-case words.

    review_text      -- raw review text, possibly containing HTML markup.
    remove_stopwords -- when true, English stop words are dropped.

    Returns the list of words in their original order.
    """
    # 1. strip the markup. The parser is named explicitly so the result does
    #    not depend on which parsers are installed and so BeautifulSoup does
    #    not warn about a missing parser argument.
    review_text = BeautifulSoup(review_text, "html.parser").get_text()
    #
    # 2. replace everything that is not a latin letter with a space
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. lower-case the text (not strictly required) and split it into words
    words = review_text.lower().split()
    #
    # 4. remove stop words (optional)
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    #
    # 5. return the list of words
    return words

def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences and return them as lists of words.

    This definition replaces the two-argument ``review_to_sentences``
    defined earlier in the notebook. The stripped review is split with
    ``tokenizer.tokenize``; each non-empty sentence is converted by
    ``review_to_wordlist``, which receives ``remove_stopwords`` unchanged.
    Returns a list of word lists, one per sentence.
    """
    raw_sentences = tokenizer.tokenize(review.strip())
    # Empty sentences are skipped; all others become one word list each
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

In [66]:
%%time
sentences = []  # Initialize an empty list of sentences
print("Parsing sentences from training set")

for review in train["review"]:
    # Byte strings (the Python 2 str type) are decoded, dropping invalid
    # UTF-8 sequences; text that is already unicode (always the case on
    # Python 3) has no usable .decode and is passed through unchanged.
    if isinstance(review, bytes):
        review = review.decode('utf-8', 'ignore')
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set
CPU times: user 1min 27s, sys: 3 s, total: 1min 30s
Wall time: 1min 30s


In [67]:
%%time
print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    # Byte strings (the Python 2 str type) are decoded, dropping invalid
    # UTF-8 sequences; text that is already unicode (always the case on
    # Python 3) has no usable .decode and is passed through unchanged.
    if isinstance(review, bytes):
        review = review.decode('utf-8', 'ignore')
    sentences += review_to_sentences(review, tokenizer)

  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP cl

Parsing sentences from unlabeled set
CPU times: user 2min 51s, sys: 7.21 s, total: 2min 58s
Wall time: 2min 59s


In [None]:
# Word2Vec hyper-parameters (passed to the Word2Vec constructor below)
num_workers = 4        # worker threads used in parallel
num_features = 300     # dimensionality of each word vector
context = 10           # size of the context window
min_word_count = 40    # passed as min_count (minimum word count)
downsampling = 1e-3    # passed as sample (downsampling of frequent words)

In [None]:
%%time
# Build and train the Word2Vec model on the sentences parsed from the
# labeled and unlabeled sets (this will take some time)
print("Training model...")
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

In [None]:
# If we are not going to train the model any further, it is better to
# finalize/cache it.
# NOTE(review): per the gensim documentation, replace=True keeps only the
# normalized vectors - confirm no further training is intended.
model.init_sims(replace=True)

In [71]:
%%time
# One stop-word-free token list per labeled training review
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]

CPU times: user 18.1 s, sys: 1.95 s, total: 20.1 s
Wall time: 20.3 s


In [74]:
# Feature matrix for the labeled training reviews: one averaged word vector per row
%time trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
CPU times: user 44.4 s, sys: 735 ms, total: 45.2 s
Wall time: 45.2 s


In [78]:
%%time
print("Creating average feature vecs for test reviews")
# One stop-word-free token list per test review
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]

Creating average feature vecs for test reviews
CPU times: user 16.7 s, sys: 1.02 s, total: 17.7 s
Wall time: 17.7 s


In [79]:
# Feature matrix for the test reviews: one averaged word vector per row
%time testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
CPU times: user 44.2 s, sys: 405 ms, total: 44.6 s
Wall time: 44.7 s


In [81]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier( n_estimators = 100, n_jobs=2)

print("Fitting a random forest to labeled training data...")
%time forest = forest.fit( trainDataVecs, train["sentiment"] )

# Test & extract results 
result = forest.predict( testDataVecs )

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )

Fitting a random forest to labeled training data...
CPU times: user 43.9 s, sys: 186 ms, total: 44.1 s
Wall time: 22.3 s
