In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train_path = os.path.join('kaggle-data', 'labeledTrainData.tsv')
test_path = os.path.join('kaggle-data', 'testData.tsv')
train = pd.read_csv(train_path, delimiter = '\t')
test = pd.read_csv(test_path, delimiter = '\t')
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
X_train, X_test, y_train, y_test =  train_test_split(train['review'], train['sentiment'], 
                                                     test_size = 0.2, random_state = 1234)

In [4]:
vectorizer = TfidfVectorizer(# ngram_range = (1, 2), 
                             stop_words = 'english', 
                             max_features = 5000)
logistic = LogisticRegression()
pipe = Pipeline([
    ('vectorizer', vectorizer),
    ('logistic', logistic)
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
roc_auc_score(y_test, y_pred)

0.87686237545015278

In [5]:
result = pipe.predict(test['review'])
output = pd.DataFrame(data= {'id': test['id'], 'sentiment': result})
output.to_csv('submission.csv', index = False) 

In [6]:
# import gensim
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument

def doc2vec_preprocess(corpus):
    for i, line in enumerate(corpus):
        # simple_preprocess
        # tokenize text into individual words, 
        # remove punctuation, set to lowercase
        preprocessed = simple_preprocess(line)
        yield TaggedDocument(preprocessed, [i])

In [7]:
from gensim.models.doc2vec import TaggedLineDocument

In [8]:
from tqdm import trange
from gensim.models import Doc2Vec
# train_corpus = doc2vec_preprocess(X_train)
# train_corpus = list(train_corpus) # hacky .....

corpus = doc2vec_preprocess(train['review'])
corpus = list(corpus) # hacky ....., modify from TaggedLineDocument perhaps

model = Doc2Vec(min_count = 1, window = 10, size = 100, 
                sample = 1e-4, negative = 5, workers = 7)
model.build_vocab(corpus)
for _ in trange(8):
    model.train(corpus)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha

100%|██████████| 8/8 [02:31<00:00, 18.98s/it]


In [13]:
inferred_vectors = []
for doc in corpus:
    # each line is a TaggedDocument namedtuple, where we can
    # access the words attribute and infer their word vectors
    inferred_vector = model.infer_vector(doc.words)
    inferred_vectors.append(inferred_vector)
    
inferred_vectors = np.asarray(inferred_vectors)
X_train_vector = inferred_vectors[X_train.index]
X_test_vector = inferred_vectors[X_test.index]

In [14]:
logistic = LogisticRegression()
logistic.fit(X_train_vector, y_train)
y_pred = logistic.predict(X_test_vector)
roc_auc_score(y_test, y_pred)

0.78299103383555835

In [None]:
hi

In [None]:
from collections import Counter

ranks = []
for idx, doc in enumerate(train_corpus):
    # each line is a TaggedDocument namedtuple, where we can
    # access the words attribute and infer their word vectors
    inferred_vector = model.infer_vector(doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn = model.corpus_count)
    rank = [doc_id for doc_id, _ in sims].index(idx)
    ranks.append(rank)

Counter(ranks)

In [None]:
model.docvecs

# Reference

- [Notebook: Doc2Vec Tutorial on the Lee Dataset](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb)