In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('data/all.csv')

In [4]:
df.head()

Unnamed: 0,id,review,sentiment
0,train_5814_8,With all this stuff going down at the moment w...,1
1,train_2381_9,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,train_7759_3,The film starts with a manager (Nicholas Bell)...,0
3,train_3630_4,It must be assumed that those who praised this...,0
4,train_9495_8,Superbly trashy and wondrously unpretentious 8...,1


In [5]:
import word2vec

In [6]:
model = word2vec.load('data/doc2vec-stop-words.bin')

In [7]:
model.vectors.shape

(278170, 300)

In [8]:
is_train = np.char.startswith(model.vocab, '_*train')

In [9]:
is_test = np.char.startswith(model.vocab, '_*test')

In [10]:
model.vocab[is_train]

array([u'_*train_5868_8', u'_*train_9584_10', u'_*train_11688_10', ...,
       u'_*train_4243_2', u'_*train_5968_4', u'_*train_12182_1'], 
      dtype='<U78')

In [11]:
model.vocab[is_test]

array([u'_*test_12311_10', u'_*test_8348_2', u'_*test_5828_4', ...,
       u'_*test_2531_1', u'_*test_7772_8', u'_*test_11465_10'], 
      dtype='<U78')

In [12]:
train_idx = np.nonzero(is_train)
test_idx = np.nonzero(is_test)

In [13]:
model.vocab[train_idx]

array([u'_*train_5868_8', u'_*train_9584_10', u'_*train_11688_10', ...,
       u'_*train_4243_2', u'_*train_5968_4', u'_*train_12182_1'], 
      dtype='<U78')

In [14]:
model.vocab[test_idx]

array([u'_*test_12311_10', u'_*test_8348_2', u'_*test_5828_4', ...,
       u'_*test_2531_1', u'_*test_7772_8', u'_*test_11465_10'], 
      dtype='<U78')

In [15]:
model.vectors[train_idx].shape, model.vectors[test_idx].shape

((25000, 300), (25000, 300))

In [16]:
X = model.vectors[train_idx]

In [17]:
X.shape

(25000, 300)

In [18]:
y = []
for i in model.vocab[train_idx]:
    row = df[df['id'] == i[2:]]
    sentiment = row.sentiment.values[0]
    y.append(sentiment)

In [19]:
len(y)

25000

In [27]:
for idx in np.random.random_integers(25000, size=(25,)):
    id_ = model.vocab[train_idx][idx]
    y_ = y[idx]
    row = df[df['id'] == id_[2:]] 
    sentiment = row.sentiment.values[0]
    assert sentiment == y_

## sklearn

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [41]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=0)

In [42]:
rfc = RandomForestClassifier(n_estimators=100)

In [43]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [44]:
accuracy_score(y_val, rfc.predict(X_val))

0.7104242424242424

## submit

In [45]:
rfc = RandomForestClassifier(n_estimators=100)

In [46]:
rfc.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [47]:
test_ids = model.vocab[is_test]

In [48]:
test_ids

array([u'_*test_12311_10', u'_*test_8348_2', u'_*test_5828_4', ...,
       u'_*test_2531_1', u'_*test_7772_8', u'_*test_11465_10'], 
      dtype='<U78')

In [51]:
X_test = model.vectors[is_test]

In [52]:
y_test = rfc.predict(X_test)

In [55]:
submission = pd.read_csv('data/sampleSubmission.csv')

In [66]:
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,0
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,0


In [76]:
a = test_ids.astype(str).astype(object)
a

array(['_*test_12311_10', '_*test_8348_2', '_*test_5828_4', ...,
       '_*test_2531_1', '_*test_7772_8', '_*test_11465_10'], dtype=object)

In [78]:
b = submission['id'].apply(lambda x: '_*test_%s' % x).values
b

array(['_*test_12311_10', '_*test_8348_2', '_*test_5828_4', ...,
       '_*test_2531_1', '_*test_7772_8', '_*test_11465_10'], dtype=object)

In [80]:
assert (a == b).all()

In [81]:
submission['sentiment'] = y_test

In [83]:
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,1


In [88]:
submission.sentiment = submission.sentiment.astype(int)

In [89]:
submission.to_csv('rf-submission.csv', index=None, quoting=2)