In [1]:
from __future__ import division, print_function
import numpy as np

%load_ext autoreload
%autoreload 2

In [2]:
# format: phrase|phrase_id
dictionary_file = 'sample_data/stanfordSentimentTreebank/dictionary.txt'

# format: phrase_id|score
phrases_and_scores = 'sample_data/stanfordSentimentTreebank/sentiment_labels.txt'

In [3]:
import codecs
with codecs.open(dictionary_file, encoding='utf8') as phrases:
    phrase_map = {}
    for l in phrases:
        phrase, phrase_id = l.strip().split('|')
        phrase_map[phrase] = int(phrase_id)
    

In [4]:
# TASK (1) - create a lookup index for the phrases - DONE!

In [5]:
phrase_map.items()[:10]

[(u'with restraint', 13162),
 (u'who is utterly unlikeable', 178229),
 (u'smart , edgy', 37041),
 (u'have been Problem Child', 160805),
 (u'will want to see over and over again .', 238800),
 (u'proves unrelentingly grim -- and equally engrossing', 90803),
 (u'just feels generic', 164309),
 (u'to bring happiness to their loved ones', 39150),
 (u'woods', 13239),
 (u'clotted', 117111)]

In [6]:
# TASK (2) - map the phrases to their scores
with codecs.open(phrases_and_scores, encoding='utf8') as phrases:
    phrase_scores = {}
    for l in phrases.read().split('\n')[1:-1]:
        phrase, phrase_score = l.strip().split('|')
        phrase_scores[int(phrase)] = float(phrase_score)

# task 2a (optional) - map the scores in range (0-1) to five classes (1-5 or 0-4)

In [7]:
len(phrase_scores.keys()) == len(phrase_map.keys())
type(phrase_scores.items()[0][0]) == 'unicode'
type(phrase_scores.items()[0][0]) == int
phrase_scores.items()[0][1]

0.5

In [8]:
# map phrase ids to scores
id_to_score = {k:phrase_scores[v] for k,v in phrase_map.items()}

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# TASK (3) - extract unigram features for each phrase from the dataset
# hint: use - sklearn.feature_extraction.text import CountVectorizer
# http://scikit-learn.org/stable/modules/feature_extraction.html

corpus = [k for k,v in phrase_map.items()]

In [10]:
corpus[:10]

[u'with restraint',
 u'who is utterly unlikeable',
 u'smart , edgy',
 u'have been Problem Child',
 u'will want to see over and over again .',
 u'proves unrelentingly grim -- and equally engrossing',
 u'just feels generic',
 u'to bring happiness to their loved ones',
 u'woods',
 u'clotted']

In [11]:
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
y = [phrase_scores[v] for k,v in phrase_map.items()]

In [12]:
X.shape

(239232, 18021)

In [47]:
len(y)
y[:10]

[0.41667,
 0.38889,
 0.72222,
 0.5,
 0.95833,
 0.36111,
 0.30556,
 0.73611,
 0.5,
 0.30556]

In [14]:
X.shape
small_X = X[:1000]
small_y = y[:1000]

random_X = np.random.random(small_X.shape)
random_y = np.random.random(len(small_y))

In [15]:
# TASK (4) - split your data into train and test
# Hint: use python indexing syntax, or http://scikit-learn.org/stable/modules/cross_validation.html
from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    small_X, small_y, test_size=0.3, random_state=0)
# X_train, X_test, y_train, y_test = cross_validation.train_test_split(
#     random_X, random_y, test_size=0.3, random_state=0)


In [16]:
# TASK (5) - use sklearn to train a classifier or regression model on your training data
from sklearn.ensemble import RandomForestRegressor
classifier = RandomForestRegressor()
classifier.fit(X_train.toarray(), y_train)

RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False,
           random_state=None, verbose=0)

In [18]:
# TASK (6) - evaluate your performance
y_default_test = np.zeros(len(y_test)) + 0.5



In [19]:
y_default_test[:10]

array([ 0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5])

In [20]:
y_hat = classifier.predict(X_test.toarray())

In [21]:
from sklearn.metrics import mean_squared_error

In [23]:
mean_squared_error(y_default_test, y_hat)

0.0047895687436166675

In [None]:
# with 1000 instances: 0.037500130068509996
# random 1000: 0.089035329288528336