```bash
# install fastText
git clone https://github.com/facebookresearch/fastText.git
cd fastText
make

# upgrade tensorflow to version 1.2.1 for python3 on mac
pip install --upgrade \
 https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.1-py3-none-any.whl
 
pip install dask_searchcv
```

# Text Classification with Word2vec



http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

In [1]:
import os
import numpy as np
import pandas as pd
from subprocess import call
from joblib import cpu_count
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter('ignore')
from gensim.models import Word2Vec

# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,joblib,matplotlib,gensim,xgboost

Using TensorFlow backend.


Ethen 2017-07-17 16:11:40 

CPython 3.5.2
IPython 5.4.1

numpy 1.13.1
pandas 0.20.2
sklearn 0.18.1
joblib 0.11
matplotlib 2.0.2
gensim 2.2.0
xgboost 0.6


In [2]:
def get_data():
    """download Reuters' text categorization benchmarks from its url"""
    
    train_data = 'r8-train-no-stop.txt'
    test_data = 'r8-test-no-stop.txt'
    concat_data = 'r8-no-stop.txt'
    base_url = 'http://www.cs.umb.edu/~smimarog/textmining/datasets/'
    
    # brew install wget
    # on a mac if you don't have it
    if not os.path.isfile(train_data):
        call('wget ' + base_url + train_data, shell = True)

    if not os.path.isfile(test_data):
        call('wget ' + base_url + test_data, shell = True)

    if not os.path.isfile(concat_data):
        # concatenate train and test files, we'll make our own train-test splits
        # the > piping symbol directs the concatenated file to a new file, it
        # will replace the file if it already exists; on the other hand, the >> symbol
        # will append if it already exists
        call('cat r8-*-no-stop.txt > ' + concat_data, shell = True)


get_data()

In [3]:
def download_glove():
    """download GloVe word vector representations, this step may take a while"""

    # bunch of small embeddings - trained on 6B tokens - 822 MB download, 2GB unzipped
    glove_small = 'glove.6B.zip'
    base_url = 'http://nlp.stanford.edu/data/'
    if not os.path.isfile(glove_small):
        call('wget ' + base_url + glove_small, shell = True)
        call('unzip ' + glove_small, shell = True)

    # and a single behemoth - trained on 840B tokens - 2GB compressed, 5GB unzipped
    # glove_big = 'glove.840B.300d.zip'
    # if not os.path.isfile(glove_big):
    #    call('wget ' + base_url + glove_big, shell = True)
    #    call('unzip ' + glove_big, shell = True)

        
download_glove()

In [4]:
DATA_PATH = "r8-no-stop.txt"
GLOVE_6B_50D_PATH = "glove.6B.50d.txt"
# GLOVE_840B_300D_PATH = "glove.840B.300d.txt"

In [5]:
# TODO : include spacy ???

X, y = [], []
with open(DATA_PATH) as infile:
    for line in infile:
        label, text = line.split("\t")
        # texts are already tokenized, just split on space
        # in a real case we would use e.g. spaCy for tokenization
        # and maybe remove stopwords etc.
        X.append(text.split())
        y.append(label)


X, y = np.asarray(X), np.asarray(y)
print('total examples {}'.format(len(y)))

total examples 7674


In [6]:
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 1234, stratify = y)

In [7]:
# workers = cpu_count()
# word2vec = Word2Vec(
#     X_train_text, size = 100, window = 5, min_count = 5, workers = workers)

# # If we’re finished training a model (i.e. no more updates, only querying)
# # we can store the wordvectors and delete the model to trim unneeded model memory
# word_vectors = word2vec.wv
# del word2vec


# w2v = {w: vec for w, vec in zip(word_vectors.index2word, word_vectors.syn0)}
# len(w2v)

The Word2Vec model consists of a feature vector for each word in the vocabulary, stored in a numpy array called `syn0`. The number of rows in syn0 is the number of words in the model's vocabulary, and the number of columns corresponds to the size of the feature vector (dimensionality of the feature vectors).

In [9]:
glove_small = {}
all_words = set(w for words in X for w in words)
with open(GLOVE_6B_50D_PATH) as f:
    for line in f:
        splitted = line.split()
        word = splitted[0]
        vector = [float(x) for x in splitted[1:]]
        if word in all_words:
            glove_small[word] = np.array(vector)
            
len(glove_small)

18935

In [10]:
# import dask_searchcv as dcv
# from main import Word2Vectorizer
# from scipy.stats import randint, uniform

# # specify the pipeline and the parameter that's going
# # be tuned in the pipeline
# word2vec = Word2Vectorizer(
#     size = 100, window = 5, min_count = 5)

# # for xgboost, set number of estimator to a large number
# # and the learning rate to be a small number, we'll simply 
# # let early stopping decide when to stop training;

# # word2vec approach was to reduce the dimensionality of the problem
# # so that those superior methods can be effectively used
# xgb = XGBClassifier(learning_rate = 0.05, n_estimators = 80, n_jobs = -1)
# w2v_xgb = Pipeline([
#     ('w2v', word2vec), 
#     ('xgb', xgb)
# ])

# # when setting parameters of the various steps in a Pipeline,
# # we can use the name (first element in the tuple) and 
# # the parameter name separated by '__'

# # subsample, colsample_bytree and max_depth are presumably the most
# # common way to control under/overfitting for tree-based models
# w2v_xgb_params = {'w2v__tfidf': [True, False],
#                   'xgb__max_depth': randint(low = 3, high = 12),
#                   'xgb__colsample_bytree': uniform(loc = 0.8, scale = 0.2),
#                   'xgb__subsample': uniform(loc = 0.8, scale = 0.2)}

# name, model, params = ('w2v_xgb', w2v_xgb, w2v_xgb_params)

# # a drop-in replacement for scikit-learn's RandomSearchCV that's
# # more efficient at doing Pipeline parameter tuning
# # http://dask-searchcv.readthedocs.io/en/latest/;

# # note that in scikit-learn 0.19 the Pipeline also has a parameter
# # that allows you to cache the intermediate process, which might
# # reduce the need of this extra package
# # http://scikit-learn.org/dev/modules/pipeline.html#caching-transformers-avoid-repeated-computation
# random_search = dcv.RandomizedSearchCV(model, params, cv = 3)
# random_search.fit(X_train_text, y_train)

In [11]:
# specify all possible pipelines
from main import Word2Vectorizer
from scipy.stats import randint, uniform

# TODO: ??? chi2 test for controlling max_features
tfidf = TfidfVectorizer(analyzer = lambda x: x,
                        stop_words = 'english')

# the word2vec approach was to reduce the dimensionality of
# the problem so that those superior methods can be effectively used
word2vec = Word2Vectorizer(size = 100, window = 5, min_count = 5)

# SVM is particularly well suited for problems with very 
# highly dimensional, sparse feature vectors, such as text data
svm = LinearSVC()

# logistic regression is also known for its interpretability and
# fast training time
logistic = LogisticRegression()

# for xgboost, set number of estimator to a large number
# and the learning rate to be a small number, we'll simply 
# let early stopping decide when to stop training;
xgb = XGBClassifier(learning_rate = 0.05, n_estimators = 80, n_jobs = -1)

tfidf_svm = Pipeline([
    ('tfidf', tfidf),
    ('svm', svm)
])

tfidf_logistic = Pipeline([
    ('tfidf', tfidf),
    ('logistic', logistic)
])

w2v_xgb = Pipeline([
    ('w2v', word2vec), 
    ('xgb', xgb)
])

w2v_svm = Pipeline([
    ('w2v', word2vec), 
    ('svm', svm)
])

w2v_logistic = Pipeline([
    ('w2v', word2vec), 
    ('logistic', logistic)
])

glove_pretrained = Word2Vectorizer(w2v = GLOVE_6B_50D_PATH)
glove_pretrained_xgb = Pipeline([
    ('w2v', glove_pretrained), 
    ('xgb', xgb)
])

In [12]:
# specify the parameter that's going
# be tuned in all the pipelines

# hyperparameters' dictionary:
# when setting parameters of the various steps in a Pipeline,
# we can use the name (first element in the tuple) and 
# the parameter name separated by '__'
tfidf_params = {'tfidf__sublinear_tf': [True, False],
                'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]}
logistic_params = {'logistic__C': uniform(loc = 1, scale = 0.5)}
svm_params = {'svm__loss': ['hinge', 'squared_hinge']}
w2v_params = {'w2v__tfidf': [True, False]}

# subsample, colsample_bytree and max_depth are presumably the most
# common way to control under/overfitting for tree-based models
xgb_params = {'xgb__max_depth': randint(low = 3, high = 12),
              'xgb__colsample_bytree': uniform(loc = 0.8, scale = 0.2),
              'xgb__subsample': uniform(loc = 0.8, scale = 0.2)}

tfidf_svm_params = {**tfidf_params, **svm_params}
tfidf_logistic_params = {**tfidf_params, **logistic_params}
w2v_svm_params = {**w2v_params, **svm_params}
w2v_xgb_params = {**w2v_params, **xgb_params}
w2v_logistic_params = {**w2v_params, **logistic_params}

In [13]:
import dask_searchcv as dcv

cv = 3
n_iter = 3
all_models = [
    ('tfidf_svm', tfidf_svm, tfidf_svm_params),
    ('tfidf_logistic', tfidf_logistic, tfidf_logistic_params),
    ('w2v_svm', w2v_svm, w2v_svm_params),
    ('w2v_xgb', w2v_xgb, w2v_xgb_params),
    ('w2v_logistic', w2v_logistic, w2v_logistic_params),
    ('glove_pretrained_xgb', glove_pretrained_xgb, w2v_xgb_params)
]

all_models_info = []
for name, model, params in all_models:
    # a drop-in replacement for scikit-learn's RandomSearchCV that's
    # more efficient at doing Pipeline parameter tuning
    # http://dask-searchcv.readthedocs.io/en/latest/

    # note that in scikit-learn 0.19 the Pipeline also has a parameter
    # that allows you to cache the intermediate process, which might
    # reduce the need of this extra package
    # http://scikit-learn.org/dev/modules/pipeline.html#caching-transformers-avoid-repeated-computation
    random_search = dcv.RandomizedSearchCV(model, params, cv = cv, n_iter = n_iter)
    random_search.fit(X_train_text, y_train)
    info = name, random_search.best_score_, random_search
    all_models_info.append(info)

results = pd.DataFrame(all_models_info, columns = ['model_name', 'score', 'estimator'])
results = (results
           .sort_values('score', ascending = False)
           .reset_index(drop = True))
results

Unnamed: 0,model_name,score,estimator
0,tfidf_svm,0.97296,"RandomizedSearchCV(cache_cv=True, cv=3, error_..."
1,tfidf_logistic,0.942336,"RandomizedSearchCV(cache_cv=True, cv=3, error_..."
2,w2v_svm,0.943476,"RandomizedSearchCV(cache_cv=True, cv=3, error_..."
3,w2v_xgb,0.93468,"RandomizedSearchCV(cache_cv=True, cv=3, error_..."
4,w2v_logistic,0.911386,"RandomizedSearchCV(cache_cv=True, cv=3, error_..."
5,glove_pretrained_xgb,0.924906,"RandomizedSearchCV(cache_cv=True, cv=3, error_..."


# Kaggle popcorn

In [32]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
data_path = os.path.join('data', 'labeledTrainData.tsv')
data = pd.read_csv(data_path, delimiter = '\t')
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [34]:
label_col = 'sentiment'
feature_col = 'review'

X_train, X_val = train_test_split(data, test_size = 0.2, 
                                  random_state = 1234,
                                  stratify = data[label_col])

X_train_text, X_val_text = X_train[feature_col], X_val[feature_col]
y_train, y_val = X_train[label_col].values, X_val[label_col].values

In [35]:
X_train_text

9823     Five years after the original Creepshow, anoth...
15954    OK - you want to test somebody on how comforta...
21970    Larry Burrows has the distinct feeling he's mi...
4005     I don't know much about film-making, but good ...
22769    I really have problems rating this movie. It i...
13377    This \film\" attempts to follow the genre of l...
6757     Belushi at his most ingratiating and Courtney ...
6772     I rented Zero Day from the local video store l...
18205    I saw 'New York: I Love You' today and loved i...
5105     This is full of major spoilers, so beware.<br ...
7019     Why do I watch movies like this ? - other than...
12886    Hal Hartley's Henry Fool was an independent fi...
9129     I'm an incorrigible skeptic and agnostic and w...
11594    i found it highly intellectual and artistic in...
3731     The best Laurel and Hardy shorts are filled to...
15297    The exploding zeppelins crashing down upon 'Sk...
18701    With the releasing of \Farligt förflutet\" Swe.

In [4]:
vect = TfidfVectorizer(
    stop_words = 'english', max_features = 40000, ngram_range = (1, 3), sublinear_tf = True)

X_train = vect.fit_transform(X_train_text)
X_val = vect.transform(X_val_text)
X_train

<20000x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 2173373 stored elements in Compressed Sparse Row format>

In [5]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

logistic_proba_train = logistic.predict_proba(X_train)[:, 1]
logistic_auc_train = roc_auc_score(y_train, logistic_proba_train)
logistic_proba_val = logistic.predict_proba(X_val)[:, 1]
logistic_auc_val = roc_auc_score(y_val, logistic_proba_val)

print('logistic AUC train:', logistic_auc_train)
print('logistic AUC val:', logistic_auc_val)

logistic AUC train: 0.98719432
logistic AUC val: 0.95929152


In [6]:
testdata_path = os.path.join('data', 'testData.tsv')
df_test = pd.read_csv(testdata_path, delimiter = '\t')
df_test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [7]:
X_test = vect.transform(df_test[feature_col])
logistic_proba_val = logistic.predict_proba(X_test)[:, 1]
df_test['sentiment'] = logistic_proba_val
df_test = df_test.drop(feature_col, axis = 1)
df_test.to_csv('submission.csv', index = False)