In [4]:
import pandas as pd

# Load the three tab-separated review files with identical parser settings.
# quoting=3 is csv.QUOTE_NONE, i.e. quote characters get no special treatment.
train, test, unlabeled_train = (
    pd.read_csv(tsv_path, header=0, delimiter="\t", quoting=3)
    for tsv_path in (
        "./input/labeledTrainData.tsv",
        "./input/testData.tsv",
        "./input/unlabeledTrainData.tsv",
    )
)

# Sanity check: report how many reviews each file produced (100,000 in total)
review_counts = (
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
)
print(
    "Read %d labeled train reviews, %d labeled test reviews, "
    "and %d unlabeled reviews\n" % review_counts
)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [2]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist( review, remove_stopwords=False ):
    """Convert a raw review (or a single sentence of one) to a list of words.

    The text is stripped of HTML markup, every non-letter character is
    replaced by a space, the result is lowercased and split on whitespace.

    Args:
        review: Raw text, possibly containing HTML tags.
        remove_stopwords: When True, words found in NLTK's English stop-word
            list are dropped. Defaults to False.

    Returns:
        list of str: the remaining words, in their original order.
    """
    # 1. Remove HTML. The parser is named explicitly: when it is omitted,
    #    BeautifulSoup picks whichever parser happens to be installed and
    #    emits a warning, so the extracted text can vary between machines.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    #
    # 2. Replace everything that is not an ASCII letter with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Lowercase and split on whitespace
    words = letters_only.lower().split()
    #
    # 4. Optionally remove stop words (set membership keeps the filter fast)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return the list of words
    return words

In [3]:
# Download only the punkt sentence tokenizer. Calling nltk.download() with no
# arguments opens NLTK's interactive downloader, which stalls an unattended
# "Restart & Run All"; naming the package keeps this cell non-interactive.
import nltk.data
nltk.download('punkt')

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences and turn each sentence into a word list.

    Args:
        review: Raw review text; surrounding whitespace is stripped first.
        tokenizer: Object whose tokenize(text) method returns the sentences
            of the text as strings.
        remove_stopwords: Forwarded to review_to_wordlist. Defaults to False.

    Returns:
        list of list of str: one word list per non-empty sentence.
    """
    # Sentence boundaries come from the supplied tokenizer
    raw_sentences = tokenizer.tokenize(review.strip())

    # Empty sentences are skipped; every other sentence is cleaned and split
    # into words by review_to_wordlist
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [4]:
# Gather the tokenized sentences of every labeled and unlabeled training review
sentences = []

for banner, reviews in (
    ("Parsing sentences from training set", train["review"]),
    ("Parsing sentences from unlabeled set", unlabeled_train["review"]),
):
    print(banner)
    for review in reviews:
        sentences.extend(review_to_sentences(review, tokenizer))



In [5]:
# Report how many sentences were collected from the two review sets
total_sentences = len(sentences)
print(total_sentences)

795538


In [9]:
from os.path import exists
# Path the trained Word2Vec model is cached under. When it already exists the
# model is loaded from it; otherwise the model is trained and saved there.
model_name = "300features_40minwords_10context"

if exists(model_name):
    # A previous run already trained and saved the model: reuse it
    from gensim.models import Word2Vec
    model = Word2Vec.load(model_name)

else:
    # Route gensim's progress messages through the standard logging module
    import logging
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )

    # Training hyper-parameters
    num_features = 300    # dimensionality of each word vector
    min_word_count = 40   # words seen fewer times than this are ignored
    num_workers = 4       # number of worker threads
    context = 10          # context window size
    downsampling = 1e-3   # downsampling setting for frequent words

    # Train on the parsed sentences (this takes a while)
    from gensim.models import word2vec
    print("Training model...")
    model = word2vec.Word2Vec(
        sentences,
        workers=num_workers,
        size=num_features,
        min_count=min_word_count,
        window=context,
        sample=downsampling,
    )

    # No further training is planned, so let init_sims replace the vectors
    # in place, which makes the model more memory-efficient.
    model.init_sims(replace=True)

    # Persist under the descriptive name so later runs take the load branch
    model.save(model_name)

In [None]:
# Sanity-check the trained embeddings. Word2Vec models expose no predict()
# method, so the previous call could only raise AttributeError; inspecting the
# nearest neighbours of a sentiment-laden word is a quick check instead.
model.wv.most_similar("awful")

In [12]:
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

# Shared text-processing helpers for the preprocessing cells that follow.
# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))
# Tokenizer that keeps runs of lowercase ASCII letters only.
# NOTE: this rebinds `tokenizer`, which held the punkt sentence tokenizer
# earlier in the notebook.
tokenizer = RegexpTokenizer(r'[a-z]+')
# WordNet-based lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [14]:
def preprocess(document):
    """Clean a raw review and return it as one space-joined string of lemmas.

    Steps:
    - strip HTML markup;
    - convert the whole text to lowercase;
    - tokenize it with the notebook-level `tokenizer`;
    - remove tokens found in `stop_words`;
    - lemmatize every token successively as noun, verb, adjective and adverb.

    Args:
        document: Raw review text, possibly containing HTML tags.

    Returns:
        str: the resulting tokens joined with single spaces.
    """
    # Name the parser explicitly: when it is omitted, BeautifulSoup picks
    # whichever parser is installed and emits a warning, so the extracted
    # text can vary between machines.
    text = BeautifulSoup(document, "html.parser").get_text()

    text = text.lower()
    words = [w for w in tokenizer.tokenize(text) if w not in stop_words]

    # Each pass feeds its output to the next one, so a token is reduced under
    # every part of speech in turn.
    for pos in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV):
        words = [wordnet_lemmatizer.lemmatize(w, pos) for w in words]
    return " ".join(words)

# Store the cleaned text of every training review in a new column, then
# preview the first rows of the frame
processed_reviews = train['review'].apply(preprocess)
train['Processed Review'] = processed_reviews

train.head()

NameError: name 'train' is not defined

## Using state-of-the-art embeddings

In [15]:
import os
# Fetch the GloVe archive with wget only when glove.6B.zip is not already
# present in the working directory
if not os.path.exists("glove.6B.zip"):
    !wget nlp.stanford.edu/data/wordvecs/glove.6B.zip

# Unpack the archive only when glove.6B.300d.txt is not already present
if not os.path.exists("glove.6B.300d.txt"):
    !unzip glove.6B.zip

In [16]:
# Convert the GloVe text file to word2vec text format once; later runs find
# the converted file and skip this step.
if not os.path.exists("gensim_glove_vectors.txt"):
    from gensim.scripts.glove2word2vec import glove2word2vec

    glove2word2vec(
        "glove.6B.300d.txt",          # GloVe input file
        "gensim_glove_vectors.txt",   # word2vec-format output file
    )

In [17]:
%%time

from gensim.models.keyedvectors import KeyedVectors
# Load the converted GloVe vectors from the text-format (binary=False) file
en_w2v_model = KeyedVectors.load_word2vec_format(
    "gensim_glove_vectors.txt",
    binary=False,
)

CPU times: user 44.2 s, sys: 611 ms, total: 44.8 s
Wall time: 44.9 s


In [18]:
import numpy as np
import pandas as pd

def get_phrase_embedding(model, phrase):
    """
    Convert a phrase to a vector by averaging its word embeddings.

    The phrase is stripped of HTML, lowercased and tokenized with the
    notebook-level `tokenizer`. Tokens that are missing from the vocabulary
    are skipped.

    Args:
        model: Either a model that exposes its word vectors as `model.wv`, or
            the vector store itself (an object supporting `word in obj`,
            `obj[word]` and `obj.vector_size`).
        phrase: Raw text, possibly containing HTML tags.

    Returns:
        float32 numpy array of length `vector_size`: the mean of the vectors
        of all in-vocabulary tokens, or all zeros when none is in vocabulary.
    """
    # Full models keep their vectors under `.wv`, whereas a bare vector store
    # may not have that attribute at all; fall back to the object as given so
    # both kinds of argument work.
    vectors = getattr(model, "wv", model)

    vector = np.zeros([vectors.vector_size], dtype='float32')

    # Name the parser explicitly so the extracted text does not depend on
    # which HTML parsers happen to be installed.
    text = BeautifulSoup(phrase, "html.parser").get_text().lower()

    used_words = 0
    for word in tokenizer.tokenize(text):
        if word in vectors:
            vector += vectors[word]
            used_words += 1

    # Guard against division by zero when no token was in the vocabulary
    if used_words > 0:
        vector = vector / used_words

    return vector

In [19]:
# Fresh copies of the three review files for the embedding-based pipeline,
# read with the same parser settings for every file
w2v_train, w2v_test, w2v_unlabeled_train = (
    pd.read_csv(tsv_path, header=0, delimiter="\t", quoting=3)
    for tsv_path in (
        "./input/labeledTrainData.tsv",
        "./input/testData.tsv",
        "./input/unlabeledTrainData.tsv",
    )
)

In [20]:
def preprocess(phrase):
    """Return the averaged embedding of a raw review using `en_w2v_model`.

    NOTE: this definition rebinds `preprocess`, replacing the text-cleaning
    function of the same name defined earlier in the notebook.
    """
    embedding = get_phrase_embedding(en_w2v_model, phrase)
    return embedding

# One embedding vector per labeled training review
w2v_train['vector'] = w2v_train['review'].apply(preprocess)



In [21]:
# Import the submodule explicitly: a bare `import sklearn` does not load
# sklearn.model_selection, so the lookup below would only succeed if some
# other cell had already imported it.
import sklearn.model_selection

# Stack the per-review vectors into one 2-D array (one row per review)
# instead of converting every vector to a Python list first.
train_data = np.vstack(w2v_train["vector"].to_numpy())
train_label = w2v_train["sentiment"].to_numpy()

# Hold out 20% of the labeled reviews for evaluation; the seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    train_data,
    train_label,
    test_size=0.2,
    random_state=241
)

In [23]:
import sklearn.metrics
import sklearn.model_selection

def get_classifier_score(classifier):
    """Return the accuracy of a fitted classifier on the held-out split.

    Predictions are made for the notebook-level `X_test` and compared with
    `y_test`.
    """
    return sklearn.metrics.accuracy_score(y_test, classifier.predict(X_test))

# Shuffled 5-fold splitter shared by every cross-validation run; the seed
# keeps the folds identical between runs.
split_generator = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
def cross_validation_mean_score(clf, scoring='accuracy'):
    """Return the mean cross-validated score of `clf` on the training data.

    The estimator is evaluated on the notebook-level `train_data` /
    `train_label` with the folds produced by `split_generator`.

    Args:
        clf: Unfitted scikit-learn estimator.
        scoring: Scorer name passed to cross_val_score. Defaults to
            'accuracy', the metric used by get_classifier_score; the task is
            classification, so a regression error such as
            'neg_mean_squared_error' is not an appropriate default.

    Returns:
        float: mean of the per-fold scores.
    """
    score_arr = sklearn.model_selection.cross_val_score(
        estimator=clf,
        X=train_data,
        y=train_label,
        cv=split_generator,
        scoring=scoring
    )
    return score_arr.mean()

In [64]:
# from sklearn import svm

# clf = svm.SVC(C=100000, kernel='linear', random_state=241)

# clf.fit(X=X_train, y=y_train)

In [25]:
import xgboost
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

In [31]:
# Baseline gradient-boosting classifier on the averaged word-vector features.
# random_state is fixed so repeated runs fit the same model.
# NOTE: this rebinds `model`, which held the Word2Vec model earlier in the
# notebook.
model = GradientBoostingClassifier(max_depth=5, random_state=241)
model.fit(X_train, y_train)

GradientBoostingClassifier(max_depth=5)

In [39]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values explored by the grid search
param_grid = {
    'n_estimators': [10, 50],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 4]
}

# Seed the base estimator so every candidate is fitted reproducibly
base_estimator = GradientBoostingClassifier(random_state=241)
model = GridSearchCV(base_estimator, param_grid)

model.fit(X_train, y_train)

# Evaluate the refitted best estimator on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 80.02%


In [40]:
# Hyper-parameter combination the grid search selected as best
model.best_params_

{'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 50}