-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
518 additions
and
0 deletions.
There are no files selected for viewing
113 changes: 113 additions & 0 deletions
113
doc_ref/NLP/word2vec-nlp-tutorial/DeepLearningMovies/BagOfWords.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
#!/usr/bin/env python | ||
|
||
# Author: Angela Chapman | ||
# Date: 8/6/2014 | ||
# | ||
# This file contains code to accompany the Kaggle tutorial | ||
# "Deep learning goes to the movies". The code in this file | ||
# is for Part 1 of the tutorial on Natural Language Processing. | ||
# | ||
# *************************************** # | ||
|
||
import os | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.ensemble import RandomForestClassifier | ||
from KaggleWord2VecUtility import KaggleWord2VecUtility | ||
import pandas as pd | ||
import numpy as np | ||
import nltk | ||
|
||
if __name__ == '__main__':
    # The platform check used to have two branches that did exactly the same
    # thing, so the working directory is simply switched unconditionally.
    print('>>> Loading Mac OSX env ...')
    os.chdir('/Users/gino/kaggle/fast-furious/gitHub/fast-furious/doc_ref/NLP/word2vec-nlp-tutorial/')

    # Load the labeled training reviews and the test reviews. Both files are
    # tab-separated with a header row; quoting=3 is csv.QUOTE_NONE.
    train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
    test = pd.read_csv('testData.tsv', header=0, delimiter="\t", quoting=3)

    print('The first review is:')
    print(train["review"][0])

    # NOTE: nltk.download() is deliberately not called here; the stop-word
    # corpus is expected to be installed already.
    print('Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...')

    # ****** Clean the training reviews
    #
    # Each review becomes one space-joined string of its stop-word-free words.
    print("Cleaning and parsing the training set movie reviews...\n")
    clean_train_reviews = [
        " ".join(KaggleWord2VecUtility.review_to_wordlist(raw_review, True))
        for raw_review in train["review"]
    ]

    # ****** Create a bag of words from the training set
    #
    print("Creating the bag of words...\n")

    # CountVectorizer is scikit-learn's bag-of-words tool; the vocabulary is
    # capped at 5000 features.
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=5000,
    )

    # fit_transform() learns the vocabulary and turns the training strings
    # into feature vectors; the result is then converted to a dense array.
    train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

    # ******* Train a random forest using the bag of words
    #
    print("Training the random forest (this may take a while)...")

    # 100 trees, fitted with the bag of words as features and the sentiment
    # labels as the response variable.
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_features, train["sentiment"])

    # ****** Clean the test reviews the same way as the training reviews
    #
    print("Cleaning and parsing the test set movie reviews...\n")
    clean_test_reviews = [
        " ".join(KaggleWord2VecUtility.review_to_wordlist(raw_review, True))
        for raw_review in test["review"]
    ]

    # Reuse the vocabulary learned on the training set for the test set.
    test_data_features = vectorizer.transform(clean_test_reviews).toarray()

    # Use the random forest to make sentiment label predictions
    print("Predicting test labels...\n")
    result = forest.predict(test_data_features)

    # Pair every test id with its predicted sentiment and write the
    # comma-separated output file.
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv('Bag_of_Words_model.csv', index=False, quoting=3)
    print("Wrote results to Bag_of_Words_model.csv")
|
||
|
58 changes: 58 additions & 0 deletions
58
doc_ref/NLP/word2vec-nlp-tutorial/DeepLearningMovies/KaggleWord2VecUtility.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/usr/bin/env python | ||
|
||
import re | ||
import nltk | ||
|
||
import pandas as pd | ||
import numpy as np | ||
|
||
from bs4 import BeautifulSoup | ||
from nltk.corpus import stopwords | ||
|
||
|
||
class KaggleWord2VecUtility(object):
    """Utility class for processing raw HTML review text into word lists for further learning."""

    # Cache of the English stop-word set, filled on first use so the set is
    # not rebuilt for every review.
    _english_stops = None

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        """Convert a document to a list of lower-case words.

        Args:
            review: Raw review text, possibly containing HTML markup.
            remove_stopwords: When true, words found in NLTK's English
                stop-word list are dropped (false by default).

        Returns:
            The list of remaining words, in document order.
        """
        # 1. Remove HTML. The parser is named explicitly so the extracted
        #    text does not depend on which optional parsers are installed.
        review_text = BeautifulSoup(review, "html.parser").get_text()
        #
        # 2. Replace every character that is not an ASCII letter by a space
        review_text = re.sub(r"[^a-zA-Z]", " ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words
        if remove_stopwords:
            stops = KaggleWord2VecUtility._english_stops
            if stops is None:
                stops = frozenset(stopwords.words("english"))
                KaggleWord2VecUtility._english_stops = stops
            words = [w for w in words if w not in stops]
        #
        # 5. Return a list of words
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        """Split a review into parsed sentences.

        Args:
            review: Raw review text.
            tokenizer: Object whose ``tokenize(text)`` method returns the
                sentences of ``text`` as strings.
            remove_stopwords: Forwarded to ``review_to_wordlist``.

        Returns:
            A list of sentences, where each sentence is a list of words
            (so the result is a list of lists).
        """
        # 1. Use the tokenizer to split the stripped paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.strip())
        #
        # 2. Convert every non-empty sentence to its word list
        return [
            KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords)
            for raw_sentence in raw_sentences
            if len(raw_sentence) > 0
        ]
185 changes: 185 additions & 0 deletions
185
doc_ref/NLP/word2vec-nlp-tutorial/DeepLearningMovies/Word2Vec_AverageVectors.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
#!/usr/bin/env python | ||
|
||
# Author: Angela Chapman | ||
# Date: 8/6/2014 | ||
# | ||
# This file contains code to accompany the Kaggle tutorial | ||
# "Deep learning goes to the movies". The code in this file | ||
# is for Parts 2 and 3 of the tutorial, which cover how to | ||
# train a model using Word2Vec. | ||
# | ||
# *************************************** # | ||
|
||
|
||
# ****** Read the two training sets and the test set | ||
# | ||
import pandas as pd | ||
import os | ||
from nltk.corpus import stopwords | ||
import nltk.data | ||
import logging | ||
import numpy as np # Make sure that numpy is imported | ||
from gensim.models import Word2Vec | ||
from sklearn.ensemble import RandomForestClassifier | ||
|
||
from KaggleWord2VecUtility import KaggleWord2VecUtility | ||
|
||
|
||
# ****** Define functions to create average word vectors | ||
# | ||
|
||
def makeFeatureVec(words, model, num_features):
    """Average the vectors of all in-vocabulary words of a paragraph.

    Args:
        words: Iterable of words making up the paragraph.
        model: Object exposing ``index2word`` (the words of its vocabulary)
            and ``model[word]`` lookup returning that word's vector.
        num_features: Length of each word vector.

    Returns:
        A 1-D numpy array of length ``num_features`` holding the mean of the
        vectors of the words found in the vocabulary. When none of the words
        is in the vocabulary the all-zero vector is returned instead of
        dividing by zero (which would yield NaN values).
    """
    # Running sum of the word vectors
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    #
    # index2word lists the words in the model's vocabulary; a set makes the
    # membership test below fast
    index2word_set = set(model.index2word)
    #
    # Add the vector of every word that is in the model's vocabulary
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model[word])
    #
    # Nothing to average: keep the zero vector rather than producing NaN
    if nwords == 0:
        return featureVec
    #
    # Divide the sum by the number of words to get the average
    return np.divide(featureVec, float(nwords))
|
||
|
||
def getAvgFeatureVecs(reviews, model, num_features):
    """Compute the average feature vector of every review.

    Args:
        reviews: Sequence of reviews, each one a list of words.
        model: Word-vector model forwarded to ``makeFeatureVec``.
        num_features: Length of each word vector.

    Returns:
        A 2-D float32 numpy array with one row per review and
        ``num_features`` columns.
    """
    total = len(reviews)
    #
    # Preallocate the 2D result array
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")
    #
    # enumerate() supplies an integer row index; a float counter cannot be
    # used to index a numpy array.
    for index, review in enumerate(reviews):
        #
        # Print a status message every 1000th review
        if index % 1000 == 0:
            print("Review %d of %d" % (index, total))
        #
        # Store this review's average feature vector in its row
        reviewFeatureVecs[index] = makeFeatureVec(review, model, num_features)
    return reviewFeatureVecs
|
||
|
||
def getCleanReviews(reviews):
    """Return the stop-word-free word list of every entry in ``reviews["review"]``.

    Args:
        reviews: Mapping-like object whose ``"review"`` entry is an iterable
            of raw review texts.

    Returns:
        A list with one word list per review, in the original order.
    """
    return [
        KaggleWord2VecUtility.review_to_wordlist(raw_text, remove_stopwords=True)
        for raw_text in reviews["review"]
    ]
|
||
|
||
|
||
if __name__ == '__main__':

    # The platform check used to have two branches that did exactly the same
    # thing, so the working directory is simply switched unconditionally.
    print('>>> Loading Mac OSX env ...')
    os.chdir('/Users/gino/kaggle/fast-furious/gitHub/fast-furious/doc_ref/NLP/word2vec-nlp-tutorial/')

    # Read data from files (tab-separated, header row, quoting=3 is
    # csv.QUOTE_NONE)
    train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
    test = pd.read_csv('testData.tsv', header=0, delimiter="\t", quoting=3)
    unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

    # Report the number of reviews that were read
    print("Read %d labeled train reviews, %d labeled test reviews, "
          "and %d unlabeled reviews\n" % (train["review"].size,
                                          test["review"].size,
                                          unlabeled_train["review"].size))

    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Each sentence is a list of words

    print("Parsing sentences from training set")
    for review in train["review"]:
        sentences.extend(KaggleWord2VecUtility.review_to_sentences(review, tokenizer))

    print("Parsing sentences from unlabeled set")
    for review in unlabeled_train["review"]:
        sentences.extend(KaggleWord2VecUtility.review_to_sentences(review, tokenizer))

    # ****** Set parameters and train the word2vec model
    #
    # Configure the built-in logging module so that Word2Vec's progress
    # messages are shown
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    print("Training Word2Vec model...")
    model = Word2Vec(sentences, workers=num_workers,
                     size=num_features, min_count=min_word_count,
                     window=context, sample=downsampling, seed=1)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # Save the model under a descriptive name for later use; it can be
    # loaded again with Word2Vec.load()
    model_name = "300features_40minwords_10context.tsv"
    model.save(model_name)

    # Sanity-check queries against the trained model. Their return values
    # are discarded here; run them interactively to inspect the results.
    model.doesnt_match("man woman child kitchen".split())
    model.doesnt_match("france england germany berlin".split())
    model.doesnt_match("paris berlin london austria".split())
    model.most_similar("man")
    model.most_similar("queen")
    model.most_similar("awful")

    # ****** Create average vectors for the training and test sets
    #
    print("Creating average feature vecs for training reviews")
    trainDataVecs = getAvgFeatureVecs(getCleanReviews(train), model, num_features)

    print("Creating average feature vecs for test reviews")
    testDataVecs = getAvgFeatureVecs(getCleanReviews(test), model, num_features)

    # ****** Fit a random forest to the training set, then make predictions
    #
    # Fit a random forest to the training data, using 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    print("Fitting a random forest to labeled training data...")
    forest = forest.fit(trainDataVecs, train["sentiment"])

    # Test & extract results
    result = forest.predict(testDataVecs)

    # Write the test results. The path is kept in one variable so that the
    # confirmation message always names the file that was actually written.
    output_path = "Word2Vec_AverageVectors.tsv"
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv(output_path, index=False, quoting=3)
    print("Wrote %s" % output_path)
Oops, something went wrong.