-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
U-US\gtesei
authored and
U-US\gtesei
committed
Mar 18, 2016
1 parent
d4ff881
commit 71790ea
Showing
4 changed files
with
511 additions
and
511 deletions.
There are no files selected for viewing
226 changes: 113 additions & 113 deletions
226
doc_ref/NLP/word2vec-nlp-tutorial/DeepLearningMovies/BagOfWords.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,113 +1,113 @@ | ||
#!/usr/bin/env python | ||
|
||
# Author: Angela Chapman | ||
# Date: 8/6/2014 | ||
# | ||
# This file contains code to accompany the Kaggle tutorial | ||
# "Deep learning goes to the movies". The code in this file | ||
# is for Part 1 of the tutorial on Natural Language Processing. | ||
# | ||
# *************************************** # | ||
|
||
import os | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.ensemble import RandomForestClassifier | ||
from KaggleWord2VecUtility import KaggleWord2VecUtility | ||
import pandas as pd | ||
import numpy as np | ||
import nltk | ||
|
||
if __name__ == '__main__':
    # Bag-of-words sentiment pipeline: load the labelled and unlabelled
    # reviews, clean them, build a 5000-term count matrix, train a random
    # forest on it and write the test-set predictions to a CSV file.

    # Move into the tutorial data directory so the relative file names used
    # below resolve. Both locations are specific to the author's machines.
    if os.name == 'posix':
        print('>>> Loading Mac OSX env ...')
        os.chdir('/Users/gino/kaggle/fast-furious/gitHub/fast-furious/doc_ref/NLP/word2vec-nlp-tutorial/')
    else:
        # Fix: this branch used to repeat the Mac OSX message and path, so a
        # non-POSIX host announced the wrong environment and tried to enter
        # a directory laid out for a different operating system.
        print('>>> Loading Windows env ...')
        os.chdir('C:/Machine_Learning/git/fast-furious/doc_ref/NLP/word2vec-nlp-tutorial/')

    # quoting=3 is csv.QUOTE_NONE: double quotes inside a review are kept as
    # ordinary characters instead of being treated as field delimiters.
    train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
    test = pd.read_csv('testData.tsv', header=0, delimiter="\t", quoting=3)

    print('The first review is:')
    print(train["review"][0])

    print('Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...')
    # NOTE: nltk.download() is not called here; the NLTK stop-word corpus
    # must already be installed for the cleaning step below to work.

    # Clean every training review and re-join its words into one string,
    # which is the input format CountVectorizer expects.
    print("Cleaning and parsing the training set movie reviews...\n")
    clean_train_reviews = [
        " ".join(KaggleWord2VecUtility.review_to_wordlist(review, True))
        for review in train["review"]
    ]

    # ****** Create a bag of words from the training set
    print("Creating the bag of words...\n")

    # Stop words were already removed during cleaning, so the vectorizer
    # only has to count the 5000 most frequent remaining terms.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    # fit_transform() learns the vocabulary and returns the count matrix in
    # one step; toarray() turns the sparse result into a dense numpy array.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    train_data_features = train_data_features.toarray()

    # ******* Train a random forest using the bag of words
    print("Training the random forest (this may take a while)...")

    # 100 trees, with the sentiment labels as the response variable.
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_features, train["sentiment"])

    # The test reviews go through exactly the same cleaning as the training
    # reviews so both sets share one vocabulary.
    print("Cleaning and parsing the test set movie reviews...\n")
    clean_test_reviews = [
        " ".join(KaggleWord2VecUtility.review_to_wordlist(review, True))
        for review in test["review"]
    ]

    # transform() (not fit_transform) reuses the vocabulary learned above.
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make sentiment label predictions
    print("Predicting test labels...\n")
    result = forest.predict(test_data_features)

    # One row per test review: its "id" and the predicted "sentiment".
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

    # Write the comma-separated submission file without adding quotes.
    output.to_csv('Bag_of_Words_model.csv', index=False, quoting=3)
    print("Wrote results to Bag_of_Words_model.csv")
|
||
|
||
#!/usr/bin/env python | ||
|
||
# Author: Angela Chapman | ||
# Date: 8/6/2014 | ||
# | ||
# This file contains code to accompany the Kaggle tutorial | ||
# "Deep learning goes to the movies". The code in this file | ||
# is for Part 1 of the tutorial on Natural Language Processing. | ||
# | ||
# *************************************** # | ||
|
||
import os | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.ensemble import RandomForestClassifier | ||
from KaggleWord2VecUtility import KaggleWord2VecUtility | ||
import pandas as pd | ||
import numpy as np | ||
import nltk | ||
|
||
if __name__ == '__main__':
    # End-to-end bag-of-words run: read the review files, normalise the
    # text, count words, fit a random forest and save the predictions.

    # Pick the machine-specific data directory, announce which environment
    # was detected, then switch into it so relative file names resolve.
    if os.name != 'posix':
        print('>>> Loading Windows env ...')
        data_dir = 'C:/Machine_Learning/git/fast-furious/doc_ref/NLP/word2vec-nlp-tutorial/'
    else:
        print('>>> Loading Mac OSX env ...')
        data_dir = '/Users/gino/kaggle/fast-furious/gitHub/fast-furious/doc_ref/NLP/word2vec-nlp-tutorial/'
    os.chdir(data_dir)

    # Both files are tab-separated with a header row; quoting=3 tells the
    # parser to leave quote characters inside the reviews untouched.
    read_opts = dict(header=0, delimiter="\t", quoting=3)
    train = pd.read_csv('labeledTrainData.tsv', **read_opts)
    test = pd.read_csv('testData.tsv', **read_opts)

    print('The first review is:')
    print(train["review"][0])

    print('Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...')

    # --- Training text -> list of cleaned, space-joined reviews ---------
    print("Cleaning and parsing the training set movie reviews...\n")
    clean_train_reviews = [
        " ".join(KaggleWord2VecUtility.review_to_wordlist(raw_review, True))
        for raw_review in train["review"]
    ]

    # --- Bag of words ----------------------------------------------------
    print("Creating the bag of words...\n")

    # scikit-learn's word counter, limited to the 5000 most common terms.
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=5000,
    )

    # Learn the vocabulary and build the dense training matrix in one go.
    train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

    # --- Model -----------------------------------------------------------
    print("Training the random forest (this may take a while)...")

    # 100-tree forest fitted on word counts against the sentiment labels;
    # fit() returns the fitted estimator itself.
    forest = RandomForestClassifier(n_estimators=100).fit(
        train_data_features, train["sentiment"]
    )

    # --- Test text, cleaned the same way as the training text -----------
    print("Cleaning and parsing the test set movie reviews...\n")
    clean_test_reviews = [
        " ".join(KaggleWord2VecUtility.review_to_wordlist(raw_review, True))
        for raw_review in test["review"]
    ]

    # Re-use the fitted vocabulary; do not refit on the test set.
    test_data_features = vectorizer.transform(clean_test_reviews).toarray()

    # --- Predict and save --------------------------------------------------
    print("Predicting test labels...\n")
    result = forest.predict(test_data_features)

    # Pair each test "id" with its predicted "sentiment".
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

    output.to_csv('Bag_of_Words_model.csv', index=False, quoting=3)
    print("Wrote results to Bag_of_Words_model.csv")
|
||
116 changes: 58 additions & 58 deletions
116
doc_ref/NLP/word2vec-nlp-tutorial/DeepLearningMovies/KaggleWord2VecUtility.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,58 +1,58 @@ | ||
#!/usr/bin/env python | ||
|
||
import re | ||
import nltk | ||
|
||
import pandas as pd | ||
import numpy as np | ||
|
||
from bs4 import BeautifulSoup | ||
from nltk.corpus import stopwords | ||
|
||
|
||
class KaggleWord2VecUtility(object):
    """Helpers that turn raw HTML review text into word lists and sentences."""

    # Every character that is not an ASCII letter is replaced by a space.
    _NON_LETTERS = re.compile(r"[^a-zA-Z]")

    # English stop words from the NLTK corpus, loaded on first use and then
    # shared by all calls; reloading the list for every review is wasteful.
    _english_stops = None

    @staticmethod
    def _stop_words():
        """Return the cached set of NLTK English stop words."""
        if KaggleWord2VecUtility._english_stops is None:
            KaggleWord2VecUtility._english_stops = frozenset(stopwords.words("english"))
        return KaggleWord2VecUtility._english_stops

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        """Convert one review into a list of lower-case words.

        Args:
            review: Raw review text, possibly containing HTML markup.
            remove_stopwords: When true, drop NLTK English stop words.

        Returns:
            The list of words, in their original order.
        """
        # 1. Remove HTML. The parser is named explicitly so the result does
        #    not depend on which optional parsers happen to be installed and
        #    so BeautifulSoup does not warn about a missing parser argument.
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Remove non-letters.
        review_text = KaggleWord2VecUtility._NON_LETTERS.sub(" ", review_text)
        # 3. Convert to lower case and split on whitespace.
        words = review_text.lower().split()
        # 4. Optionally remove stop words (off by default).
        if remove_stopwords:
            stops = KaggleWord2VecUtility._stop_words()
            words = [w for w in words if w not in stops]
        # 5. Return the list of words.
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        """Split a review into sentences, each given as a list of words.

        Args:
            review: Raw review text.
            tokenizer: Object whose ``tokenize(text)`` method returns the
                sentences of ``text`` (for example an NLTK sentence
                tokenizer).
            remove_stopwords: Forwarded to ``review_to_wordlist``.

        Returns:
            A list of word lists, one per non-empty sentence.
        """
        # 1. Let the tokenizer split the stripped text into sentences.
        raw_sentences = tokenizer.tokenize(review.strip())
        # 2. Convert every non-empty sentence into its word list.
        return [
            KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords)
            for raw_sentence in raw_sentences
            if raw_sentence
        ]
#!/usr/bin/env python | ||
|
||
import re | ||
import nltk | ||
|
||
import pandas as pd | ||
import numpy as np | ||
|
||
from bs4 import BeautifulSoup | ||
from nltk.corpus import stopwords | ||
|
||
|
||
class KaggleWord2VecUtility(object):
    """Helpers that turn raw HTML review text into word lists and sentences."""

    # Every character that is not an ASCII letter is replaced by a space.
    _NON_LETTERS = re.compile(r"[^a-zA-Z]")

    # English stop words from the NLTK corpus, loaded on first use and then
    # shared by all calls; reloading the list for every review is wasteful.
    _english_stops = None

    @staticmethod
    def _stop_words():
        """Return the cached set of NLTK English stop words."""
        if KaggleWord2VecUtility._english_stops is None:
            KaggleWord2VecUtility._english_stops = frozenset(stopwords.words("english"))
        return KaggleWord2VecUtility._english_stops

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        """Convert one review into a list of lower-case words.

        Args:
            review: Raw review text, possibly containing HTML markup.
            remove_stopwords: When true, drop NLTK English stop words.

        Returns:
            The list of words, in their original order.
        """
        # 1. Remove HTML. The parser is named explicitly so the result does
        #    not depend on which optional parsers happen to be installed and
        #    so BeautifulSoup does not warn about a missing parser argument.
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Remove non-letters.
        review_text = KaggleWord2VecUtility._NON_LETTERS.sub(" ", review_text)
        # 3. Convert to lower case and split on whitespace.
        words = review_text.lower().split()
        # 4. Optionally remove stop words (off by default).
        if remove_stopwords:
            stops = KaggleWord2VecUtility._stop_words()
            words = [w for w in words if w not in stops]
        # 5. Return the list of words.
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        """Split a review into sentences, each given as a list of words.

        Args:
            review: Raw review text.
            tokenizer: Object whose ``tokenize(text)`` method returns the
                sentences of ``text`` (for example an NLTK sentence
                tokenizer).
            remove_stopwords: Forwarded to ``review_to_wordlist``.

        Returns:
            A list of word lists, one per non-empty sentence.
        """
        # 1. Let the tokenizer split the stripped text into sentences.
        raw_sentences = tokenizer.tokenize(review.strip())
        # 2. Convert every non-empty sentence into its word list.
        return [
            KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords)
            for raw_sentence in raw_sentences
            if raw_sentence
        ]
Oops, something went wrong.