Skip to content

Commit

Permalink
win2
Browse files Browse the repository at this point in the history
  • Loading branch information
U-US\gtesei authored and U-US\gtesei committed Mar 18, 2016
1 parent 71790ea commit c7fa373
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@
output = pd.DataFrame(data={"id":test["id"],"sentiment":result})

# Use pandas to write the comma-separated output file
output.to_csv('Bag_of_Words_model.csv', index=False, quoting=3)
output.to_csv('Bag_of_Words_model.tsv', index=False, quoting=3)
print("Wrote results to Bag_of_Words_model.csv")


Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def makeFeatureVec(words, model, num_features):
featureVec = np.divide(featureVec,nwords)
return featureVec

def hash32(value):
    """Fold the builtin hash of *value* into an unsigned 32-bit integer.

    Used as the ``hashfxn`` argument to gensim's ``Word2Vec``: the model
    expects a hash that fits in 32 bits, while Python's native ``hash``
    may be 64-bit and negative, so the result is masked down to the
    range ``0 .. 0xFFFFFFFF``.

    NOTE(review): on Python 3 the builtin ``hash`` of ``str`` is salted
    per interpreter run unless ``PYTHONHASHSEED`` is fixed — presumably
    the reproducibility this function aims for relies on that being set;
    confirm in the run environment.
    """
    full_hash = hash(value)
    return full_hash & 0xFFFFFFFF

def getAvgFeatureVecs(reviews, model, num_features):
# Given a set of reviews (each one a list of words), calculate
Expand Down Expand Up @@ -137,7 +139,7 @@ def getCleanReviews(reviews):
print("Training Word2Vec model...")
model = Word2Vec(sentences, workers=num_workers,
size=num_features, min_count = min_word_count,
window = context, sample = downsampling, seed=1)
window = context, sample=downsampling, seed=1 , hashfxn=hash32)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def create_bag_of_centroids( wordlist, word_centroid_map ):
print('>>> Loading Windows env ...')
os.chdir('C:/Machine_Learning/git/fast-furious/doc_ref/NLP/word2vec-nlp-tutorial/')

model = Word2Vec.load("300features_40minwords_10context")
model = Word2Vec.load("300features_40minwords_10context.tsv")


# ****** Run k-means on the word vectors and print a few clusters
Expand Down Expand Up @@ -95,8 +95,8 @@ def create_bag_of_centroids( wordlist, word_centroid_map ):
# Find all of the words for that cluster number, and print them out
words = []
for i in range(0,len(word_centroid_map.values())):
if( word_centroid_map.values()[i] == cluster ):
words.append(word_centroid_map.keys()[i])
if word_centroid_map.values()==cluster:
words.append(word_centroid_map.keys())
print(words)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
numpy==1.9.2
scipy
scikit-learn
nltk
pandas==0.16.0
numpy==1.9.2
scipy
scikit-learn
nltk
pandas==0.16.0
beautifulsoup4

0 comments on commit c7fa373

Please sign in to comment.