
Commit 029b836
Merge branch 'master' of https://github.com/gchrupala/Passage
gchrupala committed Mar 22, 2015
2 parents 32927b7 + 07a5ff6
Showing 9 changed files with 192 additions and 15 deletions.
4 changes: 4 additions & 0 deletions AUTHORS.txt
@@ -0,0 +1,4 @@
Alec Radford <alec@indico.io>
Madison May <madison@indico.io>
Slater Victoroff <slater@indico.io>
Grzegorz Chrupala <pitekus@gmail.com>
4 changes: 4 additions & 0 deletions CHANGES.txt
@@ -0,0 +1,4 @@
v0.2.1, Mon Feb 16 -- Added pip package, updated README
v0.2.2, Tue Feb 24 -- Updated readme, added readme to pip page
v0.2.3, Tue Feb 24 -- Added setup.cfg to properly handle markdown readme on pypi page
v0.2.4, Tue Feb 24 -- Changed to legitimate .rst README, removed setup.cfg
24 changes: 21 additions & 3 deletions README.md
@@ -1,13 +1,22 @@
**Passage**
===================
A little library for text analysis with RNNs.

Warning: very alpha, work in progress.

## Install

via GitHub (version under active development)
```
git clone http://github.com/IndicoDataSolutions/passage.git
python setup.py develop
```
or via pip
```
sudo pip install passage
```
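
A quick sanity check that the package installed correctly (this just confirms the package imports):
```
python -c "import passage"
```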

## Example
This example uses Passage for binary classification of text. It:

* Tokenizes some training text, converting it to a format Passage can use.
@@ -44,4 +53,13 @@ Where:

* train_text is a list of strings ['hello world', 'foo bar']
* train_labels is a list of labels [0, 1]
* test_text is another list of strings

## Datasets

Without sizeable datasets, RNNs have difficulty achieving results better than traditional sparse linear models. Below are a few datasets that are appropriately sized and useful for experimentation. Hopefully this list will grow over time; please feel free to propose new datasets for inclusion through either an issue or a pull request.

**Note**: *None of these datasets were created by indico, nor should their inclusion here indicate any kind of endorsement.*

Blogger Dataset: http://www.cs.biu.ac.il/~koppel/blogs/blogs.zip (Age and gender data)
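
For calibration, a sparse linear baseline is a useful point of comparison. The sketch below is not part of Passage; it assumes the `blogger_data_2.csv` layout (with `text` and `gender` columns) used in `examples/gender.py`:

```
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

data = pd.read_csv('blogger_data_2.csv', nrows=20000)
X = [str(x) for x in data['text'].values]  # same nan cleaning as examples/gender.py
Y = data['gender'].values
trX, teX, trY, teY = X[:-10000], X[-10000:], Y[:-10000], Y[-10000:]

# mirror the Tokenizer settings from the examples
vectorizer = TfidfVectorizer(min_df=10, max_features=50000)
clf = LogisticRegression().fit(vectorizer.fit_transform(trX), trY)
preds = clf.predict(vectorizer.transform(teX))
print metrics.accuracy_score(teY, preds)
```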

79 changes: 79 additions & 0 deletions README.rst
@@ -0,0 +1,79 @@
Passage
=======

A little library for text analysis with RNNs.

Warning: very alpha, work in progress.

Install
-------

via GitHub (version under active development)

::

    git clone http://github.com/IndicoDataSolutions/passage.git
    python setup.py develop

or via pip

::

    sudo pip install passage

Example
-------

This example uses Passage for binary classification of text. It:

- Tokenizes some training text, converting it to a format Passage can use.
- Defines the model's structure as a list of layers.
- Creates the model with that structure and a cost to be optimized.
- Trains the model for one iteration over the training text.
- Uses the model and tokenizer to predict on new text.
- Saves and loads the model.

::

    from passage.preprocessing import Tokenizer
    from passage.layers import Embedding, GatedRecurrent, Dense
    from passage.models import RNN
    from passage.utils import save, load

    tokenizer = Tokenizer()
    train_tokens = tokenizer.fit_transform(train_text)

    layers = [
        Embedding(size=128, n_features=tokenizer.n_features),
        GatedRecurrent(size=128),
        Dense(size=1, activation='sigmoid')
    ]

    model = RNN(layers=layers, cost='BinaryCrossEntropy')
    model.fit(train_tokens, train_labels)

    model.predict(tokenizer.transform(test_text))
    save(model, 'save_test.pkl')
    model = load('save_test.pkl')

Where:

- train_text is a list of strings ['hello world', 'foo bar']
- train_labels is a list of labels [0, 1]
- test_text is another list of strings
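
For instance, defining toy inputs like these (an illustrative two-example dataset; real use needs far more data) makes the block above runnable end to end:

::

    train_text = ['hello world', 'foo bar']
    train_labels = [0, 1]
    test_text = ['hello bar']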

Datasets
--------

Without sizeable datasets, RNNs have difficulty achieving results better
than traditional sparse linear models. Below are a few datasets that are
appropriately sized and useful for experimentation. Hopefully this list
will grow over time; please feel free to propose new datasets for
inclusion through either an issue or a pull request.

**Note**: None of these datasets were created by indico, nor should
their inclusion here indicate any kind of endorsement.

Blogger Dataset: http://www.cs.biu.ac.il/~koppel/blogs/blogs.zip (Age
and gender data)
13 changes: 13 additions & 0 deletions examples/README.md
@@ -0,0 +1,13 @@
**Passage Examples**
===================
[Slide Deck](https://docs.google.com/presentation/d/1HYfUZLRZRJovQpv5mYxox9bz9erxj7Ak_ZovENMvM90/edit?usp=sharing) & [Video](https://www.youtube.com/watch?v=VINCQghQRuM)

<a href="https://www.youtube.com/watch?v=VINCQghQRuM"><img src="http://i.imgur.com/bJC0pjy.png" height="300"></a>

[Passage Gender Classification](https://github.com/IndicoDataSolutions/Passage/blob/master/examples/gender.py) With [Blogger Dataset](http://goo.gl/EbWA1u)

<a href="https://github.com/IndicoDataSolutions/Passage/blob/master/examples/gender.py"><img src="http://i.imgur.com/cEmonmC.jpg" height="300"></a>

[Passage Newsgroup Classification Example](https://github.com/IndicoDataSolutions/Passage/blob/master/examples/newsgroup.py)

<a href="https://github.com/IndicoDataSolutions/Passage/blob/master/examples/newsgroup.py"><img src="http://i.imgur.com/ByTczHW.jpg" height="300"></a>
15 changes: 7 additions & 8 deletions example.py → examples/gender.py
@@ -10,34 +10,33 @@

def load_gender_data(ntrain=10000, ntest=10000):
    file_loc = os.path.dirname(os.path.realpath(__file__))
    relative_path = "blogger_data_2.csv"  # move dataset to examples directory
    fullpath = os.path.join(file_loc, relative_path)
    data = pd.read_csv(fullpath, nrows=ntrain+ntest)
    X = data['text'].values
    X = [str(x) for x in X]  # ugly nan cleaner
    Y = data['gender'].values
    trX = X[:-ntest]
    teX = X[-ntest:]
    trY = Y[:-ntest]
    teY = Y[-ntest:]
    return trX, teX, trY, teY

trX, teX, trY, teY = load_gender_data(ntrain=10000)  # Can increase up to 250K or so

tokenizer = Tokenizer(min_df=10, max_features=50000)
print trX[1]  # see a blog example
trX = tokenizer.fit_transform(trX)
teX = tokenizer.transform(teX)
print tokenizer.inverse_transform(trX[1:2]) #see what words are kept
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce is classification loss for binary classification and sigmoid output
for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
@@ -58,4 +57,4 @@ def load_gender_data(ntrain=10000, ntest=10000):
tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

print tr_acc, te_acc
44 changes: 44 additions & 0 deletions examples/newsgroup.py
@@ -0,0 +1,44 @@
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)

print len(newsgroups_train.data), len(newsgroups_test.data)

from sklearn import metrics
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save

tokenizer = Tokenizer(min_df=10, max_features=50000)
X_train = tokenizer.fit_transform(newsgroups_train.data)
X_test = tokenizer.transform(newsgroups_test.data)
Y_train = newsgroups_train.target
Y_test = newsgroups_test.target

print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce') # bce is classification loss for binary classification and sigmoid output
for i in range(2):
    model.fit(X_train, Y_train, n_epochs=1)
    tr_preds = model.predict(X_train[:len(Y_test)])
    te_preds = model.predict(X_test)

    tr_acc = metrics.accuracy_score(Y_train[:len(Y_test)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5)

    print i, tr_acc, te_acc  # dataset too small to fully utilize Passage

save(model, 'model.pkl')
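
A model saved this way can be restored later with the matching loader, as in the README example:

```
from passage.utils import load
model = load('model.pkl')
```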
17 changes: 16 additions & 1 deletion passage/models.py
@@ -44,7 +44,7 @@ def __init__(self, layers, cost, updater='Adam', verbose=2, Y=T.matrix(), iterat
        self.y_tr = self.layers[-1].output(dropout_active=True)
        self.y_te = self.layers[-1].output(dropout_active=False)
        self.Y = Y

        cost = self.cost(self.Y, self.y_tr)
        self.updates = self.updater.get_updates(self.params, cost)

@@ -53,6 +53,21 @@ def __init__(self, layers, cost, updater='Adam', verbose=2, Y=T.matrix(), iterat
        self._predict = theano.function([self.X], self.y_te)

    def fit(self, trX, trY, batch_size=64, n_epochs=1, len_filter=LenFilter(), snapshot_freq=1, path=None):
        """Train model on given training examples and return the list of costs after each minibatch is processed.

        Args:
            trX (list) -- Inputs
            trY (list) -- Outputs
            batch_size (int, optional) -- number of examples in a minibatch (default 64)
            n_epochs (int, optional) -- number of epochs to train for (default 1)
            len_filter (object, optional) -- object to filter training examples by length (default LenFilter())
            snapshot_freq (int, optional) -- number of epochs between saving model snapshots (default 1)
            path (str, optional) -- prefix of path where model snapshots are saved;
                if None, no snapshots are saved (default None)

        Returns:
            list -- costs of model after processing each minibatch
        """
        if len_filter is not None:
            trX, trY = len_filter.filter(trX, trY)
        trY = standardize_targets(trY, cost=self.cost)
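
For reference, a minimal sketch of calling this `fit` signature (`model`, `trX`, and `trY` follow the README example; the snapshot path prefix is hypothetical):

```
costs = model.fit(trX, trY, batch_size=64, n_epochs=2,
                  snapshot_freq=1, path='snapshots/model')
```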
7 changes: 4 additions & 3 deletions setup.py
@@ -1,12 +1,13 @@
from setuptools import setup, find_packages

setup(
    name='passage',
    version='0.2.4',
    packages=find_packages(),
    description="""
    A little library for text analysis with RNNs.
    """,
    long_description=open('README.rst').read(),
    license="MIT License (See LICENSE)",
    url="https://github.com/IndicoDataSolutions/Passage",
    author="Alec Radford, Madison May",
@@ -18,4 +19,4 @@
"numpy >= 1.8.1",
"Theano >= 0.6.0",
],
)
)
