minor edit

danieljf24 · May 8, 2018 · df6ac1a · df6ac1a
1 parent 1c24caf
commit df6ac1a
Show file tree

Hide file tree

Showing 21 changed files with 193 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+*.pyc
+*~
+*.swp
+*.swo
diff --git a/.ipynb_checkpoints/w2vv_representation-checkpoint.ipynb b/.ipynb_checkpoints/w2vv_representation-checkpoint.ipynb
@@ -0,0 +1,186 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Word2VisualVec for Sentence Representation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This note answers the following two questions:\n",
+    "1. How to load a trained Word2VisualVec model?\n",
+    "2. How to predict visual features from a new sentence?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 0. Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use the following script to download and extract a Word2VisalVec model trained on flickr30k.\n",
+    "Notice that please refer to [here](https://github.com/danieljf24/w2vv#required-data) to download the dataset \n",
+    "\n",
+    "\n",
+    "```shell\n",
+    "ROOTPATH=$HOME/trained_w2vv_model\n",
+    "mkdir -p $ROOTPATH && cd $ROOTPATH\n",
+    "\n",
+    "# download and extract the pre-trained model\n",
+    "wget http://lixirong.net/data/w2vv-tmm2018/flickr30k_trained_model.tar.gz\n",
+    "tar zxf flickr30k_trained_model.tar.gz\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using TensorFlow backend.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import keras\n",
+    "from basic.common import readPkl\n",
+    "from w2vv_pred import W2VV_MS_pred\n",
+    "from util.text import encode_text\n",
+    "from util.text2vec import get_text_encoder"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Load a trained Word2Visual model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "05/05/2018 21:41:43 INFO [w2vv_pred.pyc.W2VV_MS_pred] loaded a trained Word2VisualVec model successfully\n"
+     ]
+    }
+   ],
+   "source": [
+    "model_path = os.path.join(os.environ['HOME'],'trained_w2vv_model/flickr30k_trained_model')\n",
+    "abs_model_path = os.path.join(model_path, 'model.json')\n",
+    "weight_path = os.path.join(model_path, 'best_model.h5')\n",
+    "predictor = W2VV_MS_pred(abs_model_path, weight_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Predict visual features of a novel sentence"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "05/05/2018 21:42:11 INFO [util/text2vec.pyc.Index2Vec] initializing ...\n",
+      "05/05/2018 21:42:11 INFO [util/text2vec.pyc.BoW2VecFilterStop] initializing ...\n",
+      "05/05/2018 21:42:11 INFO [util/text2vec.pyc.BoW2VecFilterStop] 7253 words\n",
+      "05/05/2018 21:42:11 INFO [util/text2vec.pyc.AveWord2VecFilterStop] initializing ...\n",
+      "[BigFile] 1743364x500 instances loaded from /home/daniel/VisualSearch/word2vec/flickr/vec500flickr30m\n"
+     ]
+    }
+   ],
+   "source": [
+    "# setup multi-scale sentence vectorization\n",
+    "trainCollection='flickr30kenctrain'\n",
+    "opt = readPkl(os.path.join(model_path, 'option.pkl'))\n",
+    "rootpath=os.path.join(os.environ['HOME'],'VisualSearch')\n",
+    "rnn_style, bow_style, w2v_style = opt.text_style.strip().split('@')\n",
+    "text_data_path = os.path.join(rootpath, trainCollection, \"TextData\", \"vocabulary\", \"bow\", opt.rnn_vocab)\n",
+    "bow_data_path = os.path.join(rootpath, trainCollection, \"TextData\", \"vocabulary\", bow_style, opt.bow_vocab)\n",
+    "w2v_data_path = os.path.join(rootpath, \"word2vec\", opt.corpus,  opt.word2vec)\n",
+    "\n",
+    "text2vec = get_text_encoder(rnn_style)(text_data_path)\n",
+    "bow2vec = get_text_encoder(bow_style)(bow_data_path)\n",
+    "w2v2vec = get_text_encoder(w2v_style)(w2v_data_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2048\n",
+      "[ 0.30943465  0.29305869  0.40463841 ...,  0.90311915  0.62051922\n",
+      "  0.58120167]\n"
+     ]
+    }
+   ],
+   "source": [
+    "sent='a dog is playing with a cat'\n",
+    "rnn_vec, bow_w2v_vec = encode_text(opt,text2vec,bow2vec,w2v2vec,sent)\n",
+    "predicted_text_feat = predictor.predict_one(rnn_vec,bow_w2v_vec)\n",
+    "print len(predicted_text_feat)\n",
+    "print predicted_text_feat"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/README.md b/README.md
@@ -1,5 +1,4 @@
 # Word2VisualVec
----
 
 This package is a python implementation of image caption retrieval using Word2VisualVec. [Word2VisualVec](https://ieeexplore.ieee.org/document/8353472/) is a deep neural network architecture that learns to predict a deep visual feature of textual input based on multi-scale sentence vectorization and a multi-layer perceptron, which has been already successfully used in image(video) caption retrieval [1] and video captioning [2]. Moreover, it is also our winning solution for Matching and Ranking subtask of Video to Text Description in TRECVID 2016 [3].
 
@@ -14,7 +13,7 @@ This package is a python implementation of image caption retrieval using Word2Vi
 * **pydot** for keras visualization
 * **tensorboard_logger** for tensorboard visualization
 
-We have used virtualenv to setup a deep learning workspace that supports keras with TensorFlow backend.
+We used virtualenv to setup a deep learning workspace that supports keras with TensorFlow backend.
 Run the following script to install the required packages.
 ```shell
 virtualenv --system-site-packages ~/w2vv
@@ -132,4 +131,4 @@ Word2VisualVec essentially generates a dense representation of a given sentence.
 ## References
 1. Jianfeng Dong, Xirong Li and Cees G. M. Snoek. [Predicting Visual Features from Text for Image and Video Caption Retrieval](https://ieeexplore.ieee.org/document/8353472/). IEEE Transactions on Multimedia, 2018.
 2. Jianfeng Dong, Xirong Li, Weiyu Lan, Yujia Huo, and Cees G. M. Snoek. [Early Embedding and Late Reranking for Video Captioning](http://dl.acm.org/citation.cfm?id=2984064). ACM Multimedia, 2016 (Grand Challenge Award Paper).
-3. Cees G. M. Snoek, Jianfeng Dong, Xirong Li, Xiaoxu Wang, Qijie Wei, Weiyu Lan, Efstratios Gavves, Noureldien Hussein, Dennis C. Koelma, Arnold W. M. Smeulders. [University of Amsterdam and Renmin University at TRECVID 2016: Searching Video, Detecting Events and Describing Video](https://www-nlpir.nist.gov/projects/tvpubs/tv16.papers/mediamill.pdf). TRECVID Workshop, 2016.
+3. Cees G. M. Snoek, Jianfeng Dong, Xirong Li, Xiaoxu Wang, Qijie Wei, Weiyu Lan, Efstratios Gavves, Noureldien Hussein, Dennis C. Koelma, Arnold W. M. Smeulders. [University of Amsterdam and Renmin University at TRECVID 2016: Searching Video, Detecting Events and Describing Video](https://www-nlpir.nist.gov/projects/tvpubs/tv16.papers/mediamill.pdf). TRECVID Workshop, 2016.
diff --git a/basic/__init__.pyc b/basic/__init__.pyc
diff --git a/basic/common.pyc b/basic/common.pyc
diff --git a/basic/constant.pyc b/basic/constant.pyc
diff --git a/basic/metric.pyc b/basic/metric.pyc
diff --git a/simpleknn/__init__.pyc b/simpleknn/__init__.pyc
diff --git a/simpleknn/bigfile.pyc b/simpleknn/bigfile.pyc
diff --git a/simpleknn/txt2bin.pyc b/simpleknn/txt2bin.pyc
diff --git a/util/__init__.pyc b/util/__init__.pyc
diff --git a/util/dataset.pyc b/util/dataset.pyc
diff --git a/util/eval_perf.pyc b/util/eval_perf.pyc
diff --git a/util/evaluation.pyc b/util/evaluation.pyc
diff --git a/util/losser.pyc b/util/losser.pyc
diff --git a/util/text.pyc b/util/text.pyc
diff --git a/util/text2vec.pyc b/util/text2vec.pyc
diff --git a/util/util.pyc b/util/util.pyc
diff --git a/w2vv.pyc b/w2vv.pyc
diff --git a/w2vv_pred.pyc b/w2vv_pred.pyc
diff --git a/w2vv_representation.ipynb b/w2vv_representation.ipynb
@@ -28,7 +28,7 @@
    "metadata": {},
    "source": [
     "Use the following script to download and extract a Word2VisalVec model trained on flickr30k.\n",
-    "Notice that please refer to [here](https://gitee.com/danieljf24/w2vv#required-data) to download the dataset \n",
+    "Notice that please refer to [here](https://github.com/danieljf24/w2vv#required-data) to download the dataset \n",
     "\n",
     "\n",
     "```shell\n",