In [None]:
# %load run_all.py

"""
__file__

	run_all.py

__description__
	
	This file generates all the features in one shot.

__author__

	Chenglong Chen < c.chenglong@gmail.com >

"""

import os

#################
## Preprocesss ##
#################
#### preprocess data
cmd = "python ./preprocess.py"
os.system(cmd)

# #### generate kfold
# cmd = "python ./gen_kfold.py"
# os.system(cmd)

#######################
## Generate features ##
#######################
#### query id
cmd = "python ./genFeat_id_feat.py"
os.system(cmd)

#### counting feat
cmd = "python ./genFeat_counting_feat.py"
os.system(cmd)

#### distance feat
cmd = "python ./genFeat_distance_feat.py"
os.system(cmd)

#### basic tfidf
cmd = "python ./genFeat_basic_tfidf_feat.py"
os.system(cmd)

#### cooccurrence tfidf
cmd = "python ./genFeat_cooccurrence_tfidf_feat.py"
os.system(cmd)


#####################
## Combine Feature ##
#####################
#### combine feat
cmd = "python ./combine_feat_[LSA_and_stats_feat_Jun09]_[Low].py"
os.system(cmd)

#### combine feat
cmd = "python ./combine_feat_[LSA_svd150_and_Jaccard_coef_Jun14]_[Low].py"
os.system(cmd)

#### combine feat
cmd = "python ./combine_feat_[svd100_and_bow_Jun23]_[Low].py"
os.system(cmd)

#### combine feat
cmd = "python ./combine_feat_[svd100_and_bow_Jun27]_[High].py"
os.system(cmd)

In [6]:
cmd = '''git add -A
git commit -m"add data"
git push'''
os.system(cmd)

0

In [3]:
import os

In [1]:
# %load preprocess.py

"""
__file__

    preprocess.py

__description__

    This file preprocesses data.

__author__

    Chenglong Chen
    
"""

import sys
import cPickle
import numpy as np
import pandas as pd
from nlp_utils import clean_text, pos_tag_text
sys.path.append("../")
from param_config import config

###############
## Load Data ##
###############
print("Load data...")

dfTrain = pd.read_csv(config.original_train_data_path).fillna("")
dfTest = pd.read_csv(config.original_test_data_path).fillna("")
# number of train/test samples
num_train, num_test = dfTrain.shape[0], dfTest.shape[0]

print("Done.")


######################
## Pre-process Data ##
######################
print("Pre-process data...")

## insert fake label for test
dfTest["median_relevance"] = np.ones((num_test))
dfTest["relevance_variance"] = np.zeros((num_test))

## insert sample index
dfTrain["index"] = np.arange(num_train)
dfTest["index"] = np.arange(num_test)

## one-hot encode the median_relevance
for i in range(config.n_classes):
    dfTrain["median_relevance_%d" % (i+1)] = 0
    dfTrain["median_relevance_%d" % (i+1)][dfTrain["median_relevance"]==(i+1)] = 1
    
## query ids
qid_dict = dict()
for i,q in enumerate(np.unique(dfTrain["query"]), start=1):
    qid_dict[q] = i
    
## insert query id
dfTrain["qid"] = map(lambda q: qid_dict[q], dfTrain["query"])
dfTest["qid"] = map(lambda q: qid_dict[q], dfTest["query"])

## clean text
clean = lambda line: clean_text(line, drop_html_flag=config.drop_html_flag)
dfTrain = dfTrain.apply(clean, axis=1)
dfTest = dfTest.apply(clean, axis=1)

print("Done.")


###############
## Save Data ##
###############
print("Save data...")

with open(config.processed_train_data_path, "wb") as f:
    cPickle.dump(dfTrain, f, -1)
with open(config.processed_test_data_path, "wb") as f:
    cPickle.dump(dfTest, f, -1)
    
print("Done.")


"""
## pos tag text
dfTrain = dfTrain.apply(pos_tag_text, axis=1)
dfTest = dfTest.apply(pos_tag_text, axis=1)
with open(config.pos_tagged_train_data_path, "wb") as f:
    cPickle.dump(dfTrain, f, -1)
with open(config.pos_tagged_test_data_path, "wb") as f:
    cPickle.dump(dfTest, f, -1)
print("Done.")
"""

Load data...
Done.

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Sou


Pre-process data...
Done.
Save data...
Done.


'\n## pos tag text\ndfTrain = dfTrain.apply(pos_tag_text, axis=1)\ndfTest = dfTest.apply(pos_tag_text, axis=1)\nwith open(config.pos_tagged_train_data_path, "wb") as f:\n    cPickle.dump(dfTrain, f, -1)\nwith open(config.pos_tagged_test_data_path, "wb") as f:\n    cPickle.dump(dfTest, f, -1)\nprint("Done.")\n'

In [4]:
cmd = '''git add -A
git commit -m"add data"
git push'''
os.system(cmd)

0

In [None]:
# %load genFeat_counting_feat.py

"""
__file__

    genFeat_counting_feat.py

__description__

    This file generates the following features for each run and fold, and for the entire training and testing set.

        1. Basic Counting Features
            
            1. Count of n-gram in query/title/description

            2. Count & Ratio of Digit in query/title/description

            3. Count & Ratio of Unique n-gram in query/title/description

        2. Intersect Counting Features

            1. Count & Ratio of a's n-gram in b's n-gram

        3. Intersect Position Features

            1. Statistics of Positions of a's n-gram in b's n-gram

            2. Statistics of Normalized Positions of a's n-gram in b's n-gram

__author__

    Chenglong Chen < c.chenglong@gmail.com >

"""

import re
import sys
import ngram
import cPickle
import numpy as np
from nlp_utils import stopwords, english_stemmer, stem_tokens
from feat_utils import try_divide, dump_feat_name
sys.path.append("../")
from param_config import config



def get_position_list(target, obs):
    """
        Get the list of positions of obs in target
    """
    pos_of_obs_in_target = [0]
    if len(obs) != 0:
        pos_of_obs_in_target = [j for j,w in enumerate(obs, start=1) if w in target]
        if len(pos_of_obs_in_target) == 0:
            pos_of_obs_in_target = [0]
    return pos_of_obs_in_target


######################
## Pre-process data ##
######################
token_pattern = r"(?u)\b\w\w+\b"
#token_pattern = r'\w{1,}'
#token_pattern = r"\w+"
#token_pattern = r"[\w']+"
def preprocess_data(line,
                    token_pattern=token_pattern,
                    exclude_stopword=config.cooccurrence_word_exclude_stopword,
                    encode_digit=False):
    token_pattern = re.compile(token_pattern, flags = re.UNICODE | re.LOCALE)
    ## tokenize
    tokens = [x.lower() for x in token_pattern.findall(line)]
    ## stem
    tokens_stemmed = stem_tokens(tokens, english_stemmer)
    if exclude_stopword:
        tokens_stemmed = [x for x in tokens_stemmed if x not in stopwords]
    return tokens_stemmed


def extract_feat(df):
    ## unigram
    print "generate unigram"
    df["query_unigram"] = list(df.apply(lambda x: preprocess_data(x["query"]), axis=1))
    df["title_unigram"] = list(df.apply(lambda x: preprocess_data(x["product_title"]), axis=1))
    df["description_unigram"] = list(df.apply(lambda x: preprocess_data(x["product_description"]), axis=1))
    ## bigram
    print "generate bigram"
    join_str = "_"
    df["query_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["query_unigram"], join_str), axis=1))
    df["title_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["title_unigram"], join_str), axis=1))
    df["description_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["description_unigram"], join_str), axis=1))
    ## trigram
    print "generate trigram"
    join_str = "_"
    df["query_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["query_unigram"], join_str), axis=1))
    df["title_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["title_unigram"], join_str), axis=1))
    df["description_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["description_unigram"], join_str), axis=1))


    ################################
    ## word count and digit count ##
    ################################
    print "generate word counting features"
    feat_names = ["query", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]
    count_digit = lambda x: sum([1. for w in x if w.isdigit()])
    for feat_name in feat_names:
        for gram in grams:
            ## word count
            df["count_of_%s_%s"%(feat_name,gram)] = list(df.apply(lambda x: len(x[feat_name+"_"+gram]), axis=1))
            df["count_of_unique_%s_%s"%(feat_name,gram)] = list(df.apply(lambda x: len(set(x[feat_name+"_"+gram])), axis=1))
            df["ratio_of_unique_%s_%s"%(feat_name,gram)] = map(try_divide, df["count_of_unique_%s_%s"%(feat_name,gram)], df["count_of_%s_%s"%(feat_name,gram)])

        ## digit count
        df["count_of_digit_in_%s"%feat_name] = list(df.apply(lambda x: count_digit(x[feat_name+"_unigram"]), axis=1))
        df["ratio_of_digit_in_%s"%feat_name] = map(try_divide, df["count_of_digit_in_%s"%feat_name], df["count_of_%s_unigram"%(feat_name)])

    ## description missing indicator
    df["description_missing"] = list(df.apply(lambda x: int(x["description_unigram"] == ""), axis=1))


    ##############################
    ## intersect word count ##
    ##############################
    print "generate intersect word counting features"
    #### unigram
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name != obs_name:
                    ## query
                    df["count_of_%s_%s_in_%s"%(obs_name,gram,target_name)] = list(df.apply(lambda x: sum([1. for w in x[obs_name+"_"+gram] if w in set(x[target_name+"_"+gram])]), axis=1))
                    df["ratio_of_%s_%s_in_%s"%(obs_name,gram,target_name)] = map(try_divide, df["count_of_%s_%s_in_%s"%(obs_name,gram,target_name)], df["count_of_%s_%s"%(obs_name,gram)])

        ## some other feat
        df["title_%s_in_query_div_query_%s"%(gram,gram)] = map(try_divide, df["count_of_title_%s_in_query"%gram], df["count_of_query_%s"%gram])
        df["title_%s_in_query_div_query_%s_in_title"%(gram,gram)] = map(try_divide, df["count_of_title_%s_in_query"%gram], df["count_of_query_%s_in_title"%gram])
        df["description_%s_in_query_div_query_%s"%(gram,gram)] = map(try_divide, df["count_of_description_%s_in_query"%gram], df["count_of_query_%s"%gram])
        df["description_%s_in_query_div_query_%s_in_description"%(gram,gram)] = map(try_divide, df["count_of_description_%s_in_query"%gram], df["count_of_query_%s_in_description"%gram])


    ######################################
    ## intersect word position feat ##
    ######################################
    print "generate intersect word position features"
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name != obs_name:
                    pos = list(df.apply(lambda x: get_position_list(x[target_name+"_"+gram], obs=x[obs_name+"_"+gram]), axis=1))
                    ## stats feat on pos
                    df["pos_of_%s_%s_in_%s_min" % (obs_name, gram, target_name)] = map(np.min, pos)
                    df["pos_of_%s_%s_in_%s_mean" % (obs_name, gram, target_name)] = map(np.mean, pos)
                    df["pos_of_%s_%s_in_%s_median" % (obs_name, gram, target_name)] = map(np.median, pos)
                    df["pos_of_%s_%s_in_%s_max" % (obs_name, gram, target_name)] = map(np.max, pos)
                    df["pos_of_%s_%s_in_%s_std" % (obs_name, gram, target_name)] = map(np.std, pos)
                    ## stats feat on normalized_pos
                    df["normalized_pos_of_%s_%s_in_%s_min" % (obs_name, gram, target_name)] = map(try_divide, df["pos_of_%s_%s_in_%s_min" % (obs_name, gram, target_name)], df["count_of_%s_%s" % (obs_name, gram)])
                    df["normalized_pos_of_%s_%s_in_%s_mean" % (obs_name, gram, target_name)] = map(try_divide, df["pos_of_%s_%s_in_%s_mean" % (obs_name, gram, target_name)], df["count_of_%s_%s" % (obs_name, gram)])
                    df["normalized_pos_of_%s_%s_in_%s_median" % (obs_name, gram, target_name)] = map(try_divide, df["pos_of_%s_%s_in_%s_median" % (obs_name, gram, target_name)], df["count_of_%s_%s" % (obs_name, gram)])
                    df["normalized_pos_of_%s_%s_in_%s_max" % (obs_name, gram, target_name)] = map(try_divide, df["pos_of_%s_%s_in_%s_max" % (obs_name, gram, target_name)], df["count_of_%s_%s" % (obs_name, gram)])
                    df["normalized_pos_of_%s_%s_in_%s_std" % (obs_name, gram, target_name)] = map(try_divide, df["pos_of_%s_%s_in_%s_std" % (obs_name, gram, target_name)] , df["count_of_%s_%s" % (obs_name, gram)])


if __name__ == "__main__":

    ###############
    ## Load Data ##
    ###############
    ## load data
    with open(config.processed_train_data_path, "rb") as f:
        dfTrain = cPickle.load(f)
    with open(config.processed_test_data_path, "rb") as f:
        dfTest = cPickle.load(f)
    ## load pre-defined stratified k-fold index
    with open("%s/stratifiedKFold.%s.pkl" % (config.data_folder, config.stratified_label), "rb") as f:
            skf = cPickle.load(f)

    ## file to save feat names
    feat_name_file = "%s/counting.feat_name" % config.feat_folder


    #######################
    ## Generate Features ##
    #######################
    print("==================================================")
    print("Generate counting features...")


    extract_feat(dfTrain)
    feat_names = [
        name for name in dfTrain.columns \
            if "count" in name \
            or "ratio" in name \
            or "div" in name \
            or "pos_of" in name
    ]
    feat_names.append("description_missing")


    print("For cross-validation...")
    for run in range(config.n_runs):
        ## use 33% for training and 67 % for validation
        ## so we switch trainInd and validInd
        for fold, (validInd, trainInd) in enumerate(skf[run]):
            print("Run: %d, Fold: %d" % (run+1, fold+1))
            path = "%s/Run%d/Fold%d" % (config.feat_folder, run+1, fold+1)
              
            #########################
            ## get word count feat ##
            #########################
            for feat_name in feat_names:
                X_train = dfTrain[feat_name].values[trainInd]
                X_valid = dfTrain[feat_name].values[validInd]
                with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(X_train, f, -1)
                with open("%s/valid.%s.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(X_valid, f, -1)
    print("Done.")


    print("For training and testing...")
    path = "%s/All" % config.feat_folder
    ## use full version for X_train
    extract_feat(dfTest)
    for feat_name in feat_names:
        X_train = dfTrain[feat_name].values
        X_test = dfTest[feat_name].values
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_train, f, -1)
        with open("%s/test.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_test, f, -1)
            
    ## save feat names
    print("Feature names are stored in %s" % feat_name_file)
    ## dump feat name
    dump_feat_name(feat_names, feat_name_file)

    print("All Done.")

Generate counting features...
generate unigram
generate bigram
generate trigram
generate word counting features
generate intersect word counting features
generate intersect word position features
For cross-validation...
Run: 1, Fold: 1
Run: 1, Fold: 2
Run: 1, Fold: 3
Run: 2, Fold: 1
Run: 2, Fold: 2
Run: 2, Fold: 3
Run: 3, Fold: 1
Run: 3, Fold: 2
Run: 3, Fold: 3
Done.
For training and testing...
generate unigram
generate bigram
generate trigram
generate word counting features
generate intersect word counting features
generate intersect word position features

In [None]:
cmd = '''git add -A
git commit -m"add data"
git push'''
os.system(cmd)

In [None]:
# %load genFeat_basic_tfidf_feat.py

"""
__file__

    genFeat_basic_tfidf_feat.py

__description__

    This file generates the following features for each run and fold, and for the entire training and testing set.

        1. basic tfidf features for query/title/description
            - use common vocabulary among query/title/description for further computation of cosine similarity

        2. cosine similarity between query & title, query & description, title & description pairs
            - just plain cosine similarity

        3. cosine similarity stats features for title/description
            - computation is carried out with regard to a pool of samples grouped by:
                - median_relevance (#4)
                - query (qid) & median_relevance (#4)
            - cosine similarity for the following pairs are computed for each sample
                - sample title        vs.  pooled sample titles
                - sample description  vs.  pooled sample descriptions
                Note that in the pool samples, we exclude the current sample being considered.
            - stats features include quantiles of cosine similarity and others defined in the variable "stats_func", e.g.,
                - mean value
                - standard deviation (std)
                - more can be added, e.g., moment features etc

        4. SVD version of the above features

__author__

    Chenglong Chen < c.chenglong@gmail.com >

"""

import sys
import cPickle
import numpy as np
import pandas as pd
from copy import copy
from scipy.sparse import vstack
from nlp_utils import getTFV, getBOW
from feat_utils import get_sample_indices_by_relevance, dump_feat_name
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
sys.path.append("../")
from param_config import config


stats_feat_flag = True


#####################
## Helper function ##
#####################
## compute cosine similarity
def cosine_sim(x, y):
    try:
        d = cosine_similarity(x, y)
        d = d[0][0]
    except:
        print x
        print y
        d = 0.
    return d

## generate distance stats feat
def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test, indices_dict, qids_test=None):
    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num*config.n_classes), dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test), stats_feat_num*config.n_classes), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)

    for i in range(len(ids_test)):
        id = ids_test[i]
        if qids_test is not None:
            qid = qids_test[i]
        for j in range(config.n_classes):
            key = (qid, j+1) if qids_test is not None else j+1
            if indices_dict.has_key(key):
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ ind for ind in inds if id != ids_train[ind] ]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [ func(sim_tmp) for func in stats_func ]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i,j*stats_feat_num:(j+1)*stats_feat_num] = feat
    return stats_feat


## extract all features
def extract_feat(path, dfTrain, dfTest, mode, feat_names, column_names):

    new_feat_names = copy(feat_names)
    ## first fit a bow/tfidf on the all_text to get
    ## the common vocabulary to ensure query/title/description
    ## has the same length bow/tfidf for computing the similarity
    if vocabulary_type == "common":
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range)
        vec.fit(dfTrain["all_text"])
        vocabulary = vec.vocabulary_
    elif vocabulary_type == "individual":
        vocabulary = None
    for feat_name,column_name in zip(feat_names, column_names):

        ##########################
        ## basic bow/tfidf feat ##
        ##########################
        print "generate %s feat for %s" % (vec_type, column_name)
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range, vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range, vocabulary=vocabulary)
        X_train = vec.fit_transform(dfTrain[column_name])
        X_test = vec.transform(dfTest[column_name])
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_train, f, -1)
        with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "wb") as f:
            cPickle.dump(X_test, f, -1)
        
        if stats_feat_flag:
            #####################################
            ## bow/tfidf cosine sim stats feat ##
            #####################################
            ## get the indices of pooled samples
            relevance_indices_dict = get_sample_indices_by_relevance(dfTrain)
            query_relevance_indices_dict = get_sample_indices_by_relevance(dfTrain, "qid")
            ## skip query part
            if column_name in ["product_title", "product_description"]:
                print "generate %s stats feat for %s" % (vec_type, column_name)
                ## train
                cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat("cosine", X_train, dfTrain["id"].values,
                                                                    X_train, dfTrain["id"].values,
                                                                    relevance_indices_dict)
                cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat("cosine", X_train, dfTrain["id"].values,
                                                                            X_train, dfTrain["id"].values,
                                                                            query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                with open("%s/train.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                ## test
                cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat("cosine", X_train, dfTrain["id"].values,
                                                                    X_test, dfTest["id"].values,
                                                                    relevance_indices_dict)
                cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat("cosine", X_train, dfTrain["id"].values,
                                                                            X_test, dfTest["id"].values,
                                                                            query_relevance_indices_dict, dfTest["qid"].values)
                with open("%s/%s.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                with open("%s/%s.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)

                ## update feat names
                new_feat_names.append( "%s_cosine_sim_stats_feat_by_relevance" % feat_name )
                new_feat_names.append( "%s_cosine_sim_stats_feat_by_query_relevance" % feat_name )


    #####################
    ## cosine sim feat ##
    #####################
    for i in range(len(feat_names)-1):
        for j in range(i+1,len(feat_names)):
            print "generate common %s cosine sim feat for %s and %s" % (vec_type, feat_names[i], feat_names[j])
            for mod in ["train", mode]:
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[i]), "rb") as f:
                    target_vec = cPickle.load(f)
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[j]), "rb") as f:
                    obs_vec = cPickle.load(f)
                sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:,np.newaxis]
                ## dump feat
                with open("%s/%s.%s_%s_%s_cosine_sim.feat.pkl" % (path, mod, feat_names[i], feat_names[j], vec_type), "wb") as f:
                    cPickle.dump(sim, f, -1)
            ## update feat names
            new_feat_names.append( "%s_%s_%s_cosine_sim" % (feat_names[i], feat_names[j], vec_type))


    ##################
    ## SVD features ##
    ##################
    ## we fit svd use stacked query/title/description bow/tfidf for further cosine simalirity computation
    for i,feat_name in enumerate(feat_names):
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
            X_vec_train = cPickle.load(f)
        if i == 0:
            X_vec_all_train = X_vec_train
        else:
            X_vec_all_train = vstack([X_vec_all_train, X_vec_train])

    for n_components in svd_n_components:
        svd = TruncatedSVD(n_components=n_components, n_iter=15)
        svd.fit(X_vec_all_train)
        ## load bow/tfidf (for less coding...)
        for feat_name,column_name in zip(feat_names, column_names):
            print "generate common %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            X_svd_train = svd.transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_common_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)

            ## update feat names
            new_feat_names.append( "%s_common_svd%d" % (feat_name, n_components) )
            
            if stats_feat_flag:
                #####################################
                ## bow/tfidf-svd cosine sim stats feat ##
                #####################################
                if column_name in ["product_title", "product_description"]:
                    print "generate common %s-svd%d stats feat for %s" % (vec_type, n_components, column_name)
                    ## train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat("cosine", X_svd_train, dfTrain["id"].values,
                                                                        X_svd_train, dfTrain["id"].values,
                                                                        relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat("cosine", X_svd_train, dfTrain["id"].values,
                                                                                X_svd_train, dfTrain["id"].values,
                                                                                query_relevance_indices_dict, dfTrain["qid"].values)
                    with open("%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                    with open("%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                    ## test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat("cosine", X_svd_train, dfTrain["id"].values,
                                                                        X_svd_test, dfTest["id"].values,
                                                                        relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat("cosine", X_svd_train, dfTrain["id"].values,
                                                                                X_svd_test, dfTest["id"].values,
                                                                                query_relevance_indices_dict, dfTest["qid"].values)
                    with open("%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                    with open("%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)

                    ## update feat names
                    new_feat_names.append( "%s_common_svd%d_cosine_sim_stats_feat_by_relevance" % (feat_name, n_components) )
                    new_feat_names.append( "%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance" % (feat_name, n_components) )


        #####################
        ## cosine sim feat ##
        #####################
        for i in range(len(feat_names)-1):
            for j in range(i+1,len(feat_names)):
                print "generate common %s-svd%d cosine sim feat for %s and %s" % (vec_type, n_components, feat_names[i], feat_names[j])
                for mod in ["train", mode]:
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[i], n_components), "rb") as f:
                        target_vec = cPickle.load(f)
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[j], n_components), "rb") as f:
                        obs_vec = cPickle.load(f)
                    sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:,np.newaxis]
                    ## dump feat
                    with open("%s/%s.%s_%s_%s_common_svd%d_cosine_sim.feat.pkl" % (path, mod, feat_names[i], feat_names[j], vec_type, n_components), "wb") as f:
                        cPickle.dump(sim, f, -1)
                ## update feat names
                new_feat_names.append( "%s_%s_%s_common_svd%d_cosine_sim" % (feat_names[i], feat_names[j], vec_type, n_components))

        #########################
        ## Individual SVD feat ##
        #########################
        ## generate individual svd feat
        for feat_name,column_name in zip(feat_names, column_names):
            print "generate individual %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            svd = TruncatedSVD(n_components=n_components, n_iter=15)
            X_svd_train = svd.fit_transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_individual_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_individual_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            ## update feat names
            new_feat_names.append( "%s_individual_svd%d" % (feat_name, n_components) )

            if stats_feat_flag:
                #########################################
                ## bow/tfidf-svd cosine sim stats feat ##
                #########################################
                if column_name in ["product_title", "product_description"]:
                    print "generate individual %s-svd%d stats feat for %s" % (vec_type, n_components, column_name)
                    ## train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat("cosine", X_svd_train, dfTrain["id"].values,
                                                                        X_svd_train, dfTrain["id"].values,
                                                                        relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat("cosine", X_svd_train, dfTrain["id"].values,
                                                                                X_svd_train, dfTrain["id"].values,
                                                                                query_relevance_indices_dict, dfTrain["qid"].values)
                    with open("%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                    with open("%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                    ## test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat("cosine", X_svd_train, dfTrain["id"].values,
                                                                        X_svd_test, dfTest["id"].values,
                                                                        relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat("cosine", X_svd_train, dfTrain["id"].values,
                                                                                X_svd_test, dfTest["id"].values,
                                                                                query_relevance_indices_dict, dfTest["qid"].values)
                    with open("%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                    with open("%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)

                    ## update feat names
                    new_feat_names.append( "%s_individual_svd%d_cosine_sim_stats_feat_by_relevance" % (feat_name, n_components) )
                    new_feat_names.append( "%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance" % (feat_name, n_components) )

    """
    #########################
    ## bow/tfidf-tsne feat ##
    #########################
    ## generate t-sne feat
    for n_components in tsne_n_components:
        for feat_name,column_name in zip(feat_names, column_names):
            print "generate individual %s-tsne%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            tsne = TSNE(n_components=n_components, init='pca', random_state=2015, metric="cosine")
            X = vstack([X_vec_train, X_vec_test])
            Y = tsne.fit_transform(X)
            num_train = X_vec_train.shape[0]
            X_tsne_train = Y[:num_train]
            X_tsne_test = Y[num_train:]
            with open("%s/train.%s_individual_tsne%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_train, f, -1)
            with open("%s/%s.%s_individual_tsne%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_test, f, -1)

            ##################################################
            ## bow/tfidf-tsne euclidean distance stats feat ##
            ##################################################
            if column_name in ["product_title", "product_description"]:
                print "generate individual %s-tsne%d stats feat for %s" % (vec_type, n_components, column_name)
                ## train
                euclidean_dist_stats_feat_by_relevance_train = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                    X_tsne_train, dfTrain["id"].values,
                                                                    relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_train = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                            X_tsne_train, dfTrain["id"].values,
                                                                            query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_train, f, -1)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_train, f, -1)
                ## test
                euclidean_dist_stats_feat_by_relevance_test = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                    X_tsne_test, dfTest["id"].values,
                                                                    relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_test = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                            X_tsne_test, dfTest["id"].values,
                                                                            query_relevance_indices_dict, dfTest["qid"].values)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_test, f, -1)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_test, f, -1)

                ## update feat names
                new_feat_names.append( "%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance" % (feat_name, n_components) )
                new_feat_names.append( "%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance" % (feat_name, n_components) )
    """

    return new_feat_names


if __name__ == "__main__":
    ## Driver for the basic bow/tfidf features: builds the settings, loads the
    ## processed data and the k-fold split, then calls extract_feat once per
    ## (run, fold) and once for the full train/test set, for each vector type.
    ## NOTE(review): the call to extract_feat below does not pass vec_type, the
    ## svd/tsne settings or the stats settings, yet the function body references
    ## vec_type; these names appear to be read as module globals, so they should
    ## not be renamed without checking the functions defined earlier.

    ############
    ## Config ##
    ############
    ## stats to extract
    ## np.arange(0, 1.5, 0.5) yields the three quantile levels 0, 0.5 and 1
    quantiles_range = np.arange(0, 1.5, 0.5)
    stats_func = [ np.mean, np.std ]
    ## one value per quantile level plus one per stats function (3 + 2 = 5)
    stats_feat_num = len(quantiles_range) + len(stats_func)

    ## tfidf config
    vec_types = [ "tfidf", "bow" ]
    ngram_range = config.basic_tfidf_ngram_range
    vocabulary_type = config.basic_tfidf_vocabulary_type
    svd_n_components = [100, 150]
    tsne_n_components = [2]

    ## feat name config
    column_names = [ "query", "product_title", "product_description" ]

    ###############
    ## Load Data ##
    ###############
    ## load data
    with open(config.processed_train_data_path, "rb") as f:
        dfTrain = cPickle.load(f)
    with open(config.processed_test_data_path, "rb") as f:
        dfTest = cPickle.load(f)
    ## load pre-defined stratified k-fold index
    with open("%s/stratifiedKFold.%s.pkl" % (config.data_folder, config.stratified_label), "rb") as f:
            skf = cPickle.load(f)

    ## for fitting common vocabulary
    ## cat_text joins the three text fields of a row with single spaces
    def cat_text(x):
        res = '%s %s %s' % (x['query'], x['product_title'], x['product_description'])
        return res
    ## list(...) drops the index so the values are assigned by position
    dfTrain["all_text"] = list(dfTrain.apply(cat_text, axis=1))
    dfTest["all_text"] = list(dfTest.apply(cat_text, axis=1))

    for vec_type in vec_types:
        ## save feat names
        ## base names are rebuilt for every vec_type because the last
        ## extract_feat call below overwrites feat_names with its return value
        feat_names = [ "query", "title", "description" ]
        feat_names = [ name+"_%s_%s_vocabulary" % (vec_type, vocabulary_type) for name in feat_names ]
        ## file to save feat names
        feat_name_file = "%s/basic_%s_and_cosine_sim.feat_name" % (config.feat_folder, vec_type)

        #######################
        ## Generate Features ##
        #######################
        print("==================================================")
        print("Generate basic %s features..." % vec_type)

        print("For cross-validation...")
        for run in range(config.n_runs):
            ## use 33% for training and 67 % for validation
            ## so we switch trainInd and validInd
            ## (the first element of each split pair is bound to validInd,
            ## the second to trainInd)
            for fold, (validInd, trainInd) in enumerate(skf[run]):
                print("Run: %d, Fold: %d" % (run+1, fold+1))
                path = "%s/Run%d/Fold%d" % (config.feat_folder, run+1, fold+1)
                
                ## copies, so the per-fold frames are independent of dfTrain
                dfTrain2 = dfTrain.iloc[trainInd].copy()
                dfValid = dfTrain.iloc[validInd].copy()
                ## extract feat
                ## the returned feature-name list is ignored for CV folds
                extract_feat(path, dfTrain2, dfValid, "valid", feat_names, column_names)

        print("Done.")

        print("For training and testing...")
        path = "%s/All" % config.feat_folder
        ## extract feat
        ## here the returned list replaces feat_names and is what gets dumped
        feat_names = extract_feat(path, dfTrain, dfTest, "test", feat_names, column_names)
        ## dump feat name
        dump_feat_name(feat_names, feat_name_file)

        print("All Done.")

In [None]:
# Stage every change, commit it with the message "add data" and push; the
# three git commands are handed to the shell as one newline-separated string.
# NOTE(review): relies on `os` having been imported by an earlier cell, and the
# status returned by os.system is not checked here.
cmd = '''git add -A
git commit -m"add data"
git push'''
os.system(cmd)

In [None]:
# %load genFeat_cooccurrence_tfidf_feat.py

"""
__file__

    genFeat_cooccurrence_tfidf_feat.py

__description__

    This file generates the following features for each run and fold, and for the entire training and testing set.

        1. tfidf for the following cooccurrence terms
            - query unigram/bigram & title unigram/bigram
            - query unigram/bigram & description unigram/bigram
            - query id & title unigram/bigram
            - query id & description unigram/bigram

        2. corresponding lsa (svd) version features

__author__

    Chenglong Chen < c.chenglong@gmail.com >

"""

import re
import sys
import cPickle
import ngram
from feat_utils import dump_feat_name
from sklearn.decomposition import TruncatedSVD
from nlp_utils import stopwords, english_stemmer, stem_tokens, getTFV
sys.path.append("../")
from param_config import config

######################
## Pre-process data ##
######################
token_pattern = r"(?u)\b\w\w+\b"
#token_pattern = r'\w{1,}'
#token_pattern = r"\w+"
#token_pattern = r"[\w']+"
def preprocess_data(line,
                    token_pattern=token_pattern,
                    exclude_stopword=config.cooccurrence_word_exclude_stopword,
                    encode_digit=False):
    """Tokenize, lower-case and stem ``line``, optionally dropping stopwords.

    Args:
        line: text to split into tokens.
        token_pattern: regular expression whose matches become the tokens.
        exclude_stopword: when true, stemmed tokens contained in ``stopwords``
            are removed from the result.
        encode_digit: accepted for interface compatibility; not used here.

    Returns:
        The stemmed tokens produced by ``stem_tokens``; when
        ``exclude_stopword`` is set, a new list without the stopwords.
    """
    # NOTE(review): since Python 3.6 re.LOCALE is only accepted with bytes
    # patterns; the flag pair is kept unchanged because this file is Python 2
    # code (cPickle, print statements) -- revisit when porting.
    tokenizer = re.compile(token_pattern, flags=re.UNICODE | re.LOCALE)
    lowered = [match.lower() for match in tokenizer.findall(line)]
    stems = stem_tokens(lowered, english_stemmer)
    if not exclude_stopword:
        return stems
    return [stem for stem in stems if stem not in stopwords]


########################
## Cooccurrence terms ##
########################
def cooccurrence_terms(lst1, lst2, join_str):
    """Return the space-separated cross product of two term lists.

    Every item of ``lst1`` is combined with every item of ``lst2`` (``lst1``
    varies slowest) and each pair is written as ``item1 + join_str + item2``.

    Args:
        lst1: first list of string terms.
        lst2: second list of string terms.
        join_str: string placed between the two terms of a pair.

    Returns:
        One string with all pairs joined by single spaces; the empty string
        when either list is empty.
    """
    pairs = []
    for left in lst1:
        pairs.extend(left + join_str + right for right in lst2)
    return " ".join(pairs)


##################
## Extract feat ##
##################
def extract_feat(df):
    """Add unigram, bigram and cooccurrence-term columns to ``df`` in place.

    Reads the columns ``query``, ``product_title``, ``product_description``
    and ``qid`` and writes, in this order:

    * ``<field>_unigram`` -- ``preprocess_data`` applied to each text field;
    * ``<field>_bigram`` -- ``ngram.getBigram`` of the unigrams, joined by "_";
    * ``query_unigram_*`` / ``query_bigram_*`` / ``query_id_*`` -- the output
      of ``cooccurrence_terms`` (joined by "X") against the title/description
      unigrams and bigrams; for ``query_id_*`` the query side is the single
      term ``"qid" + str(qid)``.

    Args:
        df: pandas DataFrame holding the four input columns; it is modified
            in place.

    Returns:
        None.
    """
    # (column prefix, source text column)
    text_fields = [
        ("query", "query"),
        ("title", "product_title"),
        ("description", "product_description"),
    ]

    ## unigram
    print("generate unigram")
    # Columns are iterated directly instead of df.apply(..., axis=1); this
    # gives the same per-row values without building a row object per call and
    # also works on a frame with zero rows.
    for prefix, text_column in text_fields:
        df["%s_unigram" % prefix] = [preprocess_data(text) for text in df[text_column]]

    ## bigram
    print("generate bigram")
    join_str = "_"
    for prefix, _ in text_fields:
        df["%s_bigram" % prefix] = [
            ngram.getBigram(tokens, join_str) for tokens in df["%s_unigram" % prefix]
        ]

    ## cooccurrence terms
    join_str = "X"
    target_columns = [
        "title_unigram",
        "title_bigram",
        "description_unigram",
        "description_bigram",
    ]
    # query unigram, then query bigram
    for query_column in ["query_unigram", "query_bigram"]:
        for target_column in target_columns:
            df["%s_%s" % (query_column, target_column)] = [
                cooccurrence_terms(query_terms, target_terms, join_str)
                for query_terms, target_terms in zip(df[query_column], df[target_column])
            ]
    # query id: the query side is a one-element list holding "qid<id>"
    qid_terms = [["qid" + str(qid)] for qid in df["qid"]]
    for target_column in target_columns:
        df["query_id_%s" % target_column] = [
            cooccurrence_terms(query_terms, target_terms, join_str)
            for query_terms, target_terms in zip(qid_terms, df[target_column])
        ]

        

if __name__ == "__main__":
    ## Driver for the cooccurrence tfidf features: builds the cooccurrence term
    ## columns once, then for every (run, fold) and for the full train/test set
    ## fits a tfidf vectorizer and a truncated SVD on the training part and
    ## pickles the transformed training and evaluation matrices.

    ############
    ## Config ##
    ############
    ## cooccurrence terms column names
    column_names = [
        "query_unigram_title_unigram",
        "query_unigram_title_bigram",
        "query_unigram_description_unigram",
        "query_unigram_description_bigram",
        "query_bigram_title_unigram",
        "query_bigram_title_bigram",
        "query_bigram_description_unigram",
        "query_bigram_description_bigram",
        "query_id_title_unigram",
        "query_id_title_bigram",
        "query_id_description_unigram",
        "query_id_description_bigram",
    ]
    ## feature names
    feat_names = [ name+"_tfidf" for name in column_names ]
    ## file to save feat names
    feat_name_file = "%s/intersect_tfidf.feat_name" % config.feat_folder

    ngram_range = config.cooccurrence_tfidf_ngram_range

    svd_n_components = 100

    #############
    ## Helpers ##
    #############
    def _dump_pickle(obj, file_path):
        """Pickle ``obj`` to ``file_path`` using protocol -1 (the highest available)."""
        with open(file_path, "wb") as f:
            cPickle.dump(obj, f, -1)

    def _gen_tfidf_svd_feat(path, mode, feat_name, text_train, text_eval):
        """Fit tfidf and SVD on ``text_train`` and dump train/``mode`` features.

        Writes four pickles under ``path``: the tfidf matrices
        (``train.<feat>.feat.pkl``, ``<mode>.<feat>.feat.pkl``) followed by
        their SVD projections (``*_individual_svd<n>.feat.pkl``). ``mode`` is
        the file prefix of the evaluation split ("valid" or "test"). Both
        transformers are fitted on the training text only.
        """
        print("generate %s feat" % feat_name)
        ## tfidf
        tfv = getTFV(ngram_range=ngram_range)
        X_tfidf_train = tfv.fit_transform(text_train)
        X_tfidf_eval = tfv.transform(text_eval)
        _dump_pickle(X_tfidf_train, "%s/train.%s.feat.pkl" % (path, feat_name))
        _dump_pickle(X_tfidf_eval, "%s/%s.%s.feat.pkl" % (path, mode, feat_name))

        ## svd
        ## NOTE(review): no random_state is passed, so with a randomized solver
        ## the SVD features can differ between runs -- confirm whether a fixed
        ## seed is wanted.
        svd = TruncatedSVD(n_components=svd_n_components, n_iter=15)
        X_svd_train = svd.fit_transform(X_tfidf_train)
        X_svd_eval = svd.transform(X_tfidf_eval)
        _dump_pickle(X_svd_train, "%s/train.%s_individual_svd%d.feat.pkl" % (path, feat_name, svd_n_components))
        _dump_pickle(X_svd_eval, "%s/%s.%s_individual_svd%d.feat.pkl" % (path, mode, feat_name, svd_n_components))

    ###############
    ## Load Data ##
    ###############
    ## load data
    with open(config.processed_train_data_path, "rb") as f:
        dfTrain = cPickle.load(f)
    with open(config.processed_test_data_path, "rb") as f:
        dfTest = cPickle.load(f)
    ## load pre-defined stratified k-fold index
    with open("%s/stratifiedKFold.%s.pkl" % (config.data_folder, config.stratified_label), "rb") as f:
        skf = cPickle.load(f)

    #######################
    ## Generate Features ##
    #######################
    print("==================================================")
    print("Generate co-occurrence tfidf features...")

    ## get cooccurrence terms (adds the columns listed in column_names in place)
    extract_feat(dfTrain)
    extract_feat(dfTest)

    ######################
    ## Cross validation ##
    ######################
    print("For cross-validation...")
    for run in range(config.n_runs):
        ## use 33% for training and 67 % for validation
        ## so we switch trainInd and validInd
        for fold, (validInd, trainInd) in enumerate(skf[run]):
            print("Run: %d, Fold: %d" % (run+1, fold+1))
            path = "%s/Run%d/Fold%d" % (config.feat_folder, run+1, fold+1)

            for feat_name, column_name in zip(feat_names, column_names):
                _gen_tfidf_svd_feat(path, "valid", feat_name,
                                    dfTrain.iloc[trainInd][column_name],
                                    dfTrain.iloc[validInd][column_name])

    print("Done.")


    #################
    ## Re-training ##
    #################
    print("For training and testing...")
    path = "%s/All" % config.feat_folder
    for feat_name, column_name in zip(feat_names, column_names):
        _gen_tfidf_svd_feat(path, "test", feat_name,
                            dfTrain[column_name], dfTest[column_name])

    print("Done.")

    ## save feat names
    print("Feature names are stored in %s" % feat_name_file)
    ## the comprehension is evaluated before the in-place extension, so only
    ## the original tfidf names get an svd counterpart
    feat_names += [ "%s_individual_svd%d" % (name, svd_n_components) for name in feat_names ]
    dump_feat_name(feat_names, feat_name_file)

    print("All Done.")

In [None]:
# Stage every change, commit it with the message "add data" and push; the
# three git commands are handed to the shell as one newline-separated string.
# NOTE(review): relies on `os` having been imported by an earlier cell, and the
# status returned by os.system is not checked here.
cmd = '''git add -A
git commit -m"add data"
git push'''
os.system(cmd)