In [1]:
import pandas as pd
import numpy as np
import re

### Use module to load data

In [2]:
# Append path to custom code
import sys
sys.path.append("../python/")

# Load custom code
import custom as preprocess

# Connect to database
dk = preprocess.DatabaseKick()

cur = dk.connect()

In [3]:
cur.execute("SELECT id, concat_ws(name, blurb) FROM info")
rows = cur.fetchall()

df = pd.DataFrame(rows, columns=["id", "document"])
df.head()

Unnamed: 0,id,document
0,1312331512,We have a fully developed 2D animated series t...
1,80827270,A sci-fi fantasy 2.5D anime styled series abou...
2,737219121,"A film created entirely out of paper, visual e..."
3,1946566454,A Tale of Faith is an animated short film base...
4,591797827,Honeybee is a cartoon about a girl who can tal...


In [4]:
dk.disconnect()

### Preprocess all documents

In [5]:
# Original function
text_processing = preprocess.text_processing

# Decorator function (where the input & output are functions)
def join_output(func):
    def func_wrapper(text, *arg, **karg):
        return ' '.join(func(text, *arg, **karg))
    return func_wrapper

# Decorate function
text_processing = join_output(text_processing)

# Use function
text_processing(df['document'][0], method="stem")

'fulli develop anim seri requir profession anim first homeanim ep onlin'

In [6]:
%%time

features = df['document'].apply(lambda x: text_processing(x, method="stem"))

CPU times: user 8min 38s, sys: 51.7 s, total: 9min 30s
Wall time: 9min 38s


In [7]:
# for text in document in the 'df' dataframe
# replace text newlines (\n, \r) with whitespace
# save to 'document_file'

document_file = "../../data/documents/doc.txt"

def doc_to_string(doc):
    if doc == None:
        return("")
    else:
        return(re.sub("[\n\r]", "", doc))

df['document'] = df['document'].apply(lambda x: doc_to_string(x))

df.to_csv(document_file, sep="\t", header=False, index=False)

In [8]:
%%time

# for each line in 'document_file'
# update the document part with the preprocessed text
# save to 'processed_file'

processed_file = "../../data/documents/doc_proc2.txt"
with open(processed_file, 'w') as out:
    with open(document_file) as f:
        for line in f:
            try:
                doc_id, doc = line.split("\t", 1)
                doc = text_processing(doc, method="stem")

                out.write('\t'.join([doc_id, doc + "\n"]))
            except:
                print(line)
                break

CPU times: user 8min 31s, sys: 50.2 s, total: 9min 21s
Wall time: 9min 26s


### Make count matrix

- http://stackoverflow.com/questions/30640970/how-to-make-feature-vector-from-the-lists
- http://scikit-learn.org/stable/modules/feature_extraction.html

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

In [10]:
mat = count_vect.fit_transform(features)
mat.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
type(mat)

scipy.sparse.csr.csr_matrix

### Save data
- http://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format

In [33]:
import numpy as np
from scipy.sparse import csr_matrix

def save_sparse_csr(filename, array):
    # note that .npz extension is added automatically
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    # here we need to add .npz extension manually
    loader = np.load(filename + '.npz')
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

In [34]:
wc_file = "../../data/documents/word_counts.txt"
save_sparse_csr(wc_file, mat)

In [40]:
mat_reload = load_sparse_csr(wc_file)
mat_reload.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])