In [2]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc='apply')
from scipy import sparse
import dill
import re
import os
import sys

from sklearn import base

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [5]:
def _get_df(file, **kwargs):
    """Load one feather file and discard rows without a usable ``posted_date``.

    Parameters
    ----------
    file : str
        Path handed to ``pd.read_feather``.
    **kwargs
        Forwarded unchanged to ``pd.read_feather`` (for example ``columns``).

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``_within_range`` has filtered it and rows
        with a missing ``posted_date`` have been removed.
    """
    frame = pd.read_feather(file, **kwargs)
    # Both steps below mutate ``frame`` in place; neither returns a new frame.
    _within_range(frame)
    frame.dropna(subset=['posted_date'], inplace=True)
    return frame

def _within_range(df):
    start = pd.datetime(2017, 1, 1)
    end = pd.datetime(2018, 7, 1)
    truth = ~df['posted_date'].isin(pd.date_range(start, end))
    df.drop(df[truth].index, inplace=True)

class TextPreProcess(base.BaseEstimator, base.TransformerMixin):
    """
    Input  : document list (any object whose ``apply`` maps a function over
             its string elements; presumably a pandas Series -- verify
             against callers)
    Purpose: preprocess text (lower-case, regex substitution, tokenize,
             remove English stop words, lemmatize)
    Output : preprocessed text, one space-joined string per document
    """

    def __init__(self, ignore):
        # scikit-learn's get_params()/clone()/repr() read every constructor
        # argument back from an attribute of the same name, so the mapping
        # must be stored as ``self.ignore``.
        self.ignore = ignore
        self.en_stop = set(stopwords.words('english'))  # English stop words list
        # Matches runs of at least two lowercase letters, optionally with a
        # single '&' inside; anything else acts as a separator.
        self.tokenizer = RegexpTokenizer(r'[a-z]+&?[a-z]+')
        self.lemmatizer = WordNetLemmatizer()

    @property
    def replace(self):
        """Backward-compatible alias for ``ignore`` (pattern -> replacement)."""
        return self.ignore

    @replace.setter
    def replace(self, value):
        self.ignore = value

    def _process(self, text):
        """Clean a single document and return it as one space-joined string."""
        # Lower-casing happens first, so the substitution patterns are
        # applied to lower-case text.
        raw = text.lower()
        for pattern, substitute in self.ignore.items():
            raw = re.sub(pattern, substitute, raw)
        tokens = self.tokenizer.tokenize(raw)
        kept = [token for token in tokens if token not in self.en_stop]
        return ' '.join(self.lemmatizer.lemmatize(token) for token in kept)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Apply ``_process`` to every element of ``X`` via ``X.apply``."""
        return X.apply(self._process)
    
def clean_titles(files, ignore_dict, columns=['title', 'region', 'posted_date'],
                 out_dir='cleaned_cache'):
    """Clean the ``title`` column of each input file and write the result out.

    Each file is loaded with ``_get_df``, its ``title`` column is passed
    through ``TextPreProcess``, and the frame is written to
    ``<out_dir>/data_<i>.feather`` where ``i`` is the file's position in
    ``files``.

    Parameters
    ----------
    files : iterable of str
        Paths of the feather files to process, in output-numbering order.
    ignore_dict : dict
        Mapping handed to ``TextPreProcess``.
    columns : list of str
        Columns to load from each file.
    out_dir : str
        Directory receiving the cleaned files; created if it does not exist.

    Returns
    -------
    None
    """
    tpp = TextPreProcess(ignore_dict)

    # Writing used to fail when the output directory was missing.
    os.makedirs(out_dir, exist_ok=True)

    for i, file in enumerate(tqdm(files, desc='clean titles')):
        # Pass a copy so the shared default list can never be mutated downstream.
        df = _get_df(file, columns=list(columns))
        df['title'] = tpp.fit_transform(df['title'])
        out_path = os.path.join(out_dir, 'data_{}.feather'.format(i))
        # Feather needs a default index, hence the reset after rows were dropped.
        df.reset_index(drop=True).to_feather(out_path)
    return None

In [6]:
%%time

folder = 'raw_cache'
files = [os.path.join(folder, file) for file in os.listdir(folder) if file.endswith('.feather')]

ignore = pd.read_feather('other_data/ignore.feather')
ignore_dict = ignore.set_index('regex').to_dict()['sub']

clean_titles(files, ignore_dict=ignore_dict)

HBox(children=(IntProgress(value=0, description='clean titles', max=41, style=ProgressStyle(description_width=…




## Downsample data sets

In [15]:
def downsample(files, fraction=20, random_state=None):
    """Read every feather file and keep a random subset of its rows.

    Parameters
    ----------
    files : iterable of str
        Paths of the feather files to read.
    fraction : int
        Divisor, not a proportion: ``len(frame) // fraction`` rows are kept
        from each file, so the default of 20 keeps roughly 5%.
    random_state : int, numpy.random.RandomState or None
        Seed or generator for reproducible sampling. ``None`` (the default)
        leaves seeding to pandas, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The per-file samples concatenated under a fresh 0..n-1 index.
    """
    if random_state is None:
        rng = None
    else:
        # Local import: numpy is not among this notebook's top-level imports.
        import numpy as np
        # One generator shared by all files, so a single seed does not pick
        # the same row positions out of every file.
        if isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)

    samples = []
    for file in tqdm(files, desc='feathers'):
        part = pd.read_feather(file)
        samples.append(part.sample(len(part) // fraction, random_state=rng))
    return pd.concat(samples, ignore_index=True)

In [10]:
%%time

folder = 'cleaned_cache'
files = [os.path.join(folder, 'data_{}.feather'.format(i)) for i in range(len(os.listdir(folder)))]

df = downsample(files)

HBox(children=(IntProgress(value=0, description='feathers', max=41, style=ProgressStyle(description_width='ini…


CPU times: user 26.9 s, sys: 13.2 s, total: 40.2 s
Wall time: 1min 38s
