In [None]:
!mkdir data
!mkdir data/in
!mkdir data/out

In [None]:
# Pin pandas to 1.0.5 for this notebook.
# NOTE(review): pip reports that google-colab requires pandas~=1.1.0, so this
# pin conflicts with the Colab runtime — confirm that 1.0.5 is really required.
!pip install pandas==1.0.5

Collecting pandas==1.0.5
  Downloading pandas-1.0.5-cp37-cp37m-manylinux1_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 27.4 MB/s 
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.1.5
    Uninstalling pandas-1.1.5:
      Successfully uninstalled pandas-1.1.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas~=1.1.0; python_version >= "3.0", but you have pandas 1.0.5 which is incompatible.[0m
Successfully installed pandas-1.0.5


Name: pandas
Version: 1.0.5
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: /usr/local/lib/python3.7/dist-packages
Requires: python-dateutil, numpy, pytz
Required-by: xarray, vega-datasets, statsmodels, sklearn-pandas, seaborn, pymc3, plotnine, pandas-profiling, pandas-gbq, pandas-datareader, mlxtend, mizani, holoviews, gspread-dataframe, google-colab, fix-yahoo-finance, fbprophet, fastai, cufflinks, cmdstanpy, arviz, altair


In [None]:
# Print the metadata of the installed pandas package to check which version is active.
!pip show pandas

In [None]:
# Mount Google Drive at /content/drive/ so that paths below it (such as
# DATA_OUT) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Local folder where the yearly Quotebank archives are downloaded before filtering.
DATA_IN = 'data/in'
# Folder on the mounted Google Drive that receives the filtered per-year files.
DATA_OUT = '/content/drive/MyDrive/quotebank_filtered'

In [None]:
import nltk

# Fetch the NLTK resources used by this notebook.
# NOTE(review): 'stopwords' is not referenced in the cells visible here —
# confirm it is still needed.
nltk.download('stopwords')
# 'punkt' provides the tokenizer models that word_tokenize relies on.
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

def prepare_tokens(tokens, stemmer):
  """Lazily normalise tokens: lower-case each one, then stem it.

  Args:
    tokens: iterable of string tokens.
    stemmer: object exposing a `stem(word)` method. It is always supplied by
      the caller (never shared globally) to avoid any multithreading issues.

  Returns:
    A lazy iterator (map object) yielding the stemmed, lower-cased tokens in
    input order.
  """
  stem = stemmer.stem
  return map(lambda token: stem(token.lower()), tokens)

def is_topic_related(text, term_set_to_look_for, stemmer):
  """Tell whether `text` contains at least one of the searched terms.

  The text is tokenised with `word_tokenize`, every token is normalised by
  `prepare_tokens`, and the resulting stems are intersected with
  `term_set_to_look_for`.

  Args:
    text: string to inspect.
    term_set_to_look_for: set of already normalised (lower-cased, stemmed) terms.
    stemmer: stemmer instance handed on to `prepare_tokens`.

  Returns:
    True if any stem of `text` is in `term_set_to_look_for`, False otherwise.
  """
  stems_in_text = set(prepare_tokens(word_tokenize(text), stemmer))
  shared_stems = stems_in_text & term_set_to_look_for
  return True if shared_stems else False

class ChunkProcessingFilter:
  """Callable that keeps the rows of a chunk whose quotation mentions a term.

  The search terms are normalised once at construction time; calling the
  instance with a DataFrame chunk returns the matching rows. An instance only
  stores a set of stems, and `filter_stream_by_terms` hands it to worker
  processes.
  """

  def __init__(self, *terms_to_look_for):
    """Normalise (lower-case + stem) the terms and store them as a set.

    Args:
      *terms_to_look_for: the raw search terms, e.g. 'vegan', 'vegetarian'.
    """
    self.set_to_look_for = set(
        prepare_tokens(terms_to_look_for, PorterStemmer()))

  def __call__(self, chunk):
    """Return the rows of `chunk` whose `quotation` contains a searched term.

    Args:
      chunk: DataFrame with a `quotation` column of strings.

    Returns:
      The sub-frame of `chunk` (same columns, original index labels) holding
      only the matching rows.
    """
    # A fresh stemmer per call: the stemmer is never shared between calls.
    stemmer = PorterStemmer()
    # Work on the quotation column directly instead of a row-wise
    # DataFrame.apply(axis=1), which builds a Series for every row only to
    # read a single field from it.
    mask = chunk['quotation'].map(
        lambda quotation: is_topic_related(
            quotation, self.set_to_look_for, stemmer)
    # Force a boolean dtype so an empty chunk is still treated as a boolean
    # mask rather than as a list of column labels.
    ).astype(bool)
    return chunk[mask]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import pandas as pd
import itertools
from functools import reduce
from multiprocessing import Pool

def filter_stream_by_terms(in_file, terms, chunksize, poolworkers=2):
  """Filter a bz2-compressed JSON-lines file, keeping rows that mention a term.

  The file is read in chunks of `chunksize` rows; each chunk is filtered by a
  `ChunkProcessingFilter` in a pool of worker processes, and the surviving
  rows are concatenated into one DataFrame.

  Args:
    in_file: path to the `.json.bz2` file (one JSON record per line).
    terms: iterable of search terms passed to `ChunkProcessingFilter`.
    chunksize: number of rows per chunk read from the file.
    poolworkers: number of worker processes.

  Returns:
    DataFrame with the matching rows. Chunks are collected as the workers
    finish (`imap_unordered`), so the chunk order in the result is not
    guaranteed to follow the file order. An empty DataFrame is returned when
    the file yields no chunks.
  """
  processing_filter = ChunkProcessingFilter(*terms)
  # Honour the caller's chunk size (it used to be hard-coded to 200000 here).
  df_reader = pd.read_json(
      in_file, lines=True, compression='bz2', chunksize=chunksize)
  try:
    with Pool(poolworkers) as pool:
      # Materialise the results while the pool is still alive.
      filtered_chunks = list(
          pool.imap_unordered(processing_filter, df_reader))
  finally:
    df_reader.close()

  # pd.concat rejects an empty list, so handle the "no chunks" case explicitly.
  if not filtered_chunks:
    return pd.DataFrame()
  # A single concat avoids re-copying the accumulated frame for every chunk.
  return pd.concat(filtered_chunks)

In [None]:
# Zenodo download link of one yearly Quotebank file; only the year differs.
QUOTEBANK_URL_TEMPLATE = (
    'https://zenodo.org/record/4277311/files/quotes-{year}.json.bz2?download=1')

# Years to process, newest first. The 2020 file (same URL pattern) is
# deliberately left out; add 2020 here to include it.
QUOTEBANK_YEARS = (2019, 2018, 2017, 2016, 2015)

# Maps each year to the download URL of its Quotebank file.
quotebank_files_of_years = {
    year: QUOTEBANK_URL_TEMPLATE.format(year=year)
    for year in QUOTEBANK_YEARS
}

In [None]:
import os
from google.colab import files

def download_file(url, out_file):
  """ Downloads file from the given url to `out_file` using wget.

  Args:
    url: address of the file to download.
    out_file: local path the download is written to (overwritten if present).

  Raises:
    subprocess.CalledProcessError: if wget exits with a non-zero status,
      i.e. the download failed.
    FileNotFoundError: if the wget executable is not available.
  """
  import subprocess  # local import keeps this definition self-contained

  # Pass the arguments as a list instead of a shell string: characters such as
  # '?' and '&' in the URL are then never interpreted by a shell. check=True
  # turns a failed download into an error instead of silently continuing with
  # a missing or truncated file.
  subprocess.run(['wget', url, '-O', out_file], check=True)

# Terms whose (stemmed) presence marks a quotation as relevant.
TERMS = ['vegan', 'vegetarian', 'veget', 'plant-based']
# Number of rows read from the compressed file per processing chunk.
CHUNKSIZE = 200000

# Make sure the destination folder exists before any long-running work starts;
# otherwise the save step would fail only after a full download + filter pass.
os.makedirs(DATA_OUT, exist_ok=True)

for year, file_url in quotebank_files_of_years.items():
  # Specify download path
  file_path = f'{DATA_IN}/quotebank_{year}.json.bz2'
  print(f'Processing Quotebank for year {year}')
  # Download file
  print(f'\tDownloading file: {file_url}')
  download_file(file_url, file_path)
  print(f'\tThe file downloaded to {file_path}')
  # Filter the quotebank of the given year
  print('\tProcessing the file...')
  filtered_data = filter_stream_by_terms(file_path, TERMS, CHUNKSIZE)
  # Save the filtered dataframe to a new file
  out_file = f'{DATA_OUT}/quotebank_filtered_{year}.json.bz2'
  print(f'\tSaving the result to {out_file}')
  filtered_data.to_json(out_file, 
                        compression='bz2', 
                        orient='records', 
                        lines=True)
  # Download the filtered file from Google Colab
  # files.download(out_file) 
  # Remove the original file of the quotebank for the given year
  # (frees local disk space before the next year is downloaded)
  print(f'\tDeleting {file_path}...')
  os.remove(file_path)

Processing Quotebank for year 2019
	Downloading file: https://zenodo.org/record/4277311/files/quotes-2019.json.bz2?download=1
	The file downloaded to data/in/quotebank_2019.json.bz2
	Processing the file...
	Saving the result to /content/drive/MyDrive/quotebank_filtered/quotebank_filtered_2019.json.bz2
	Deleting data/in/quotebank_2019.json.bz2...
Processing Quotebank for year 2018
	Downloading file: https://zenodo.org/record/4277311/files/quotes-2018.json.bz2?download=1
	The file downloaded to data/in/quotebank_2018.json.bz2
	Processing the file...
