In [1]:
import fasttext
import numpy as np
import json
import pandas as pd
import matplotlib as plt
from nltk.stem import PorterStemmer
import os.path
import time
from IPython.display import clear_output
import gc
import csv

### Data preprocessing
In this step we convert the compressed JSON file of a given year into a CSV file that keeps only the columns we need.

In [2]:
def process_json_chunk(chunk, csv_file_name):
    """ 
    Explanation:
    This function takes a chunk of data, keeps only the wanted columns,
    converts the non-text ones to strings and appends the rows to a csv file
    (';' delimited, UTF8 encoded).

    The chunk passed in is NOT modified: the string version of the date
    column is kept in a local variable instead of being written back.
    
    INPUT:
        chunk:                  A dataframe chunk with the columns 'quoteID',
                                'quotation', 'speaker', 'date' and
                                'numOccurrences'
        csv_file_name:          Filepath to the resulting csv file; rows are
                                appended, the file is created if missing

    RAISES:
        KeyError:               If one of the five columns is missing
        
    """
    print(f'Processing chunk with {len(chunk)} rows')

    # Changing the dtype of the date from Timestamp to string, without
    # overwriting the column in the caller's dataframe
    date_strings = chunk['date'].astype("str")

    # Walk the five columns in lockstep rather than with iterrows(), which
    # builds a full Series for every row and is slow on large chunks
    csv_lines = zip(chunk['quoteID'],
                    chunk['quotation'],
                    chunk['speaker'],
                    date_strings,
                    map(str, chunk['numOccurrences']))

    # Appending all the rows of this chunk in one go
    with open(csv_file_name, 'a', encoding='UTF8', newline='') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerows(csv_lines)

def compressed_json_to_csv(json_file_path, csv_file_path, chunksize=1000000):
    """ 
    Explanation:
    This function reads a bz2 compressed, line-delimited json file chunk by
    chunk and hands every chunk to process_json_chunk, which appends the
    selected columns to the csv file at csv_file_path.

    The header row is only written when csv_file_path does not exist yet.
    If the file already exists the new rows are appended to it, so calling
    this function twice with the same paths duplicates the data.
    
    INPUT:
        json_file_path:         Filepath to the compressed json file
        csv_file_path:          Filepath to the resulting csv file
        chunksize:              Number of json lines read per chunk
                                (default 1000000)
        
    """
    
    # Write the header only if the csv file does not exist already; the
    # with-block guarantees the handle is closed even if the write fails
    if not os.path.isfile(csv_file_path):
        with open(csv_file_path, 'w', encoding='UTF8', newline='') as f:
            writer = csv.writer(f, delimiter=';')
            writer.writerow(['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences'])
    
    # Will chunkwise read in lines from the compressed json file
    with pd.read_json(json_file_path, lines=True, compression='bz2', chunksize=chunksize) as df_reader:
        for chunk in df_reader:
            process_json_chunk(chunk, csv_file_path)

In [3]:
def stem_string_to_list(string):
    """ 
    EXPLANATION:
    Splits a string on whitespace and stems every resulting word with a
    PorterStemmer.
    
    INPUT:
        string:    string of whitespace separated words
    
    OUTPUT:
        return:    list with the stem of each word, in the original order
    """
    stemmer = PorterStemmer()
    stemmed_words = []
    for token in string.split():
        stemmed_words.append(stemmer.stem(token))
    return stemmed_words

def stem_list_to_list(arr):
    """ 
    EXPLANATION:
    Stems every item of a list with a PorterStemmer. Each item is handed to
    the stemmer as-is; an item is not split into separate words first.
    
    INPUT:
        arr:       list of strings to stem
    
    OUTPUT:
        return:    list with the stemmed version of each item, in the
                   original order
    """
    
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, arr))

In [9]:
# Keyword lists used to decide which quotes are kept.
# NOTE(review): quotes are split on whitespace before they are compared with
# these lists (see stem_string_to_list / process_csv_chunk), so an entry that
# contains a space, e.g. "tim cook", cannot be equal to any single token.
# Confirm whether phrase matching was intended.

# A quote containing one of these is always kept
add_list = [
    "apple inc",
    "tim cook",
    "steve jobs",
    "iphone",
    "ipad",
    "imac",
    "apple watch",
    "macbook",
    "macbook pro",
    "aapl",
    "mac mini",
]

# A quote containing one of these is kept unless it also contains an
# entry of exclude_list
check_list = ["apple"]

# Entries that veto a check_list match
exclude_list = [
    "fruit",
    "banana",
    "orange",
    "strawberries",
    "watermelon",
    "lemons",
]

In [11]:
def process_csv_chunk(chunk, stem_add_list, stem_check_list, stem_exclude_list, filename):
    """ 
    EXPLANATION:
    This function will filter a chunk of data, based on the stemmed lists, 
    and append the rows that pass the filter to the filename filepath.

    A row is kept when the stemmed words of its quotation
        - contain at least one entry of stem_add_list, or
        - contain at least one entry of stem_check_list and no entry of
          stem_exclude_list.
    Rows whose quotation is not a string (e.g. the NaN pandas produces for
    an empty field) cannot be stemmed and are skipped instead of crashing.

    NOTE(review): the quotation is compared word by word, so a list entry
    containing a space can never match - confirm whether phrase matching
    was intended.
    
    INPUT:
        chunk:              Chunk of data in a dataframe format, must have a
                            'quotation' column
        stem_add_list:      Stemmed entries that will NOT be filtered out
        stem_check_list:    Stemmed entries that will MAYBE not be filtered out
        stem_exclude_list:  Stemmed entries that veto a stem_check_list match
        filename:           Filepath of where the filtered data will be written to
    """
    
    print(f'Processing chunk with {len(chunk)} rows')
    csv_lines = []
    
    # Iterates over the chunk - row by row. itertuples() yields plain tuples
    # of the row values and is much cheaper than iterrows()
    row_values = chunk.itertuples(index=False, name=None)
    for quotation, values in zip(chunk['quotation'], row_values):
        # Missing quotations come in as float NaN and have no .split()
        if not isinstance(quotation, str):
            continue

        stemmed_quote_set = frozenset(stem_string_to_list(quotation))

        #========================================================
        # Starting to check if quote contains any must have words
        if not stemmed_quote_set.isdisjoint(stem_add_list):
            csv_lines.append(values)

        #========================================================
        # Starting to check if quote contains any could have words
        elif (not stemmed_quote_set.isdisjoint(stem_check_list)) and stemmed_quote_set.isdisjoint(stem_exclude_list):
            csv_lines.append(values)
    
    # Writing all the csv_lines that were not filtered out to the csv file
    with open(filename, 'a', encoding='UTF8', newline='') as f: 
        writer = csv.writer(f, delimiter=';')
        writer.writerows(csv_lines)


In [6]:
def filter_out_apple_from_csv(csv_file_path, filtered_csv_file_path, add_list, check_list, exclude_list, chunksize=200000):
    """ 
    EXPLANATION:
    This function will filter out data from a csv_file_path, and write the filtered data to 
    a new filtered_csv_file_path. The three word lists are stemmed here and
    the input is then processed chunk by chunk with process_csv_chunk.

    The header row is only written when filtered_csv_file_path does not
    exist yet; if it exists the filtered rows are appended to it.
    
    INPUT:
        csv_file_path:             Filepath to where the UNFILTERED data is located
        filtered_csv_file_path:    Filepath to where the FILTERED data will be written
        add_list:                  Unstemmed entries that will NOT be filtered out
        check_list:                Unstemmed entries that will MAYBE not be filtered out
        exclude_list:              Unstemmed entries that veto a check_list match
        chunksize:                 Number of csv rows read per chunk (default 200000)

    RAISES:
        FileNotFoundError:         If csv_file_path is not an existing file
    """
    
    # Validate the input before touching the output, so that a wrong path
    # does not leave a header-only file behind
    if not os.path.isfile(csv_file_path):
        raise FileNotFoundError("You will need to input valid csv_file_path!")
    
    # Creates the output file with its header if it does not exist; same
    # encoding / newline settings as the rows appended by process_csv_chunk
    if not os.path.isfile(filtered_csv_file_path):
        with open(filtered_csv_file_path, 'w', encoding='UTF8', newline='') as f:
            writer = csv.writer(f, delimiter=';')
            writer.writerow(['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences'])
    
    # Stemming the lists we will use to filter; sets give fast membership tests
    stem_add_set = frozenset(stem_list_to_list(add_list))
    stem_check_set = frozenset(stem_list_to_list(check_list))
    stem_exclude_set = frozenset(stem_list_to_list(exclude_list))
    
    # Start the iterative processing of the csv files, passing the sets
    # built above (not names that only exist as leftover kernel globals)
    for chunk in pd.read_csv(csv_file_path, chunksize=chunksize, delimiter=';'):
        process_csv_chunk(chunk, stem_add_set, stem_check_set, stem_exclude_set, filtered_csv_file_path)
    

In [7]:
# Convert the compressed 2020 quotes into a ';' delimited csv file
compressed_json_to_csv(
    json_file_path="data/quotes-2020.json.bz2",
    csv_file_path="data/quotes-2020-first-filter.csv",
)

Processing chunk with 1000000 rows
Processing chunk with 1000000 rows
Processing chunk with 1000000 rows
Processing chunk with 1000000 rows
Processing chunk with 1000000 rows
Processing chunk with 244449 rows


In [None]:
# Keep only the quotes of the 2020 csv that pass the keyword filter
filter_out_apple_from_csv(
    csv_file_path="data/quotes-2020-first-filter.csv",
    filtered_csv_file_path="data/quotes-2020-apple-filter_2.csv",
    add_list=add_list,
    check_list=check_list,
    exclude_list=exclude_list,
)

Processing chunk with 200000 rows
Processing chunk with 200000 rows
Processing chunk with 200000 rows
Processing chunk with 200000 rows
