## import packages

In [2]:
from preprocess import *
from top_k_words import *
from performance import *

%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


## download data
Run `!python fetch_data.py` once to fetch the data files before executing the cells below.

## tokenize

In [3]:
# Paths of the three benchmark text files, named after their approximate size.
small_file_name = 'dataset/data_300MB.txt'
medium_file_name = 'dataset/data_2.5GB.txt'
large_file_name = 'dataset/data_16GB.txt'

# Stopwords are loaded through get_stopwords, which reaches this namespace via
# one of the star imports above (presumably preprocess -- verify).
stopwords_file = 'stopwords.txt'
stopwords = get_stopwords(stopwords_file)

# Number of top words requested from every approach in the analysis section.
k = 10

In [19]:
# Tokenize the 300MB file; the worker count (36) is hard-coded.
# NOTE(review): confirm 36 processes suits the machine this runs on.
small_file_tokens = tokenize_parallel_multiprocessing(small_file_name, stopwords, num_processes=36)

In [4]:
# Tokenize the 2.5GB file with the same hard-coded 36-process setting.
medium_file_tokens = tokenize_parallel_multiprocessing(medium_file_name, stopwords, num_processes=36)

In [5]:
# Tokenize the 16GB file with the multiprocessing tokenizer.
# NOTE: large_file_tokens is assigned again by the tokenize_stream call in a later cell.
large_file_tokens = tokenize_parallel_multiprocessing(large_file_name, stopwords, num_processes=36)

In [None]:
def tokenize_stream(file_name, stopwords):
    """Tokenize a text file one line at a time, dropping stopwords.

    Each line is stripped, lower-cased and split on single spaces. A word is
    kept only if it is purely alphabetic (``str.isalpha``) and is not in
    ``stopwords``. The file is read line by line, but every kept token is
    still accumulated in one in-memory list.

    Args:
        file_name: Path of the text file to read; decoded as UTF-8.
        stopwords: Iterable of words to drop. Words are lower-cased before
            the lookup, so entries are expected to be lower-case.

    Returns:
        list[str]: The kept tokens, in file order.
    """
    # Build the lookup set once: membership stays O(1) per token even when the
    # caller passes a list, and a one-shot iterable is not exhausted mid-file.
    stopword_set = frozenset(stopwords)
    tokens = []
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            words = line.strip().lower().split(' ')
            # isalpha() also rejects the empty strings produced by repeated spaces.
            tokens.extend(w for w in words if w.isalpha() and w not in stopword_set)
    return tokens

In [None]:
# Streaming alternative to the multiprocessing tokenizer for the 16GB file.
# tokenize_stream(file_name, stopwords) has no num_processes parameter, so the
# previous num_processes=36 keyword raised TypeError; it is dropped here.
large_file_tokens = tokenize_stream(large_file_name, stopwords)

In [20]:
# Report the token count of the 2.5GB file; the file name in the message is
# hard-coded rather than taken from medium_file_name.
print(f"tokens length of data_2.5GB.txt: {len(medium_file_tokens)}")

tokens length of data_2.5GB.txt: 194919696


In [None]:
# Report the token count of the 16GB file; the file name in the message is
# hard-coded rather than taken from large_file_name.
print(f"tokens length of data_16GB.txt: {len(large_file_tokens)}")

## analysis

### Approach 1: Sorting

#### A text file of size 300MB

In [4]:
# Top-k words of the 300MB token list via the sorting approach; the returned
# list is the cell output.
sorting(small_file_tokens, k)

['european',
 'mr',
 'would',
 'also',
 'must',
 'commission',
 'member',
 'like',
 'one',
 'parliament']

In [5]:
%%timeit
# Timed body: one full sorting pass over the 300MB token list.
sorting(small_file_tokens, k)

1.94 s ± 19.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Single-run timing of sorting on the 300MB token list.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
run_start_time = time.process_time()  # CPU time (system + user) of this process
sorting(small_file_tokens, k)
end_time = time.perf_counter()
run_end_time = time.process_time()

# "run time" below is the process CPU time, as opposed to elapsed wall time.
print(f"sorting small 300MB file - wall time takes {end_time - start_time} seconds")
print(f"sorting small 300MB file - run time takes {run_end_time - run_start_time} seconds")

sorting small 300MB file - wall time takes 2.015151023864746 seconds
sorting small 300MB file - run time takes 2.0158687519999994 seconds


In [6]:
%%memit
# Profiled body: memory use of one sorting pass over the 300MB token list
# (magic provided by the memory_profiler extension loaded in the first cell).
sorting(small_file_tokens, k)

peak memory: 2050.93 MiB, increment: 0.00 MiB


#### A text file of size 2.5GB

In [5]:
# Top-k words of the 2.5GB token list via the sorting approach.
sorting(medium_file_tokens, k)

['said', 'would', 'one', 'new', 'also', 'last', 'de', 'two', 'people', 'first']

In [6]:
%%timeit
# Timed body: one full sorting pass over the 2.5GB token list.
sorting(medium_file_tokens, k)

1min 40s ± 3.36 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Single-run timing of sorting on the 2.5GB token list.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
run_start_time = time.process_time()  # CPU time (system + user) of this process
sorting(medium_file_tokens, k)
end_time = time.perf_counter()
run_end_time = time.process_time()

# "run time" below is the process CPU time, as opposed to elapsed wall time.
print(f"sorting medium 2.5G file - wall time takes {end_time - start_time} seconds")
print(f"sorting medium 2.5G file - run time takes {run_end_time - run_start_time} seconds")

sorting medium 2.5G file - wall time takes 80.94257712364197 seconds
sorting medium 2.5G file - run time takes 70.25941499999999 seconds


In [7]:
%%memit
# Profiled body: memory use of one sorting pass over the 2.5GB token list.
sorting(medium_file_tokens, k)

peak memory: 2279.48 MiB, increment: -237.76 MiB


#### A text file of size 16GB

In [None]:
# Top-k words of the 16GB token list via the sorting approach.
sorting(large_file_tokens, k)

In [None]:
%%timeit
# Timed body: one full sorting pass over the 16GB token list.
sorting(large_file_tokens, k)

In [None]:
%%memit
# Profiled body: memory use of one sorting pass over the 16GB token list.
sorting(large_file_tokens, k)

### Approach 2: Max Heap (Priority Queue)

#### A text file of size 300MB

In [7]:
# Top-k words of the 300MB token list via the max-heap approach; the returned
# list is the cell output.
maxHeap(small_file_tokens, k)

['european',
 'mr',
 'would',
 'also',
 'must',
 'commission',
 'member',
 'like',
 'one',
 'parliament']

In [8]:
%%timeit
# Timed body: one full maxHeap pass over the 300MB token list.
maxHeap(small_file_tokens, k)

1.94 s ± 26.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Single-run timing of maxHeap on the 300MB token list.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
run_start_time = time.process_time()  # CPU time (system + user) of this process
maxHeap(small_file_tokens, k)
end_time = time.perf_counter()
run_end_time = time.process_time()

# "run time" below is the process CPU time, as opposed to elapsed wall time.
print(f"maxHeap small 300MB file - wall time takes {end_time - start_time} seconds")
print(f"maxHeap small 300MB file - run time takes {run_end_time - run_start_time} seconds")

maxHeap small 300MB file - wall time takes 2.484095811843872 seconds
maxHeap small 300MB file - run time takes 2.485721170999999 seconds


In [9]:
%%memit
# Profiled body: memory use of one maxHeap pass over the 300MB token list.
maxHeap(small_file_tokens, k)

peak memory: 2050.95 MiB, increment: 0.00 MiB


#### A text file of size 2.5GB

In [8]:
# Top-k words of the 2.5GB token list via the max-heap approach.
maxHeap(medium_file_tokens, k)

['said', 'would', 'one', 'new', 'also', 'last', 'de', 'two', 'people', 'first']

In [9]:
%%timeit
# Timed body: one full maxHeap pass over the 2.5GB token list.
maxHeap(medium_file_tokens, k)

1min 34s ± 11.5 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Single-run timing of maxHeap on the 2.5GB token list.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
run_start_time = time.process_time()  # CPU time (system + user) of this process
maxHeap(medium_file_tokens, k)
end_time = time.perf_counter()
run_end_time = time.process_time()

# "run time" below is the process CPU time, as opposed to elapsed wall time.
print(f"maxHeap medium 2.5G file - wall time takes {end_time - start_time} seconds")
print(f"maxHeap medium 2.5G file - run time takes {run_end_time - run_start_time} seconds")

maxHeap medium 2.5G file - wall time takes 98.83587622642517 seconds
maxHeap medium 2.5G file - run time takes 76.18272200000001 seconds


In [10]:
%%memit
# Profiled body: memory use of one maxHeap pass over the 2.5GB token list.
maxHeap(medium_file_tokens, k)

peak memory: 2670.27 MiB, increment: -43.81 MiB


#### A text file of size 16GB

In [None]:
# Top-k words of the 16GB token list via the max-heap approach.
maxHeap(large_file_tokens, k)

In [None]:
%%timeit
# Timed body: one full maxHeap pass over the 16GB token list.
maxHeap(large_file_tokens, k)

In [None]:
%%memit
# Profiled body: memory use of one maxHeap pass over the 16GB token list.
maxHeap(large_file_tokens, k)

### Approach 3: Bucket Sort

#### A text file of size 300MB

In [10]:
# Top-k words of the 300MB token list via the bucket-sort approach; the
# returned list is the cell output.
bucketSort(small_file_tokens, k)

['european',
 'mr',
 'would',
 'also',
 'must',
 'commission',
 'member',
 'like',
 'one',
 'parliament']

In [11]:
%%timeit
# Timed body: one full bucketSort pass over the 300MB token list.
bucketSort(small_file_tokens, k)

2.03 s ± 23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Single-run timing of bucketSort on the 300MB token list.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
run_start_time = time.process_time()  # CPU time (system + user) of this process
bucketSort(small_file_tokens, k)
end_time = time.perf_counter()
run_end_time = time.process_time()

# "run time" below is the process CPU time, as opposed to elapsed wall time.
print(f"bucketSort small 300MB file - wall time takes {end_time - start_time} seconds")
print(f"bucketSort small 300MB file - run time takes {run_end_time - run_start_time} seconds")

bucketSort small 300MB file - wall time takes 3.093019723892212 seconds
bucketSort small 300MB file - run time takes 3.095347041 seconds


In [12]:
%%memit
# Profiled body: memory use of one bucketSort pass over the 300MB token list.
bucketSort(small_file_tokens, k)

peak memory: 2050.98 MiB, increment: 0.00 MiB


#### A text file of size 2.5GB

In [11]:
# Top-k words of the 2.5GB token list via the bucket-sort approach.
bucketSort(medium_file_tokens, k)

['said', 'would', 'one', 'new', 'also', 'last', 'de', 'two', 'people', 'first']

In [16]:
%%timeit
# Timed body: one full bucketSort pass over the 2.5GB token list.
bucketSort(medium_file_tokens, k)

1min 38s ± 10.1 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Single-run timing of bucketSort on the 2.5GB token list.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
run_start_time = time.process_time()  # CPU time (system + user) of this process
bucketSort(medium_file_tokens, k)
end_time = time.perf_counter()
run_end_time = time.process_time()

# "run time" below is the process CPU time, as opposed to elapsed wall time.
print(f"bucketSort medium 2.5G file - wall time takes {end_time - start_time} seconds")
print(f"bucketSort medium 2.5G file - run time takes {run_end_time - run_start_time} seconds")

bucketSort medium 2.5G file - wall time takes 383.11965799331665 seconds
bucketSort medium 2.5G file - run time takes 259.4682889999999 seconds


In [13]:
%%memit
# Profiled body: memory use of one bucketSort pass over the 2.5GB token list.
bucketSort(medium_file_tokens, k)

#### A text file of size 16GB

In [None]:
# Top-k words of the 16GB token list via the bucket-sort approach.
bucketSort(large_file_tokens, k)

In [None]:
%%timeit
# Timed body: one full bucketSort pass over the 16GB token list.
bucketSort(large_file_tokens, k)

In [None]:
%%memit
# Profiled body: memory use of one bucketSort pass over the 16GB token list.
bucketSort(large_file_tokens, k)

## rerun and plot performances on W&B
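The same measurements can be rerun as a script with `!python performance.py`; the cells below do it step by step from the notebook.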

### log in to W&B

In [14]:
# Start a W&B run for the 300MB data set, passing its tokens and file name.
# NOTE(review): the helper is spelled `init_wanb_run` (not "wandb"); it must
# match the definition supplied by the star imports (presumably performance).
init_wanb_run("data engineering", small_file_tokens, small_file_name)

[34m[1mwandb[0m: Currently logged in as: [33melenaliao[0m ([33mdeee[0m). Use [1m`wandb login --relogin`[0m to force relogin


### record performances

In [15]:
# Fix the seed of Python's random module before the performance helpers run.
# NOTE(review): `random` is not imported explicitly in this notebook; it
# presumably arrives through one of the star imports -- verify.
random.seed(1995)

In [16]:
# Record bucketSort performance for the 300MB token list
# (helper comes from the star imports; presumably performance -- verify).
bucketSort_performance(small_file_tokens, k)

In [17]:
# Record maxHeap performance for the 300MB token list.
maxHeap_performance(small_file_tokens, k)

In [18]:
# Record sorting performance for the 300MB token list.
sorting_performance(small_file_tokens, k)

In [None]:
# Close the current W&B run; later logging needs a new run to be started first.
wandb.finish()

In [None]:
# Record bucketSort performance for the 16GB data set. The helper receives a
# token list for the 300MB case, so pass the tokens here too instead of the
# file path string (large_file_name).
# NOTE(review): wandb.finish() was called in an earlier cell -- if this helper
# logs to W&B, start a new run for the 16GB data before running it.
bucketSort_performance(large_file_tokens, k)

In [None]:
# Record maxHeap performance for the 16GB data set. The helper receives a
# token list for the 300MB case, so pass the tokens here too instead of the
# file path string (large_file_name).
# NOTE(review): wandb.finish() was called in an earlier cell -- if this helper
# logs to W&B, start a new run for the 16GB data before running it.
maxHeap_performance(large_file_tokens, k)

In [None]:
# Record sorting performance for the 16GB data set. The helper receives a
# token list for the 300MB case, so pass the tokens here too instead of the
# file path string (large_file_name).
# NOTE(review): wandb.finish() was called in an earlier cell -- if this helper
# logs to W&B, start a new run for the 16GB data before running it.
sorting_performance(large_file_tokens, k)