In [None]:
from pathlib import Path

import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client

In [None]:
# Start a Dask distributed client asking for six workers. No scheduler address
# is passed, so this presumably spins up a local cluster on this machine
# (TODO confirm the worker count suits the host's core count).
client = Client(n_workers=6)
# Bare expression: renders the client summary as the cell output.
client


 
 <div class="alert alert-block alert-warning">
For this project we decided to use <a href="https://www.dask.org/">DASK</a>
 to carry out a big chunk of the computations <br>
Several reasons motivated us to use DASK (instead of PANDAS for example): <br>
<p>
 - DASK is optimized to work with very big files: all the computations are carried out lazily and the whole file is never loaded into memory at once. The computations are only executed when a result is actually needed, after the computation graph has been optimized </p>
 
 <p>
 - DASK makes it very easy to create a local cluster with several worker processes on a single machine to make the computationally expensive operations faster (we therefore don't need to deploy our data to something like AWS)
  </p>
  
  <p>
 - DASK gives us access to a great API which makes it very easy to identify bottlenecks and optimize our computations
   </p>
    
 </div>

## Pre-Processing Youtube Metadata

<div class="alert alert-block alert-info">
    We take the metadata, which is in JSON format, and transform it to Parquet so that it can be loaded in DASK <br>
    <b>We decided to use Parquet since it is a columnar data-storage format which also uses compression, which allows us to get smaller files and faster queries </b> <br>
    In order to open the document we use the chunksize parameter, which allows us to load the file in chunks. <br>
We then save each one of the chunks as a Parquet file <br>
</div>




In [None]:
# Convert the gzipped JSON-lines video metadata into one Parquet file per chunk.
metadata_json_path = Path("./Data/video_metadata/yt_metadata_en.jsonl.gz")
metadata_parquet_dir = Path("./Data/video_metadata/parquet")

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before the first chunk is written.
metadata_parquet_dir.mkdir(parents=True, exist_ok=True)

# With `chunksize`, read_json returns a reader that yields DataFrames of up to
# 100_000 lines each instead of loading the whole file. Using it as a context
# manager closes the underlying gzip handle even if a write fails midway.
# NOTE(review): dtypes are inferred per chunk, so individual Parquet files may
# end up with slightly different schemas — verify when loading them with Dask.
with pd.read_json(
    metadata_json_path,
    compression="gzip",
    lines=True,
    chunksize=100_000,
) as metadata_reader:
    for i, chunk in enumerate(metadata_reader):
        chunk.to_parquet(metadata_parquet_dir / f"{i}.parquet")
        # Overwrite the same output line with the index of the last chunk written.
        print(i, end="\r")

## Comment Metadata

<div class="alert alert-block alert-info">
The comment metadata was already available in TSV format. <br>
We therefore simply read it using dask and rewrote it in Parquet to make it easier and more efficient to query the data in the future <br>
</div>

In [None]:
# Open the tab-separated YouTube comment dump as a Dask dataframe.
df = dd.read_csv(
    "./Data/comment_data/youtube_comments.tsv",
    sep="\t",
)

# One-off Parquet export, kept disabled (presumably already run once — confirm):
# df.to_parquet('./Data/comment_data/parquet/')

In [None]:
# Preview the first rows of the comments dataframe read in the previous cell.
df.head()

## Time Series Metadata

<div class="alert alert-block alert-info">
The time series metadata was already available in TSV format. <br>
We therefore simply read it using dask and rewrote it in Parquet to make it easier and more efficient to query the data in the future (same as for comments) <br>
</div>

In [None]:
# Open the tab-separated time-series table as a Dask dataframe.
df = dd.read_csv(
    "./Data/time_series_data/df_timeseries_en.tsv",
    sep="\t",
)

# One-off Parquet export, kept disabled (presumably already run once — confirm):
# df.to_parquet('./Data/time_series_data/parquet/')

In [None]:
# Preview the first rows of the time-series dataframe read in the previous cell.
df.head()