# <span style="color:blue">Shuffle and Split bins</span>

The raw data provided by UOC includes data from Jul 2019 until Dec 2019.  
  
The purpose of this notebook is:  
- Generate data for Jan 2019 until Jun 2019 by moving half of each original month into a new month  
- Split the original data (months) into smaller datasets  

### Load packages

In [1]:
import csv, glob, json, os
import pandas as pd
from flatten_json import flatten

### Constants

In [2]:
# seed handed to the shuffle so that repeated runs produce the same split
random_state = 33
# when True, messages are generated while the process runs
debug = False
# when True, the process runs on a smaller subset of raw data (json files)
test = False

### Paths

In [3]:
# Folder where the doc & meta csv files are stored; the "bins_test" folder is
# used instead of "bins" when the `test` flag is on.
# NOTE(review): machine-specific absolute path - adjust it for your environment.
path = (
    r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins_test'
    if test
    else r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins'
)

### Function to shuffle and split months/bins

Example:  
- From the input dataset \data\bins\doc\2019_Dec (original number of rows)  
- Generate the output dataset \data\bins\doc\2019_Dec (original number of rows / 2)  
- Generate the output dataset \data\bins\doc\2019_Jun (original number of rows / 2)  
  
  
- From the input dataset \data\bins\meta\2019_Dec (original number of rows)  
- Generate the output dataset \data\bins\meta\2019_Dec (original number of rows / 2)  
- Generate the output dataset \data\bins\meta\2019_Jun (original number of rows / 2)  

In [4]:
def shuffle_split(path, yyyy_MMM_1, yyyy_MMM_2, seed=None):
    """Shuffle one month/bin and split it into two month/bin datasets.

    Reads ``<path>/doc/<yyyy_MMM_1>/<yyyy_MMM_1>_original.csv`` and
    ``<path>/meta/<yyyy_MMM_1>/<yyyy_MMM_1>_original.csv`` (both without a
    header row), shuffles the documents and writes:

    - the first half of the documents to ``doc/<yyyy_MMM_1>/<yyyy_MMM_1>.csv``
    - the remaining documents to ``doc/<yyyy_MMM_2>/<yyyy_MMM_2>.csv``
    - the metadata rows whose ``id_meta`` matches each half's ``id_doc`` to the
      corresponding ``meta/<yyyy_MMM>/<yyyy_MMM>.csv`` files.

    When the number of documents is odd, the extra row goes to the second
    dataset, so no document is lost by the split.

    Parameters
    ----------
    path : str
        Folder that contains the ``doc`` and ``meta`` subfolders.
    yyyy_MMM_1 : str
        Name of the source month/bin (e.g. ``'2019_Dec'``); it also receives
        the first half of the shuffled rows.
    yyyy_MMM_2 : str
        Name of the month/bin that receives the remaining rows; its folders
        are created if they do not exist.
    seed : int, optional
        Seed for the shuffle. Defaults to the notebook-level ``random_state``.

    Returns
    -------
    None
        The results are written to csv files (no index, no header).
    """
    if seed is None:
        # fall back to the notebook-wide constant, as the function always did
        seed = random_state

    meta_columns = [
        'id_meta', 'file', 'author_followers', 'author_full_name', 'author_id',
        'author_image', 'author_name', 'author_url', 'date',
        'date_from_provider', 'id', 'id_from_provider', 'image_url', 'link',
        'location_latitude', 'location_longitude', 'place_country_code',
        'place_name', 'place_street_address', 'provider', 'social_likes',
        'social_replies',
    ]

    def bin_csv(kind, yyyy_MMM, suffix=''):
        # <path>/<kind>/<yyyy_MMM>/<yyyy_MMM><suffix>.csv
        return os.path.join(path, kind, yyyy_MMM, yyyy_MMM + suffix + '.csv')

    # load both inputs before writing anything, so that a missing or broken
    # input file does not leave half-written outputs behind
    df_doc = pd.read_csv(bin_csv('doc', yyyy_MMM_1, '_original'),
                         names=['id_doc', 'content'])
    df_meta = pd.read_csv(bin_csv('meta', yyyy_MMM_1, '_original'),
                          names=meta_columns)

    # shuffle dataset
    df_doc = df_doc.sample(frac=1, random_state=seed)

    # split at a single index: head(n/2) + tail(n/2) would drop the middle
    # row whenever the number of rows is odd
    half = len(df_doc) // 2
    doc_parts = [
        (yyyy_MMM_1, df_doc.iloc[:half]),
        (yyyy_MMM_2, df_doc.iloc[half:]),
    ]

    for yyyy_MMM, df_doc_part in doc_parts:
        # make sure the doc\yyyy_MMM and meta\yyyy_MMM subfolders exist
        for kind in ('doc', 'meta'):
            os.makedirs(os.path.join(path, kind, yyyy_MMM), exist_ok=True)

        df_doc_part.to_csv(bin_csv('doc', yyyy_MMM), index=False, header=False)

        # keep only the metadata of the documents in this part (inner join),
        # then drop the join key so that only the metadata columns remain
        df_meta_part = pd.merge(
            left=df_doc_part[['id_doc']],
            right=df_meta,
            left_on='id_doc',
            right_on='id_meta',
        ).drop(columns='id_doc')
        df_meta_part.to_csv(bin_csv('meta', yyyy_MMM), index=False, header=False)

### Split months/bins

From **2019_Jul_original** generate **2019_Jul** and **2019_Jan**  
From **2019_Aug_original** generate **2019_Aug** and **2019_Feb**  
From **2019_Sep_original** generate **2019_Sep** and **2019_Mar**  
From **2019_Oct_original** generate **2019_Oct** and **2019_Apr**  
From **2019_Nov_original** generate **2019_Nov** and **2019_May**  
From **2019_Dec_original** generate **2019_Dec** and **2019_Jun**  

In [5]:
# each (source month, generated month) pair is processed in calendar order
for yyyy_MMM_1, yyyy_MMM_2 in (
    ('2019_Jul', '2019_Jan'),
    ('2019_Aug', '2019_Feb'),
    ('2019_Sep', '2019_Mar'),
    ('2019_Oct', '2019_Apr'),
    ('2019_Nov', '2019_May'),
    ('2019_Dec', '2019_Jun'),
):
    shuffle_split(path, yyyy_MMM_1, yyyy_MMM_2)