# <span style="color:blue">Load data (files in folder influencers_1_de_2)</span>

The purpose of this notebook is to load the provided raw data, located inside the folders **\data\influencers_1_de_2** and **\data\influencers_2_de_2**.  
  
This code generates several folders depending on the year and month of each document. Inside the folder **data\bins**, subfolders are created with the name structures **doc\yyyy_MMM** and **meta\yyyy_MMM**.  
  
The files inside the **doc** subfolders contain the text of the messages from Instagram.  

The files inside the **meta** subfolders contain the metadata (date, author name, likes, etc.) of the messages from Instagram.  
    
Inside each of these **yyyy_MMM** subfolders, a **csv** file with the same name is created: **yyyy_MMM.csv**.  
  
The final structure of folders and files will be:  
- data\bins\doc\2019_Jan\2019_Jan.csv  
- data\bins\doc\2019_Feb\2019_Feb.csv  
- ...  
- data\bins\doc\2019_Nov\2019_Nov.csv  
- data\bins\doc\2019_Dec\2019_Dec.csv  
  
  
- data\bins\meta\2019_Jan\2019_Jan.csv  
- data\bins\meta\2019_Feb\2019_Feb.csv  
- ...  
- data\bins\meta\2019_Nov\2019_Nov.csv  
- data\bins\meta\2019_Dec\2019_Dec.csv  

In [1]:
#https://github.com/Jokiph3r/Jsonl-to-csv/blob/master/jsonl-to-csv.py

### Load packages

In [2]:
import csv, glob, json, os
import pandas as pd
from flatten_json import flatten

### Constants

In [3]:
# if debug is True, each raw line and its flattened record are printed while loading
debug = False
# if test is True, the process runs on a smaller subset of raw data (json files)
test = False
# running row counter shared by the loading cells: it starts at 0 and is
# incremented before each message is written, so the first id written is 1.
# The same value is written to the doc row and the meta row of a message,
# which is what links the two files.
# NOTE: this name shadows Python's built-in id() for the rest of the notebook.
id = 0

# three-letter month abbreviations, indexed by (month number - 1);
# used to build the yyyy_MMM folder and file names
month_index_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

### Paths (input and output)

In [4]:
# Folders for this part of the load, chosen in a single branch:
#   input_path  - where the raw jsonl files are read from
#   output_path - where the csv files are generated
# When `test` is set, both point at the reduced test data set.
if test:
    input_path = r'D:\master\data science\semestre 4\M2.979 - tfm\data\influencers_test_1_de_2'
    output_path = r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins_test'
else:
    input_path = r'D:\master\data science\semestre 4\M2.979 - tfm\data\influencers_1_de_2'
    output_path = r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins'

### Loading data process

In [5]:
# Split every message found in the jsonl files under input_path into monthly csv files:
#   <output_path>/doc/yyyy_MMM/yyyy_MMM.csv   -> id, message text
#   <output_path>/meta/yyyy_MMM/yyyy_MMM.csv  -> id, source file, metadata fields
# Rows are appended and `id` keeps counting from its current value, so running
# this cell twice duplicates rows; clear the output folder before a re-run.

# Collect the jsonl files, including those in nested subfolders. '**' has to be
# its own path component to recurse: gluing it onto the folder name
# (input_path + "**/*.jsonl") disables recursion and also matches sibling
# folders that merely share the same name prefix.
# Sorting makes the id assigned to each message reproducible between runs.
files = sorted(glob.glob(os.path.join(input_path, '**', '*.jsonl'), recursive=True))

# metadata columns written after the id and the source file, in output order
meta_keys = [
    'author_followers', 'author_full_name', 'author_id', 'author_image',
    'author_name', 'author_url', 'date', 'date_from_provider', 'id',
    'id_from_provider', 'image_url', 'link', 'location_latitude',
    'location_longitude', 'place_country_code', 'place_name',
    'place_street_address', 'provider', 'social_likes', 'social_replies',
]

for f in files:
    # decode explicitly as UTF-8 instead of relying on the locale-dependent
    # default encoding, which can corrupt non-ASCII message text
    with open(f, mode='r', encoding='utf-8') as file:
        for line in file:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue
            # flatten the nested json record into a single-level dict
            data = json.loads(line)
            data_flatten = flatten(data)
            # debug line to check the progress while executing
            if debug:
                print(line, data_flatten)
            # parse the date once and derive the yyyy_MMM bin name from it
            doc_date = pd.to_datetime(data_flatten['date'])
            bin_name = str(doc_date.year) + '_' + month_index_list[doc_date.month - 1]
            # make sure the doc\yyyy_MMM and meta\yyyy_MMM subfolders exist
            doc_dir = os.path.join(output_path, 'doc', bin_name)
            meta_dir = os.path.join(output_path, 'meta', bin_name)
            os.makedirs(doc_dir, exist_ok=True)
            os.makedirs(meta_dir, exist_ok=True)
            # increment id (to link document and metadata folders/files)
            id += 1
            # append the message text to the doc csv file (line breaks flattened to spaces)
            path = os.path.join(doc_dir, bin_name + '.csv')
            with open(path, mode='a', newline='', encoding='utf-8') as f1:
                csv.writer(f1).writerow([
                    id,
                    data_flatten['content'].replace('\n', ' ').replace('\r', ' ')
                ])
            # append the metadata to the meta csv file; the second column is the
            # path of the source jsonl file (previously the open file handle was
            # written, which stored its repr rather than the path)
            path = os.path.join(meta_dir, bin_name + '.csv')
            with open(path, mode='a', newline='', encoding='utf-8') as f2:
                csv.writer(f2).writerow([id, f] + [data_flatten[key] for key in meta_keys])

print('id - ', str(id))

id -  1607742


# <span style="color:blue">Load data (files in folder influencers_2_de_2)</span>

### Paths (input and output)

In [6]:
# input data - folder holding the second batch of raw jsonl files
input_path = (
    r'D:\master\data science\semestre 4\M2.979 - tfm\data\influencers_test_2_de_2'
    if test
    else r'D:\master\data science\semestre 4\M2.979 - tfm\data\influencers_2_de_2'
)

# output data - folder where the csv files are generated
output_path = (
    r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins_test'
    if test
    else r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins'
)

### Loading data process

In [7]:
# Split every message found in the jsonl files under input_path into monthly csv files:
#   <output_path>/doc/yyyy_MMM/yyyy_MMM.csv   -> id, message text
#   <output_path>/meta/yyyy_MMM/yyyy_MMM.csv  -> id, source file, metadata fields
# Rows are appended and `id` keeps counting from the value left by the cells
# run before this one, so running this cell twice duplicates rows.

# Collect the jsonl files, including those in nested subfolders. '**' has to be
# its own path component to recurse: gluing it onto the folder name
# (input_path + "**/*.jsonl") disables recursion and also matches sibling
# folders that merely share the same name prefix.
# Sorting makes the id assigned to each message reproducible between runs.
files = sorted(glob.glob(os.path.join(input_path, '**', '*.jsonl'), recursive=True))

# metadata columns written after the id and the source file, in output order
meta_keys = [
    'author_followers', 'author_full_name', 'author_id', 'author_image',
    'author_name', 'author_url', 'date', 'date_from_provider', 'id',
    'id_from_provider', 'image_url', 'link', 'location_latitude',
    'location_longitude', 'place_country_code', 'place_name',
    'place_street_address', 'provider', 'social_likes', 'social_replies',
]

for f in files:
    # decode explicitly as UTF-8 instead of relying on the locale-dependent
    # default encoding, which can corrupt non-ASCII message text
    with open(f, mode='r', encoding='utf-8') as file:
        for line in file:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue
            # flatten the nested json record into a single-level dict
            data = json.loads(line)
            data_flatten = flatten(data)
            # debug line to check the progress while executing
            if debug:
                print(line, data_flatten)
            # parse the date once and derive the yyyy_MMM bin name from it
            doc_date = pd.to_datetime(data_flatten['date'])
            bin_name = str(doc_date.year) + '_' + month_index_list[doc_date.month - 1]
            # make sure the doc\yyyy_MMM and meta\yyyy_MMM subfolders exist
            doc_dir = os.path.join(output_path, 'doc', bin_name)
            meta_dir = os.path.join(output_path, 'meta', bin_name)
            os.makedirs(doc_dir, exist_ok=True)
            os.makedirs(meta_dir, exist_ok=True)
            # increment id (to link document and metadata folders/files)
            id += 1
            # append the message text to the doc csv file (line breaks flattened to spaces)
            path = os.path.join(doc_dir, bin_name + '.csv')
            with open(path, mode='a', newline='', encoding='utf-8') as f1:
                csv.writer(f1).writerow([
                    id,
                    data_flatten['content'].replace('\n', ' ').replace('\r', ' ')
                ])
            # append the metadata to the meta csv file; the second column is the
            # path of the source jsonl file (previously the open file handle was
            # written, which stored its repr rather than the path)
            path = os.path.join(meta_dir, bin_name + '.csv')
            with open(path, mode='a', newline='', encoding='utf-8') as f2:
                csv.writer(f2).writerow([id, f] + [data_flatten[key] for key in meta_keys])

print('id - ', str(id))

id -  3257290
