# Combine raw data files, per hour, into CSVs

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os

from dotenv import find_dotenv, load_dotenv

In [3]:
%aimport src.data.combine_data
import src.data.combine_data as cdh

In [4]:
# Load the variables defined in the .env file located by find_dotenv() into the
# process environment; presumably this is where AWS_REGION (read with
# os.getenv() further down) comes from - TODO confirm
load_dotenv(find_dotenv())

True

## About

Here, the raw tweet data collected during each hour will be combined into a single CSV file and stored in a sub-folder of the S3 bucket at the path `datasets/twitter/kinesis-demo/csvs`. The following datetime columns will be reformatted to the `yyyy-mm-dd HH:MM:SS` format:
- `created_at`
  - datetime at which the tweet was posted on the Twitter platform
- `user_joined`
  - datetime at which the user joined the Twitter platform

## User Inputs

In [5]:
# S3 bucket holding the data, and the folder inside it that this notebook works in.
# The folder is written with a leading "/", which the cells below strip with
# path_to_folder[1:] before using it
s3_bucket_name = "sagemakertestwillz3s"
path_to_folder = "/datasets/twitter/kinesis-demo/"

# Column names, in order, passed to cdh.save_data_and_metadata_to_s3_csv().
# Grouped by the kind of attribute they describe
headers = (
    # Tweet
    [
        "id",
        "geo",
        "coordinates",
        "place",
        "contributors",
        "is_quote_status",
        "quote_count",
        "reply_count",
        "retweet_count",
        "favorite_count",
        "favorited",
        "retweeted",
        "created_at",
        "source",
        "in_reply_to_user_id",
        "in_reply_to_screen_name",
        "source_text",
    ]
    # Place
    + [
        "place_id",
        "place_url",
        "place_place_type",
        "place_name",
        "place_full_name",
        "place_country_code",
        "place_country",
        "place_bounding_box_type",
        "place_bounding_box_coordinates",
        "place_attributes",
    ]
    # Coordinates and geo
    + [
        "coords_type",
        "coords_lon",
        "coords_lat",
        "geo_type",
        "geo_lon",
        "geo_lat",
    ]
    # User
    + [
        "user_name",
        "user_screen_name",
        "user_followers",
        "user_friends",
        "user_listed",
        "user_favourites",
        "user_statuses",
        "user_protected",
        "user_verified",
        "user_contributors_enabled",
        "user_joined",
        "user_location",
    ]
    # Tweet text
    + ["text"]
)

In [6]:
# AWS region taken from the environment; None when AWS_REGION is not set
aws_region = os.environ.get("AWS_REGION")

## Combine raw data into CSV files

### Create `csvs/` S3 sub-folder in `<bucket-name>/datasets/twitter/kinesis-demo`

In [8]:
# Make sure the csvs folder exists in the bucket. When it is already there the
# helper reports "Found existing folder ... Did nothing." (see the output below)
cdh.create_folder_in_s3_bucket(aws_region, s3_bucket_name, "csvs")

Found existing folder csvs in sagemakertestwillz3s. Did nothing.


### Combine files into single CSV file in `csvs/`

Get the list of all CSV files that already exist in `csvs/`, so that hours which have already been combined are not processed again

In [10]:
# List what is already stored under csvs/. path_to_folder[1:] drops the leading
# "/" of the folder path. The "tweets_" prefix also matches the
# tweets_metadata_*.csv files, so both kinds of file appear in the result
existing_csv_files_list = cdh.get_existing_csv_files_list(
    s3_bucket_name,
    aws_region,
    f"{path_to_folder[1:]}csvs/tweets_",
)
existing_csv_files_list

['datasets/twitter/kinesis-demo/csvs/tweets_14_hc2021122702_s20211227195459.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_4_hc2021122703_s20211227195504.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_metadata_14_hc2021122702_s20211227195459.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_metadata_4_hc2021122703_s20211227195504.csv']

Combine the raw data files, per hour, into a single CSV file

In [12]:
%%time
dfs_all, dfs_metadata_all = [[], []]
# Loop over <bucket-name>/datasets/twitter/kinesis-demo
for content in cdh.get_objects_in_one_s3_level(
    s3_bucket_name, path_to_folder[1:], aws_region
).get("CommonPrefixes"):
    # Check if <bucket-name>/datasets/twitter/kinesis-demo/<year> exists
    if "csvs" not in cdh.get_objects_in_one_s3_level(
        s3_bucket_name, content, aws_region
    ).get("Prefix"):
        # Loop over <bucket-name>/datasets/twitter/kinesis-demo/<year>
        for content_year in cdh.get_objects_in_one_s3_level(
            s3_bucket_name, content, aws_region
        ).get("CommonPrefixes"):
            # Loop over <bucket-name>/datasets/twitter/kinesis-demo/<year>/<month>
            for content_month in cdh.get_objects_in_one_s3_level(
                s3_bucket_name, content_year, aws_region
            ).get("CommonPrefixes"):
                # Loop over <bucket-name>/datasets/twitter/kinesis-demo/<year>/<month>/<day>
                for content_day in cdh.get_objects_in_one_s3_level(
                    s3_bucket_name, content_month, aws_region
                ).get("CommonPrefixes"):
                    # Save tweets data and file metadata to
                    # <bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_*.csv and
                    # csvs/tweets_metadata_*.csv
                    cdh.save_data_and_metadata_to_s3_csv(
                        content_day.get("Prefix"),
                        existing_csv_files_list,
                        s3_bucket_name,
                        headers,
                        content_day,
                        path_to_folder[1:],
                        aws_region,
                    )

Found CSV file in datasets/twitter/kinesis-demo/csvs/tweets_14_hc2021122702_s20211227195459.csv. Did nothing.
Found CSV file in datasets/twitter/kinesis-demo/csvs/tweets_4_hc2021122703_s20211227195504.csv. Did nothing.
CPU times: user 208 ms, sys: 5.43 ms, total: 214 ms
Wall time: 1.07 s
