# Combine raw data files, per hour, into CSVs

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
from datetime import datetime
from zoneinfo import ZoneInfo

import boto3
import pandas as pd
from dotenv import find_dotenv, load_dotenv

In [3]:
%aimport src.data.combine_data
import src.data.combine_data as cdh

In [4]:
load_dotenv(find_dotenv())

True

In [5]:
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME")

## About

Here, raw tweets (twitter data) that were streamed per hour (using `twitter_s3.py`) will be combined (using `pandas.concat()`) into a single CSV file per hour and stored in a sub-folder, in the same S3 bucket as the raw twitter data, at the path `<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs`. The formatting of the following datetime columns will be changed to the `yyyy-mm-dd HH:MM:SS` format (although this conversion should have been done in the twitter streaming script)
- `created_at`
  - datetime at which the tweet was posted on Twitter platform
- `user_joined`
  - datetime at which user joined Twitter platform

Also
- a subset of the tweet metadata will be selected for this version of the analysis. This subset (columns) will be exported to each CSV file
- hashtags and usernames were extracted from the tweet text and stored in separate fields in the raw data exported to S3, and they will not be used in the current analysis (though they can be combined with the text in future iterations of this project)

## User Inputs

In [6]:
# S3
path_to_folder = "/datasets/twitter/kinesis-demo/"
sub_folders_list = ["csvs", "predictions", "athena-queries-outputs", "models"]

# List of headers for all streamed twitter attributes
headers = [
    "id",
    "geo",
    "coordinates",
    "place",
    "contributors",
    "is_quote_status",
    "quote_count",
    "reply_count",
    "retweet_count",
    "favorite_count",
    "favorited",
    "retweeted",
    "created_at",
    "source",
    "in_reply_to_user_id",
    "in_reply_to_screen_name",
    "source_text",
    "place_id",
    "place_url",
    "place_place_type",
    "place_name",
    "place_full_name",
    "place_country_code",
    "place_country",
    "place_bounding_box_type",
    "place_bounding_box_coordinates",
    "place_attributes",
    "coords_type",
    "coords_lon",
    "coords_lat",
    "geo_type",
    "geo_lon",
    "geo_lat",
    "user_name",
    "user_screen_name",
    "user_followers",
    "user_friends",
    "user_listed",
    "user_favourites",
    "user_statuses",
    "user_protected",
    "user_verified",
    "user_contributors_enabled",
    "user_joined",
    "user_location",
    "retweeted_tweet",
    "tweet_text_urls",
    "tweet_text_hashtags",
    "tweet_text_usernames",
    "num_urls_in_tweet_text",
    "num_users_in_tweet_text",
    "num_hashtags_in_tweet_text",
    "text",
]

# List of twitter attributes to use in this version of the analysis
cols_to_use = [
    "id",
    "contributors",
    "created_at",
    "source",
    "in_reply_to_screen_name",
    "source_text",
    "place_id",
    "place_url",
    "place_place_type",
    "place_country_code",
    "place_country",
    "user_name",
    "user_screen_name",
    "user_followers",
    "user_friends",
    "user_listed",
    "user_favourites",
    "user_statuses",
    "user_protected",
    "user_verified",
    "user_joined",
    "user_location",
    "retweeted_tweet",
    "text",
]

# List of partial strings to use to filter out unwanted tweets
# - tweets containing sensitive tweet texts that were found retrospectively
#   and should be excluded from the CSV files
unwanted_partial_strings_list = [
    # Every entry is a lower-case partial string without surrounding punctuation
    # and must appear only once in this list
    # specific to crypto mining
    "crypto",
    "token",
    "koistarter",
    "daostarter",
    "decentralized",
    "services",
    "pancakeswap",
    "eraxnft",
    "browsing",
    "kommunitas",
    "hosting",
    "internet",
    "exipofficial",
    "servers",
    "wallet",
    "liquidity",
    "rewards",
    "floki",
    "10000000000000linkstelegram",
    "dogecoin",
    "czbinance",
    "watch",
    "binance",
    "dogelonmars",
    "cryptocurrency",
    "hbomax",
    "money",
    "danheld",
    "cybersecurity",
    # others
    "prostitution",
    "nairobi",
    "musembe",
    "volcano detected",
    "block-2",
    "mo-greene",
    "running scared2012",
    "running scared 2012",
    "massacres",
    "eric ephriam chavez",
    "drugs",
    "tanzanite",
    "vvsorigin",
    "gemstonecarat",
    "bin laden",
    "saddam",
    "webuye",
    "bungoma",
    "perished",
    "popescu",
    "whore",
    "nasty",
    "ethereum",
    "pay someone",
    "gamejoin",
    "nft",
    "breeding",
    "seungkwan",
    "woozi",
    "hoshi",
    "bitcrush",
    "arcade",
    "homeworkpay",
    "homework",
    "photocards",
    "deta",
    "marketing",
    "dreamcast",
    "sega",
    "xbox",
    "wii",
    "ps4",
    "kasama",
    "nung",
    "lahat",
    "jinsoul",
    "brunisoul",
    "loona",
    "taas",
    "essay",
    # religious
    "scriptures",
    "methusealah",
    "testament",
    "yahweh",
    "god",
    "mullah",
    "allah",
    "clergy",
    "mercy",
    "morality",
    # no trailing comma inside the string, otherwise only "muslims," followed
    # by a comma in the tweet text would be matched
    "muslims",
    "hindus",
    "buddhist",
    "catholics",
    "christians",
    "atheist",
    # inappropriate
    "nazist",
    "antifa",
    "proud boys",
]

In [7]:
aws_region = os.getenv("AWS_REGION")

## Combine raw data into CSV files

### Create sub-folders

Create the following sub-folders at the same depth as the S3 folders where the Kinesis Firehose exports raw data by year (i.e. in `<bucket-name>/datasets/twitter/kinesis-demo`)
- `csvs`
- `predictions`
- `athena-queries-outputs`
- `models`

In [8]:
%%time
for sub_folder_name in sub_folders_list:
    cdh.create_folder_in_s3_bucket(aws_region, s3_bucket_name, sub_folder_name)

Found existing folder csvs in specified S3 bucket. Did nothing.
Found existing folder predictions in specified S3 bucket. Did nothing.
Found existing folder athena-queries-outputs in specified S3 bucket. Did nothing.
Found existing folder models in specified S3 bucket. Did nothing.
CPU times: user 168 ms, sys: 30.7 ms, total: 198 ms
Wall time: 784 ms


### Combine files into single CSV file in `csvs/`

Get list of all CSV files in `csvs/`

```python
# src/data/combine_data.py
def get_existing_csv_files_list(
    s3_bucket_name: str, region: str, prefix: str
) -> List[str]:
    """Get list of files in subfolder in S3 bucket, by filename prefix.

    Parameters
    ----------
    s3_bucket_name : str
        name of the S3 bucket to search
    region : str
        AWS region used to create the boto3 S3 resource
    prefix : str
        leading part of the object key (path inside the bucket) to match,
        eg. "datasets/twitter/kinesis-demo/csvs/tweets_"

    Returns
    -------
    List[str]
        key (path inside the bucket, without the bucket name) of every
        object found with the specified prefix
    """
    s3_resource = boto3.resource("s3", region_name=region)
    bucket = s3_resource.Bucket(s3_bucket_name)
    # Get list of objects whose key starts with the user-specified prefix
    files_found_objects_list = list(bucket.objects.filter(Prefix=prefix))
    # For each object, get the object's key (a string) from its .key attribute
    # and store these keys in a list
    files_found_names_list = [w.key for w in files_found_objects_list]
    return files_found_names_list
```

In [9]:
existing_csv_files_list = cdh.get_existing_csv_files_list(
    s3_bucket_name, aws_region, path_to_folder[1:] + "csvs/tweets_"
)
existing_csv_files_list

['datasets/twitter/kinesis-demo/csvs/tweets_15_hc2021123017_s20220106193315.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_40_hc2021123118_s20220106193321.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_57_hc2022010416_s20220106195607.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_58_hc2021123119_s20220106193331.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_58_hc2021123122_s20220106193423.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010100_s20220106193454.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010104_s20220106193556.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010107_s20220106193641.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010109_s20220106193710.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010112_s20220106193752.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010116_s20220106193852.csv',
 'datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010120_s20220106193954.csv',
 'datasets/twitt

Using a custom Python module in `src/data/combine_data.py` (which will be discussed in the next section), combine raw data files, per hour, into a single CSV file (using `pandas.concat()`). This module will perform the following steps
- get list of all files exported by AWS Kinesis Firehose to the S3 bucket during a given hour
- read in all files in the S3 bucket during a given hour
  - over multiple hours, this creates a nested list where each inner list is also a nested list corresponding to tweets collected during 60 seconds (since the Kinesis firehose was programmed to dump data to the AWS S3 bucket after each 60 second period)
- use `pandas.concat()` to create a single `pandas.DataFrame` from the inner nested list (multiple files each holding data dumped every 60 seconds)
- vertically concatenate all `pandas.DataFrame`s during a given hour into a single `DataFrame`
- filter out tweets with sensitive terms that were retrospectively discovered (when examining the results of topic modeling in the next notebook `4_data_processing.ipynb`)
- export the `DataFrame` to a single CSV file in the `csvs/` sub-folder in the same AWS S3 bucket

which will be repeated for every hour during which the Kinesis Firehose produced data files in the S3 bucket

In [None]:
%%time
# Values that stay fixed for the whole run are computed once, before the loops,
# so that every hourly folder is compared against the same UTC cut-off day
# (re-reading the clock inside the loop could change the cut-off mid-run)
sub_folder_prefixes = [f"{f}/" for f in sub_folders_list]
curr_day_of_month_utc = datetime.now(tz=ZoneInfo("UTC")).day
prev_days_of_month = range(1, curr_day_of_month_utc)


def list_child_prefixes(parent):
    """Return the "CommonPrefixes" entries found one level below `parent`.

    `parent` is either a prefix string or a dict with a "Prefix" key (both are
    accepted by `cdh.get_objects_in_one_s3_level`). An empty list is returned
    when the listing has no "CommonPrefixes" entry, so that the loops below
    skip that level instead of raising a `TypeError` when iterating over `None`.
    """
    listing = cdh.get_objects_in_one_s3_level(s3_bucket_name, parent, aws_region)
    return listing.get("CommonPrefixes") or []


# Loop over <bucket-name>/datasets/twitter/kinesis-demo
for year_folder in list_child_prefixes(path_to_folder[1:]):
    # Skip the output sub-folders (csvs/, predictions/, etc.) that are found at
    # the same depth as the <year> folders
    if year_folder["Prefix"].split("/", 3)[-1] in sub_folder_prefixes:
        continue
    # Loop over <bucket-name>/datasets/twitter/kinesis-demo/<year>
    for month_folder in list_child_prefixes(year_folder):
        # Loop over <bucket-name>/datasets/twitter/kinesis-demo/<year>/<month>
        for day_folder in list_child_prefixes(month_folder):
            # Get month and day from the last two components of
            # datasets/twitter/kinesis-demo/<year>/<month>/<day>/
            month, day = [
                int(component)
                for component in day_folder["Prefix"][:-1].split("/", 5)[-2:]
            ]
            # Get masks to filter raw data that will be combined
            # # December
            dec_mask = month == 12
            # # January, all days before the current day (UTC)
            jan_prev_days_mask = month == 1 and day in prev_days_of_month
            # # January, current day (UTC)
            is_jan_curr_day = month == 1 and day == curr_day_of_month_utc
            # NOTE(review): this counts the hourly folders combined so far for
            # this day, not the UTC hour in the folder name; the two only agree
            # when the day's folders start at hour 00 - confirm this is the
            # intended cut-off
            utc_hour_counter = 0
            # Loop over <bucket-name>/datasets/twitter/kinesis-demo/<year>/<month>/<day>
            for hour_folder in list_child_prefixes(day_folder):
                # # on the current day, only the first two hourly folders are used
                jan_curr_day_mask = is_jan_curr_day and utc_hour_counter <= 1
                if not (dec_mask or jan_prev_days_mask or jan_curr_day_mask):
                    continue
                # Combine raw data into hourly CSVs and save tweets
                # - data to <bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_*.csv
                #   - this will be filtered to remove unwanted terms in the text of the tweet
                # - metadata to <bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_metadata_*.csv
                #   - this will not be filtered
                cdh.save_data_and_metadata_to_s3_csv(
                    hour_folder.get("Prefix"),
                    existing_csv_files_list,
                    s3_bucket_name,
                    headers,
                    hour_folder,
                    path_to_folder[1:],
                    aws_region,
                    cols_to_use,
                    unwanted_partial_strings_list,
                    combine_hashtags_usernames=False,
                    aggregate_metadata=False,
                )
                utc_hour_counter += 1

## Inspect data in the combined CSV files

### Get List of CSV files with data

In [10]:
# Re-list the CSV files (the combine step above may have added new ones), drop
# the metadata CSVs and prepend the bucket name to every remaining key
csv_keys_found = cdh.get_existing_csv_files_list(
    s3_bucket_name, aws_region, path_to_folder[1:] + "csvs/tweets_"
)
files_list = [
    f"{s3_bucket_name}/{csv_key}"
    for csv_key in csv_keys_found
    if "metadata" not in csv_key
]

**Notes**
1. This will produce a list with the following entries
   ```python
   ['<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_15_hc2021123017_s20220102120517.csv',
    '<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_15_hc2022010217_s20220102121623.csv',
    '<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_40_hc2021123118_s20220102120522.csv',
    '<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_58_hc2021123119_s20220102120532.csv',
    '<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_58_hc2021123122_s20220102120615.csv',
    '<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010100_s20220102120644.csv',
    '<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010104_s20220102120745.csv',
    '<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010107_s20220102120828.csv',
    '<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010109_s20220102120856.csv',
    '<s3-bucket-name>/datasets/twitter/kinesis-demo/csvs/tweets_58_hc2022010112_s20220102120939.csv',
    .
    .
    .
    .
   ```

### View slice of the data

Show the following
- subset of columns for the first 10 rows (tweets) of data from the combined CSV files
- number of tweets in every raw file exported from the Firehose stream to each (hourly) folder in the S3 bucket

In [11]:
cols_to_show = [
    "created_at",
    "user_joined",
    "user_name",
    "user_screen_name",
    "text",
    "file_name",
]

In [12]:
%%time
# Read the first three combined (hourly) CSV files from S3 and stack them
# vertically into a single DataFrame
hourly_frames = [
    pd.read_csv("s3://" + f, lineterminator="\n") for f in files_list[:3]
]
df = pd.concat(hourly_frames, ignore_index=True)
print(f"Total number of tweets retrieved = {len(df):,}, Number of columns in the data = {df.shape[1]:,}")
# Number of tweets found per raw (Firehose) file, ordered by file name
tweets_per_file = (
    df["file_name"]
    .value_counts()
    .rename("num_tweets")
    .sort_index()
    .reset_index()
    .rename(columns={"index": "file_name"})
)
# Widen the column display so that the full text of each tweet is shown
with pd.option_context("display.max_colwidth", 2_000):
    display(df[cols_to_show].head(n=10))
    display(tweets_per_file)
list(df)

Total number of tweets retrieved = 11,100, Number of columns in the data = 25


Unnamed: 0,created_at,user_joined,user_name,user_screen_name,text,file_name
0,2021-12-30 17:35:53+00:00,2021-11-27 20:38:14+00:00,Happy New Years 2022,GuessWho122021,"Space colonists may turn to cannibalism, scientists warn:The first generations to occupy space outposts may be left with few nutrition choices, scholars believe",twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
1,2021-12-30 17:35:54+00:00,2018-09-04 17:18:46+00:00,TheToysRusKid,JC1of1,1. Beat Street2. Harlem Knights 3. National Lampoons Vacation 4. Bachelor Party 5. Do The right Thing6. How High7. Weird Science 8. Clerks 9. Beverly Hills Cop10. Goodfellas,twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
2,2021-12-30 17:35:55+00:00,2016-05-25 10:54:46+00:00,BCABA Network,BcabaNetwork,NASA Plans Coverage of Webb Space Telescope Deployments,twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
3,2021-12-30 17:35:55+00:00,2009-02-21 14:48:52+00:00,Jude Jackson 💙 Solidarity with #NHS workers,JudeJack,"Exciting times!Back in 1915, Einstein published his theory of General Relativity. 1917 the Bolsheviks led the worlds’ first proletarian revolution. Humanity is about to peer back to the beginning of time, and the world is poised once again for revolution.",twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
4,2021-12-30 17:35:55+00:00,2008-07-26 17:50:37+00:00,Alan Stern,AlanStern,"Obviously, since science is a mental mode of nature and Pluto doesn’t have a mind. But having it is the very essence of science and that requires functionally useful concepts like “planet”. Non-functional concepts hurt the progress of science and the public’s insights into it.",twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
5,2021-12-30 17:35:56+00:00,2016-09-12 07:35:14+00:00,Nhada Naim,kognomee_,Jupiter is in Penis and Vagina is stationed retrograde. Wear a condom ya’ll,twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
6,2021-12-30 17:35:56+00:00,2008-12-23 19:34:02+00:00,Themistoclea (Bonnie DeVarco),Themistoclea,"How 'bout we squeeze in some good news before the year ends? Not only did deploy its tower assembly, the team also said the telescope's launch was so precise that we should expect the science to continue well beyond 10 years!",twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
7,2021-12-30 17:35:58+00:00,2009-07-14 05:10:36+00:00,Radio Justice 📻🎙⚖,justiceputnam,"NASA: It wasn't a strike, it was just a work slowdown.",twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
8,2021-12-30 17:35:58+00:00,2016-04-03 20:59:45+00:00,Kwaku Barcelona,o_theophilus1,"I was being interviewed by Elon Musk. He asked, ""where are you from"", and I said Portugal. He replied, ""so you are a fellow country man of a Pen merchant whose freekick ball broke my rover on Mars. Get out!"". Tears ran down my face. Shame on you Pendu for costing me my dream job",twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
9,2021-12-30 17:35:59+00:00,2017-11-03 12:41:31+00:00,The Buckshee,BucksheeForum,Webb telescope is captured soaring through space from Earth,twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe


Unnamed: 0,file_name,num_tweets
0,twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe,118
1,twitter_delivery_stream-1-2021-12-30-17-37-00-03050126-c5a3-457a-9141-44c723e9a16e,127
2,twitter_delivery_stream-1-2021-12-30-17-38-01-ee043884-55a1-4696-afe2-ae744d931748,107
3,twitter_delivery_stream-1-2021-12-30-17-39-01-45daac32-de3f-4189-b312-8cd405214fe4,164
4,twitter_delivery_stream-1-2021-12-30-17-40-03-9c7445d1-3438-4c6c-96df-4629d801bbd1,150
...,...,...
107,twitter_delivery_stream-1-2022-01-04-16-55-48-5b2ac7c5-1710-4a50-8886-be31f33778da,77
108,twitter_delivery_stream-1-2022-01-04-16-56-51-7f105f30-4f40-4093-a9e9-c4f0486b9896,89
109,twitter_delivery_stream-1-2022-01-04-16-57-52-98571cc0-a199-43ea-ad2e-bdac6c574c33,88
110,twitter_delivery_stream-1-2022-01-04-16-58-54-be868279-f0eb-4a50-8223-cdb34159aad6,99


CPU times: user 259 ms, sys: 47.5 ms, total: 307 ms
Wall time: 1.2 s


['id',
 'contributors',
 'created_at',
 'source',
 'in_reply_to_screen_name',
 'source_text',
 'place_id',
 'place_url',
 'place_place_type',
 'place_country_code',
 'place_country',
 'user_name',
 'user_screen_name',
 'user_followers',
 'user_friends',
 'user_listed',
 'user_favourites',
 'user_statuses',
 'user_protected',
 'user_verified',
 'user_joined',
 'user_location',
 'retweeted_tweet',
 'text',
 'file_name']

Show subset of columns for retweets

In [13]:
with pd.option_context("display.max_colwidth", 2_000):
    display(df.query("retweeted_tweet == 'yes'")[cols_to_show])

Unnamed: 0,created_at,user_joined,user_name,user_screen_name,text,file_name
478,2021-12-30 17:39:46+00:00,2014-12-21 22:37:43+00:00,Mae✨,Midnigth_City,if you were born while Pluto was a planet,twitter_delivery_stream-1-2021-12-30-17-39-01-45daac32-de3f-4189-b312-8cd405214fe4
780,2021-12-30 17:41:56+00:00,2021-11-07 00:41:45+00:00,problematiccontentenjoyer,probsaproblem1,if you were born while Pluto was a planet,twitter_delivery_stream-1-2021-12-30-17-41-03-b8becce2-f393-42a9-bc9c-f8847fb4a7d2
1516,2021-12-30 17:48:03+00:00,2012-02-11 20:29:25+00:00,pintobean🗝,smdeangelo18,if you were born while Pluto was a planet,twitter_delivery_stream-1-2021-12-30-17-47-10-e7c45fc0-d1a2-4843-beed-2489ad22b82f
1631,2021-12-30 17:49:01+00:00,2020-10-24 08:57:41+00:00,demjin,fuduhhhhhhhhh,if you were born while Pluto was a planet,twitter_delivery_stream-1-2021-12-30-17-48-10-31a1f4b1-30ef-4123-93f5-0b377dafcf2e
4828,2021-12-31 18:59:25+00:00,2021-07-17 11:47:11+00:00,josh,an0nym0us_eskeh,if you were born while Pluto was a planet,twitter_delivery_stream-1-2021-12-31-18-59-21-04ddb807-dfc3-4889-998c-2ffcb0ff10d6
4879,2021-12-31 19:00:05+00:00,2020-05-07 10:32:01+00:00,HAKOM TimeSeries,HakomTimeSeries,RT:The Dark Matter of Common Sense Is Not So Common v/,twitter_delivery_stream-1-2021-12-31-18-59-21-04ddb807-dfc3-4889-998c-2ffcb0ff10d6
7896,2022-01-04 16:28:01+00:00,2010-01-24 19:15:10+00:00,Nadeem Ansari FCIM F IDM,nadeemansary,"RT Well, this is amazing! It's an Actual Image of China’s Tianwen-1 Orbiting Mars captured by a smaller satellite that the orbiter released while orbiting the planet.",twitter_delivery_stream-1-2022-01-04-16-27-24-37dbe71d-2d30-4dce-9027-57f9a3f4a385
7966,2022-01-04 16:28:36+00:00,2014-09-09 23:17:37+00:00,Tennessee Wing Civil Air Patrol,TennesseeCAP,"RT CivilAirPatrol ""Cadets from our will be asking questions to astronauts currently in space. Tune in soon at",twitter_delivery_stream-1-2022-01-04-16-28-25-a43e3da5-1145-4e44-a59b-90018969f41c
8030,2022-01-04 16:29:14+00:00,2012-02-23 01:42:23+00:00,Eric V. Scott,ericvscott5,"RT CivilAirPatrol ""Cadets from our will be asking questions to astronauts currently in space. Tune in soon at",twitter_delivery_stream-1-2022-01-04-16-28-25-a43e3da5-1145-4e44-a59b-90018969f41c
9048,2022-01-04 16:38:48+00:00,2020-03-08 22:09:46+00:00,📔Lee the Kobold Librarian📔,Kobold_Servant,if you were born while Pluto was a planet,twitter_delivery_stream-1-2022-01-04-16-38-33-66ba9099-1f4f-441c-a25e-02882e53a63b


## Inspect raw data in the S3 bucket

At the end of the previous section, all raw data files collected during an hour were combined into a single CSV file. This was done using a custom Python module, as explained above.

In this section, we will briefly explore the raw data exported to the S3 bucket from the Firehose stream. We'll now show how each raw data file (each streamed Twitter record) can be converted into a `DataFrame` where each row corresponds to a single streamed record of Twitter data. The custom module used above handles this, but here we will briefly explore the first few processing steps that are encapsulated into that module.

Define the `boto3` S3 client

In [14]:
s3_client = boto3.client("s3", region_name=aws_region)

Use `boto3` to list files in the S3 bucket for one hour of tweets streamed using Kinesis Firehose and Tweepy
- we'll focus on files that were exported to an S3 folder during the 19:00 hour (UTC) on a single date (2021-12-31)

In [15]:
response_new = s3_client.list_objects_v2(
    Bucket=s3_bucket_name,
    Prefix=path_to_folder[1:] + "2021/12/31/19/",
    Delimiter="/",
)
with pd.option_context("display.max_colwidth", 2_000):
    display(
        pd.DataFrame.from_records(response_new["Contents"]).replace(
            {"ETag": '"'}, "", regex=True
        )
    )

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass
0,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-00-23-a2b41515-424e-44d7-b146-1d7ec4f87e47,2021-12-31 19:01:26+00:00,c64f8fa442b7f5d4915852806a221f9d,45440,STANDARD
1,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-01-24-0db1f3e6-292e-4ca1-ad48-f67c1272ce1e,2021-12-31 19:02:27+00:00,4e2023f345c02c806459c947e7914b24,48326,STANDARD
2,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-02-25-489245e0-c418-4d07-927f-2c5f258fa2d6,2021-12-31 19:03:28+00:00,6fcbc2acce8a6bbd411cbcaaf423e98e,49540,STANDARD
3,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-03-29-1e4e4d1e-fde2-45de-beb0-504a02189e0f,2021-12-31 19:04:31+00:00,f60e91bbd5c152197be5e25ef85a00b1,46082,STANDARD
4,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-04-30-2e72131b-96ed-448e-ac15-dcda3313e671,2021-12-31 19:05:33+00:00,4515363c49f4b8b9bc53cbda40dee9ba,53066,STANDARD
5,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-05-32-b265214c-8a04-4d4b-910a-418bb129ed58,2021-12-31 19:06:35+00:00,72e28226158634f1496b24f778de961d,42484,STANDARD
6,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-06-34-c4287ed5-121d-47a2-b085-2efa05c95f31,2021-12-31 19:07:37+00:00,de9c3c09e1bfac9e2bfb81aede7fd2af,47380,STANDARD
7,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-07-36-58902263-5fbf-4ff0-b9ec-27fe3765c39a,2021-12-31 19:08:38+00:00,ee0c021f3a7065fb6b0e20144de8565e,44098,STANDARD
8,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-08-37-c54a59d5-5fb1-4208-ba0b-4ae0d162ca7d,2021-12-31 19:09:39+00:00,d76281f28a049cb7b5d6e0c29b664614,39662,STANDARD
9,datasets/twitter/kinesis-demo/2021/12/31/19/twitter_delivery_stream-1-2021-12-31-19-09-39-748ac6f5-9b72-4249-bdc7-124acb69065f,2021-12-31 19:10:41+00:00,e26d005d8280faae9b58ed74d7fc3a99,50850,STANDARD


**Observations**
1. Each row corresponds to a file of twitter data streamed during a 60-second period (per the configuration of the AWS Kinesis Firehose). There are 58 such files.
2. There are multiple twitter (tweet) records in every file. At this point, it is not possible to determine how many records are stored in every file but that will be determined next using `boto3`.

Get the first file collected during the chosen hour (19:00 UTC) in `path_to_folder`

In [None]:
file = response_new["Contents"][0]
file

Get the file body and file name from the file object

In [17]:
file_body = s3_client.get_object(Bucket=s3_bucket_name, Key=file.get("Key"))[
    "Body"
].read()
file_name = os.path.basename(file["Key"])

Get the file name

In [None]:
print(file_name)

From the file body, get the number of tweets in this file

In [19]:
print(len(file_body.decode("utf-8").split("\n")))

86


**Observations**
1. The last row in the file is a new line (blank) in the file. This blank line is present since the `twitter_s3.py` streaming script writes every record so that it ends in a newline character (`\n`). When this was read into a `DataFrame` using pandas `read_csv` earlier, we could (optionally) specify the line terminator to be `\n`, meaning that the last line of the raw data file does **not** get accidentally picked up as a blank row in that `DataFrame`. However, here, we are working with the raw file body, which we split on the newline character. The last `\n` (the one at the end of the last streamed tweet) has nothing after it, so splitting on it leaves an empty string as the final element. This will manifest itself as a blank line (appearing as a blank string) at the end of **every** file.

   This is illustrated in a simple example below

   ```python
   single_tweet_record = "1\n2\n3\n"
   print(len(single_tweet_record.split("\n")), single_tweet_record.split("\n"))
   ```
   which gives the following output
   ```python
   4 ['1', '2', '3', '']
   ```
   
   where we are expecting three fields (which we get), but we also get a blank string due to the leftover newline character that cannot split any more lines (see [here](https://stackoverflow.com/a/34844548/4057186) for further discussion). This will be verified below for the data in S3.

First, show the full first tweet (and all user attributes) that were saved

In [20]:
first_record = file_body.decode("utf-8").split("\n")[0]
print(first_record.strip().split("\t"))

['1476991622084710403', 'None', 'None', 'None', 'None', 'False', '0', '0', '0', '0', 'False', 'False', 'Fri Dec 31 19:00:18 +0000 2021', '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'None', 'None', 'Twitter for Android', '', '', '', '', '', '', '', '', '[[]]', '{}', '', '', '', '', '', '', 'Pezhman', 'Pezhman921', '0', '1', '0', '101', '87', 'False', 'False', 'False', 'Fri Dec 24 17:12:22 +0000 2021', 'None', 'no', 'https://t.co/tZQX952m3C|https://t.co/MQFNdH7JdL', 'space|space|cryptocurrency|Helios', 'OKEx|SidusSpacewith|NASAand|MissionHelios', '2', '4', '4', " We are the First decentralized  community. The world's biggest media reported the collaboration of ____   We are Bridge between  and  .  $Helios"]


**Observations**
1. We can see that every element of this list is a string, as expected since we converted all streamed data (in `twitter_s3.py`) to strings before exporting each tweet *record* to S3 using Firehose.
2. As mentioned above, since we are splitting this list of lists using a newline character, every record except the last one does not end in a newline character (`\n`). Here, since we are showing the first such record and not the last one, we can see that the record does not end in a newline character.

Next, we will verify that, for every file, the last retrieved record is a blank string (resulting from the leftover newline character discussed above)

In [21]:
%%time
# Every raw file should end in "\n", so splitting its decoded body on "\n" must
# leave a blank string as the last element. After the loop,
# nested_list_of_records holds the records of the last file that was read.
for file in response_new["Contents"]:
    s3_object = s3_client.get_object(Bucket=s3_bucket_name, Key=file.get("Key"))
    file_body = s3_object["Body"].read()
    nested_list_of_records = file_body.decode("utf-8").split("\n")
    last_record = nested_list_of_records[-1]
    assert last_record == ""

CPU times: user 254 ms, sys: 2.99 ms, total: 257 ms
Wall time: 3.76 s


Next, the first three streamed Twitter records are shown below

In [22]:
nested_list_of_records[:3]

['1477006376647942156\tNone\tNone\tNone\tNone\tFalse\t0\t0\t0\t0\tFalse\tFalse\tFri Dec 31 19:58:56 +0000 2021\t<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>\tNone\tNone\tTwitter for iPhone\t\t\t\t\t\t\t\t\t[[]]\t{}\t\t\t\t\t\t\tAndrew Parris\tAndrewParris17\t111\t74\t1\t2986\t2262\tFalse\tFalse\tFalse\tSun Jun 02 20:24:34 +0000 2019\tHouston, TX\tno\thttps://t.co/fyF7CuEM6S\t\t\t1\t0\t0\tOne hell of a year 2021, the best year of my life. When I started the year I had no idea what was in store, and I definitely didn’t know I’d become an astronaut. This year I met the most incredible people, had experiences I never imagined possible, grew, and became even more me\t',
 '1477006377855901701\tNone\tNone\tNone\tNone\tFalse\t0\t0\t0\t0\tFalse\tFalse\tFri Dec 31 19:58:56 +0000 2021\t<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>\tNone\tNone\tTwitter for Android\t\t\t\t\t\t\t\t\t[[]]\t{}\t\t\t\t\t\t\tJoe Benigno\tjoe_be

We are now ready to split each record by a tab separator (`\t`) and the resulting list will correspond to the fields (columns) for the first three streamed records in our overall `DataFrame`, as shown below

In [23]:
# Split each of the first three records into its fields; the last item of every
# split is dropped so that the number of fields matches the number of headers
first_three_records = [
    record.split("\t")[:-1] for record in nested_list_of_records[:3]
]
df_first_three_records = pd.DataFrame(first_three_records, columns=headers)
# Raise the column limit so that every field is displayed
with pd.option_context("display.max_columns", 100):
    display(df_first_three_records)

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text
0,1477006376647942156,,,,,False,0,0,0,0,False,False,Fri Dec 31 19:58:56 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Andrew Parris,AndrewParris17,111,74,1,2986,2262,False,False,False,Sun Jun 02 20:24:34 +0000 2019,"Houston, TX",no,https://t.co/fyF7CuEM6S,,,1,0,0,"One hell of a year 2021, the best year of my l..."
1,1477006377855901701,,,,,False,0,0,0,0,False,False,Fri Dec 31 19:58:56 +0000 2021,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Joe Benigno,joe_benigno5,52,263,3,36748,30095,False,False,False,Sat Feb 13 19:39:10 +0000 2016,"Easthampton, MA",no,https://t.co/JeJRIY34vD,,,1,0,0,97 years ago today Edwin Hubble announced the ...
2,1477006381802688512,,,,,False,0,0,0,0,False,False,Fri Dec 31 19:58:57 +0000 2021,"<a href=""http://twitter.com/download/android"" ...",1.463635603065487e+18,NationNormie3,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Brandon Fortino,BLFortino,24,124,2,4027,1092,False,False,False,Tue Mar 13 20:05:30 +0000 2012,,no,,,NationNormie3|SciGuySpace,0,2,0,The Russian core segment is structurally fai...


**Notes**
1. Similar to splitting the file body by `\n`, which leaves a blank string at the end of the file, every record ends in a `\t`, so splitting a record by `\t` leaves a blank string as the last list item. For this reason, we slice the resulting list to exclude the last list item (the blank string left over after the trailing `\t` character).

As mentioned at the end of the previous section, the functionality to create this `DataFrame`, for every file, is encapsulated in a standalone Python module in `src/data/combine_data.py`. In that module, additional functionality is included to extract metadata about the files from the S3 bucket. This module was used at the end of the previous section to create such a `DataFrame`, for every hour's worth of streamed Twitter data, and export that `DataFrame` to a single CSV file to a subfolder in the S3 bucket at `datasets/twitter/kinesis-demo/csvs/` (giving one CSV file per hour).

The contents of this custom Python module are shown below

```python
# src/data/combine_data.py
import os
from datetime import datetime
from io import StringIO
from typing import Dict, List, Union

import boto3
import pandas as pd


def get_objects_in_one_s3_level(
    s3b_name: str, content: Union[Dict, str], region: str
) -> Dict:
    """List the storage objects found directly under one S3 prefix.

    Parameters
    ----------
    s3b_name : str
        name of the S3 bucket to list
    content : Union[Dict, str]
        either the prefix itself (str), or a dictionary whose "Prefix" key
        holds the prefix
    region : str
        AWS region name used to create the S3 client

    Returns
    -------
    Dict
        the unmodified response of a single `list_objects_v2` call made with
        "/" as the delimiter (no pagination is performed here)
    """
    client = boto3.client("s3", region_name=region)
    # The level to list is given either directly or wrapped in a dictionary
    if isinstance(content, str):
        level_prefix = content
    else:
        level_prefix = content.get("Prefix")
    # Using "/" as the delimiter restricts the listing to this single level
    return client.list_objects_v2(
        Bucket=s3b_name, Prefix=level_prefix, Delimiter="/"
    )


def get_data_metadata(file: Dict, s3_bucket_name: str, region: str) -> Dict:
    """Download one raw tweets file from S3 and return its body and name.

    Parameters
    ----------
    file : Dict
        dictionary describing a single S3 object; its "Key" entry (the full
        object key) is the only entry that is read
    s3_bucket_name : str
        name of the S3 bucket holding the object
    region : str
        AWS region name used to create the S3 client

    Returns
    -------
    Dict
        dictionary with keys
        - "file_body": contents returned by reading the object's body
        - "file_name": base name (last path component) of the object key

    Raises
    ------
    KeyError
        if `file` has no "Key" entry
    """
    # Read the key once, up front, so that a missing key fails here instead
    # of being sent to S3 as None
    object_key = file["Key"]
    s3_client = boto3.client("s3", region_name=region)
    body_stream = s3_client.get_object(Bucket=s3_bucket_name, Key=object_key)[
        "Body"
    ]
    # Always release the underlying stream, even if reading fails
    try:
        file_body = body_stream.read()
    finally:
        body_stream.close()
    return {"file_body": file_body, "file_name": os.path.basename(object_key)}


def get_attrs_extracted_from_tweet_text(
    row: pd.Series, attr_type: str = "hashtags"
) -> str:
    """Get attrs (hashtags or usernames) extracted from tweet text.

    Parameters
    ----------
    row : pd.Series
        single tweet (row of a DataFrame); the field
        `tweet_text_<attr_type>` is read from it and is expected to hold a
        '|'-separated string of attributes
    attr_type : str
        which extracted attribute to read ("hashtags" or "usernames")

    Returns
    -------
    str
        the attributes as a space-separated string with one leading space
        (so it can be appended directly to the tweet text), or an empty
        string if the field is missing (None/NaN), blank, or the string
        "nan"
    """
    value = row[f"tweet_text_{attr_type}"]
    # Missing values (None, NaN, pd.NA) carry no attributes
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    extracted = str(value)
    # Blank fields must not contribute a stray separator to the tweet text;
    # "nan" is the string form of a missing value and is also skipped
    if not extracted.strip() or extracted == "nan":
        return ""
    # Replace the pipe separators ('|') to give a single string of
    # space-separated attributes
    return " " + extracted.replace("|", " ")


def get_datetime_string() -> str:
    """Return the current local time formatted as `YYYYmmddHHMMSS`."""
    current_time = datetime.now()
    return f"{current_time:%Y%m%d%H%M%S}"


def get_hourly_data_metadata(
    data_list: List,
    headers: List,
    fpath: str,
    cols_to_use: List[str],
    get_metadata_agg: bool = False,
) -> List[pd.DataFrame]:
    """Load raw tweets data and file metadata into DataFrames.

    Parameters
    ----------
    data_list : List
        list of dictionaries, one per raw data file, each with keys
        "file_name" and "file_body" (the file contents as bytes)
    headers : List
        field names of a single record; must include "text",
        "tweet_text_hashtags" and "tweet_text_usernames", which are read
        below
    fpath : str
        S3 prefix of the hourly sub-folder; after its first three path
        components it must contain `<year>/<month>/<day>/<hour>` (a trailing
        "/" is optional)
    cols_to_use : List[str]
        columns of the tweets data to keep ("file_name" is always added)
    get_metadata_agg : bool
        whether to also aggregate the per-record metadata by file name

    Returns
    -------
    List[pd.DataFrame]
        `[df, df_metadata, df_metadata_agg]`, where
        - `df` holds well-formed records (restricted to `cols_to_use` plus
          "file_name"), with the extracted hashtags and usernames appended
          to "text"
        - `df_metadata` holds one row per non-empty record, including a
          "malformed_values" flag
        - `df_metadata_agg` holds the per-file aggregation, or is an empty
          DataFrame if no aggregation was requested or no records were found
    """
    year, month, day, hour = fpath.split("/", 3)[-1].split("/", 3)
    # Drop the trailing separator (if any) without cutting into the hour
    # itself when the prefix does not end in "/"
    hour = hour.rstrip("/")
    num_fields = len(headers)
    metadata_columns = [
        "file",
        "file_name",
        "encoded_length",
        "values_index",
        "len_values",
        "malformed_values",
        "file_year",
        "file_month",
        "file_day",
        "file_hour",
    ]
    dfs = []
    dfs_metadata = []
    # Loop over list of dictionaries, where each dict corresponds to a
    # separate file and contains keys: file_name, file_body (file contents)
    for k, raw_data_contents in enumerate(data_list, start=1):
        file_name = raw_data_contents["file_name"]
        file_body = raw_data_contents["file_body"]
        encoded_length = len(file_body)
        # Decode file contents and split by \n, giving one string per tweet
        single_buffer_data_strings = file_body.decode("utf-8").split("\n")
        all_buffer_contents = []
        for q, data_string in enumerate(single_buffer_data_strings, start=1):
            # Skip empty strings (eg. the one left after the final \n)
            if not data_string:
                continue
            # Split each record by \t in order to get values for each field
            # NOTE(review): strip() also removes leading/trailing tabs, so a
            # record whose first or last fields are empty yields fewer values
            # and is flagged as malformed - confirm this is intended
            values = data_string.strip().split("\t")
            is_malformed = len(values) != num_fields
            # Record metadata for every non-empty record
            dfs_metadata.append(
                {
                    "file": k,
                    "file_name": file_name,
                    "encoded_length": encoded_length,
                    "values_index": q,
                    "len_values": len(values),
                    "malformed_values": is_malformed,
                    "file_year": year,
                    "file_month": month,
                    "file_day": day,
                    "file_hour": hour,
                }
            )
            # Keep the record only if it has exactly the expected number of
            # fields
            if not is_malformed:
                all_buffer_contents.append(values)
        # Convert nested list of tweet data into DataFrame and append raw data
        # filename as separate column
        dfs.append(
            pd.DataFrame(all_buffer_contents, columns=headers).assign(
                file_name=file_name
            )
        )
    # Vertically concatenate list of DFs of data in order to get single DF of
    # tweets retrieved per hour (pd.concat raises on an empty list, so fall
    # back to an empty DF with the expected columns)
    if dfs:
        df = pd.concat(dfs, ignore_index=True)
    else:
        df = pd.DataFrame(columns=list(headers) + ["file_name"])
    # Combine tweet text with space-separated hashtags and usernames that
    # were extracted from the text of the tweet
    # - eg. 'tweet text goes here' will be combined with
    #       (hashtags string) 'hashtag1 hashtag2 hashtag3' and
    #       (user names string) 'username1 username2 username3'
    for attr_type in ["hashtags", "usernames"]:
        if df.empty:
            # Nothing to extract; still create the column used below
            df[f"{attr_type}_str"] = ""
        else:
            df[f"{attr_type}_str"] = df.apply(
                get_attrs_extracted_from_tweet_text,
                axis=1,
                attr_type=attr_type,
            )
    df["text"] = df["text"] + df["hashtags_str"] + df["usernames_str"]
    # Slice vertically concatenated data to select required columns
    all_cols_to_use = cols_to_use + ["file_name"]
    df = df[all_cols_to_use].dropna()
    # Build single DF of tweets metadata per hour (columns are given
    # explicitly so that they exist even if no records were found)
    df_metadata = pd.DataFrame(dfs_metadata, columns=metadata_columns)
    # (optional) Aggregate metadata by raw data file name
    if get_metadata_agg and not df_metadata.empty:
        # NOTE(review): num_valid_records is the total number of valid
        # records across all files, repeated on every per-file row - confirm
        # whether a per-file count was intended
        df_metadata_agg = (
            df_metadata.groupby(["file_name"], as_index=False)
            .agg(
                {
                    "encoded_length": "min",
                    "values_index": "max",
                    "len_values": "min",
                    "malformed_values": "sum",
                }
            )
            .assign(num_valid_records=len(df))
        )
    else:
        # if no aggregation wanted (or nothing to aggregate), return empty
        # DataFrame
        df_metadata_agg = pd.DataFrame()
    return [df, df_metadata, df_metadata_agg]


def create_folder_in_s3_bucket(
    region: str,
    s3_bucket_name: str,
    folder_name: str = "csvs",
    path_to_folder: str = "datasets/twitter/kinesis-demo/",
) -> None:
    """Create folder in S3 bucket, if it does not exist.

    Parameters
    ----------
    region : str
        AWS region name used to create the S3 client
    s3_bucket_name : str
        name of the S3 bucket in which to create the folder
    folder_name : str
        name of the folder to create
    path_to_folder : str
        prefix under which the folder is created; leading and trailing "/"
        are ignored

    Returns
    -------
    None
        the outcome (found or created) is printed
    """
    s3_client = boto3.client("s3", region_name=region)
    # Build the key of the folder, eg. 'datasets/twitter/kinesis-demo/csvs/'
    key_parts = [path_to_folder.strip("/"), folder_name.strip("/")]
    folder_key = "/".join(part for part in key_parts if part) + "/"
    # Look for any object stored under the folder's key. The trailing "/" in
    # the prefix prevents matching sibling folders that merely start with
    # the same name (eg. 'csvs-old/')
    folders_response_result = s3_client.list_objects_v2(
        Bucket=s3_bucket_name, Prefix=folder_key, MaxKeys=1
    )
    if folders_response_result.get("Contents"):
        print(
            f"Found existing folder {folder_name} in specified S3 bucket. "
            "Did nothing."
        )
    else:
        # Create object (with no body), which will result in an empty folder
        s3_client.put_object(Bucket=s3_bucket_name, Body="", Key=folder_key)
        print(f"Created folder {folder_name} in {s3_bucket_name}.")


def get_existing_csv_files_list(
    s3_bucket_name: str, region: str, prefix: str
) -> List[str]:
    """Return the keys of all objects in an S3 bucket that share a prefix.

    Parameters
    ----------
    s3_bucket_name : str
        name of the S3 bucket to search
    region : str
        AWS region name used to create the S3 resource
    prefix : str
        key prefix that the returned objects must start with

    Returns
    -------
    List[str]
        key (string) of every object found under the prefix
    """
    bucket = boto3.resource("s3", region_name=region).Bucket(s3_bucket_name)
    # Each object found exposes its full key through the .key attribute
    return [
        object_found.key for object_found in bucket.objects.filter(Prefix=prefix)
    ]


def save_df_to_csv_on_s3(
    df: pd.DataFrame,
    bucket_name: str,
    filepath: str,
    region: str,
    df_type: str = "metadata",
) -> None:
    """Upload a DataFrame, serialized as CSV text, to a key in an S3 bucket.

    Parameters
    ----------
    df : pd.DataFrame
        data to export (written with a header row and without the index)
    bucket_name : str
        name of the destination S3 bucket
    filepath : str
        key under which the CSV is stored in the bucket
    region : str
        AWS region name used to create the S3 client
    df_type : str
        label for the kind of data, used only in the printed message
    """
    s3_client = boto3.client("s3", region_name=region)
    # Without a target path, to_csv() returns the CSV contents as a string
    csv_text = df.to_csv(index=False)
    s3_client.put_object(Bucket=bucket_name, Body=csv_text, Key=filepath)
    num_rows = len(df)
    print(f"- Copied {num_rows:,} rows of {df_type} to bucket at {filepath}.")


def save_data_and_metadata_to_s3_csv(
    subfolder_path: str,
    existing_csv_files: List[str],
    s3_bucket_name: str,
    headers: List[str],
    content: Union[Dict, str],
    path_to_csvs_folder: str,
    region: str,
    cols_to_use: List[str],
    aggregate_metadata: bool = False,
) -> None:
    """Extract tweets data and metadata and export to csvs/ in S3 bucket.

    Nothing is exported if a CSV for the same hour already exists, or if the
    hourly sub-folder contains no raw data files.

    Parameters
    ----------
    subfolder_path : str
        S3 prefix of the hourly sub-folder, eg.
        'datasets/twitter/kinesis-demo/2021/12/30/17/'
    existing_csv_files : List[str]
        keys of the CSV files that already exist in the S3 bucket
    s3_bucket_name : str
        name of the S3 bucket holding the raw data and the CSVs
    headers : List[str]
        field names of a single raw record
    content : Union[Dict, str]
        prefix of the hourly sub-folder, either as a string or as a
        dictionary whose "Prefix" key holds it
    path_to_csvs_folder : str
        prefix (ending in "/") under which the csvs/ sub-folder is located
    region : str
        AWS region name used for all S3 access
    cols_to_use : List[str]
        columns of the tweets data to export
    aggregate_metadata : bool
        passed to `get_hourly_data_metadata()`; the aggregated metadata
        itself is not exported
    """
    # Concatenate year, month, day and hour specified by subfolder_path
    # - 'datasets/twitter/kinesis-demo/2021/12/30/17/' becomes 'hc2021123017'
    ctime_str = "hc" + subfolder_path.split("/", 3)[-1].rstrip("/").replace(
        "/", ""
    )
    # Get list of hourly CSVs of data and metadata that already exist in
    # csvs/ sub-folder in S3 bucket
    existing_matching_csv_files = [
        f for f in existing_csv_files if ctime_str in f
    ]
    if existing_matching_csv_files:
        # Since hourly CSV exists in CSVs/ sub-folder in S3 bucket, do nothing
        fpath = existing_matching_csv_files[0]
        print(f"Found CSV file in {fpath}. Did nothing.")
        return
    # No hourly CSV exists for this sub-folder, so list its raw data files.
    # The response has no "Contents" key when nothing is stored at this level
    raw_files = get_objects_in_one_s3_level(
        s3_bucket_name,
        content,
        region,
    ).get("Contents", [])
    if not raw_files:
        print(f"Did not find raw data files in {subfolder_path}. Did nothing.")
        return
    # Get list of dictionaries (with file name and file body) in S3 bucket
    data_list = [
        get_data_metadata(raw_file, s3_bucket_name, region)
        for raw_file in raw_files
    ]
    # Accept the prefix in either of the forms supported when listing
    prefix = content if isinstance(content, str) else content.get("Prefix")
    # Get DFs of tweets data and metadata from file attributes in list of
    # dicts
    df, df_metadata, _ = get_hourly_data_metadata(
        data_list,
        headers,
        prefix,
        cols_to_use,
        aggregate_metadata,
    )
    # Change datetime format in DF of data
    for c in ["created_at", "user_joined"]:
        df[c] = pd.to_datetime(df[c])
    # Get string with current datetime
    ftime_str = "s" + get_datetime_string()
    # Save DFs of data and metadata as CSVs to S3 bucket under the CSVs/
    # sub-folder
    for df_obj, suffix, file_type in zip(
        [df, df_metadata],
        ["tweets_", "tweets_metadata_"],
        ["data", "metadata"],
    ):
        fpath = (
            f"{path_to_csvs_folder}csvs/{suffix}"
            f"{len(data_list)}_{ctime_str}_{ftime_str}.csv"
        )
        print(f"Did not find {file_type} CSV file in {fpath}. ")
        save_df_to_csv_on_s3(df_obj, s3_bucket_name, fpath, region, file_type)
```