# Extract Sentiment from Filtered Tweets by Hour

In [1]:
# Load the `lab_black` extension (auto-formats executed cells with black).
%load_ext lab_black
# Load `autoreload` and set mode 2 so imported modules are reloaded before
# each cell runs, picking up edits to local .py files without a kernel restart.
%load_ext autoreload
%autoreload 2

In [8]:
from datetime import datetime
from glob import glob
from typing import List, Optional

import pandas as pd

## About

Extract sentiment from the text of each tweet using pre-trained transformer models.

## User Inputs

In [3]:
# Folder that is scanned for the `*.parquet.gzip` tweet files to process.
# NOTE(review): the name says "raw" but the path points at "data/processed" —
# confirm which one is intended.
raw_data_folder = "data/processed"

In [11]:
# glob() returns matches in arbitrary, filesystem-dependent order, so sort them
# to make the choice of "first" file reproducible across machines and runs.
parquet_paths = sorted(glob(f"{raw_data_folder}/*.parquet.gzip"))
if not parquet_paths:
    # Fail early with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No '*.parquet.gzip' files found in folder {raw_data_folder!r}"
    )
# Only the first file is processed for now; keep it wrapped in a list so the
# downstream loop works unchanged when more files are added.
files_list = parquet_paths[:1]

In [9]:
def remove_leading_whitespace(df: pd.DataFrame) -> pd.DataFrame:
    """Strip leading whitespace from the ``text`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Tweets, with a string column named ``text``.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the same rows, index and columns as ``df``, in
        which every ``text`` value has its leading whitespace removed. The
        input frame is left untouched, so re-running a cell that calls this
        function always starts from the same data.
    """
    # `assign` works on a copy, which avoids mutating the caller's frame (and
    # the SettingWithCopyWarning raised when `df` is a slice of another frame).
    return df.assign(text=df["text"].str.lstrip())


def drop_duplicate_tweets(
    df: pd.DataFrame, subset: Optional[List[str]] = None
) -> pd.DataFrame:
    """Drop rows that duplicate an earlier row on the given columns.

    Parameters
    ----------
    df : pd.DataFrame
        Tweets to de-duplicate.
    subset : list of str, optional
        Columns used to decide whether two rows are duplicates. ``None`` (the
        default) means ``["text"]``, i.e. tweets with identical text.

    Returns
    -------
    pd.DataFrame
        A new DataFrame keeping the first occurrence of each duplicate group.
        The original index labels are preserved (the index is not reset).

    Notes
    -----
    Prints the number of rows that were dropped.
    """
    # Resolve the default here rather than in the signature, so no mutable
    # list object is shared between calls.
    if subset is None:
        subset = ["text"]
    df_no_dups = df.drop_duplicates(subset=subset)
    num_rows_dropped = len(df) - len(df_no_dups)
    print(f"Dropped {num_rows_dropped:,} duplicated tweets from raw data")
    return df_no_dups

## Extract Sentiment from Hourly Filtered Data

In [13]:
%%time
for q, f in enumerate(files_list):
    start = datetime.now()
    print(
        "Extracting Sentiment from Tweets - Starting time = "
        f"{start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
    )
    df_raw = pd.read_parquet(f, columns=['text'])
    df = (
        df_raw.pipe(remove_leading_whitespace)
        .pipe(drop_duplicate_tweets, subset=['text'])
    )
    end = datetime.now()
    duration = (end - start).total_seconds()
    print(
        "Done extracting sentiment at "
        f"{end.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} ({duration:.3f} seconds)."
    )
    if q < len(files_list)-1:
        print()
df

Extracting Sentiment from Tweets - Starting time = 2022-08-13 17:02:31.859...
Dropped 813 duplicated tweets from raw data
Done extracting sentiment at 2022-08-13 17:02:31.866 (0.007 seconds).
CPU times: user 7.99 ms, sys: 0 ns, total: 7.99 ms
Wall time: 6.67 ms


Unnamed: 0,text
0,Huge day ahead for the NASA/ESA/CSA with the ...
1,"With today's clearing skies, the NASA MODIS sa..."
3,"By the way, the clips I’m posting each day of ..."
4,"Hubble had quite a year, but perhaps one its b..."
5,OH MY GOODNESS!!It's there!A semi clear sky to...
...,...
1127,"""Our goal is to create a safe &amp; transparen..."
1130,"It went up, cause if she has to run, there is ..."
1131,This has been my year in astronomy and In 20...
1134,"Come on man, Arecibo was long defunct and they..."
