In [1]:
from pathlib import Path

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from ams.config import constants
from ams.services import file_services

Setting up logging...
Will use logging path: C:\Users\Chris\workspaces\data\logs\alpha_media_signal


In [2]:
# Gather the parquet files under <TWITTER_OUTPUT_RAW_PATH>/deduped/main.
# NOTE(review): use_dir_recursion=True presumably also searches sub-folders —
# confirm against file_services.list_files.
input_path = Path(constants.TWITTER_OUTPUT_RAW_PATH) / "deduped" / "main"
files = file_services.list_files(
    str(input_path),
    ends_with=".parquet",
    use_dir_recursion=True,
)

In [3]:
%%time
import pandas as pd
import time
import dask
import dask.dataframe as dd
from dask.dataframe import from_pandas

dask.config.set(scheduler='processes')

all_ddf = []
for f in files:
    pdf = pd.read_parquet(f)
    print("Converting Pandas dataframe to Dask DF ...")
    ddf = from_pandas(pdf, npartitions=3)
    all_ddf.append(ddf)

print("Concatenating Dask DFs ...")
ddf = dd.concat(all_ddf,axis=0)

ddf.columns

Converting Pandas dataframe to Dask DF ...
Converting Pandas dataframe to Dask DF ...
Converting Pandas dataframe to Dask DF ...
Converting Pandas dataframe to Dask DF ...
Concatenating Dask DFs ...
Wall time: 1min 33s


Index(['truncated', 'user_translator_type', 'user_profile_link_color',
       'user_time_zone', 'user_profile_banner_url',
       'user_default_profile_image', 'is_quote_status', 'user_url', 'lang',
       'user_description', 'retweet_count', 'user_created_at',
       'user_following', 'user_geo_enabled', 'id', 'user_contributors_enabled',
       'user_id', 'user_profile_background_image_url',
       'user_profile_background_color', 'entities_urls_1',
       'entities_user_mentions_3', 'user_favourites_count', 'user_location',
       'user_profile_sidebar_fill_color', 'user_utc_offset',
       'user_notifications', 'user_lang', 'user_listed_count',
       'user_default_profile', 'user_profile_image_url_https',
       'user_profile_sidebar_border_color', 'possibly_sensitive',
       'user_is_translation_enabled', 'user_profile_background_tile',
       'user_protected', 'entities_urls_0', 'entities_user_mentions_1',
       'entities_urls_2', 'in_reply_to_status_id', 'user_statuses_count'

In [4]:
from typing import List

# One shared VADER analyzer instance, reused for every text that gets scored.
analyzer = SentimentIntensityAnalyzer()


def add_senti(text) -> List[float]:
    """Return the VADER polarity scores for ``text`` as a fixed-order list.

    Args:
        text: The text to score; it is passed unchanged to
            ``analyzer.polarity_scores``.

    Returns:
        The scores in the order ``[neg, neu, pos, compound]``, taken from the
        dict returned by ``analyzer.polarity_scores``.
    """
    result = analyzer.polarity_scores(text)
    # Fixed ordering matters: downstream code indexes this list by position.
    return [result[key] for key in ("neg", "neu", "pos", "compound")]

# Score each row's nlp_text once; every sent_list element is the list
# [neg, neu, pos, compound] produced by add_senti.
ddf = ddf.assign(sent_list=ddf.nlp_text.map(add_senti))

# Fan the list out into one column per score.
ddf = ddf.assign(
    f22_sentiment_neg=ddf.sent_list.map(lambda x: x[0]),
    f22_sentiment_neu=ddf.sent_list.map(lambda x: x[1]),
    f22_sentiment_pos=ddf.sent_list.map(lambda x: x[2]),
    f22_sentiment_compound=ddf.sent_list.map(lambda x: x[-1]),
)

# drop() returns a new dataframe rather than modifying in place, so the result
# has to be re-bound; otherwise the helper column stays in ddf.
ddf = ddf.drop("sent_list", axis=1)

start = time.time()
# persist() runs the task graph and keeps the computed partitions behind ddf,
# so later steps reuse them. A bare compute() would throw the result away and
# force the whole sentiment pass to run again on the next use of ddf.
ddf = ddf.persist()
end = time.time()
print("Elapsed: " + str(end - start))

Elapsed: 938.7162492275238


In [6]:
# Output goes under <TWITTER_OUTPUT_RAW_PATH>/sent_drop/staging.
# NOTE(review): create_unique_folder_name presumably returns a not-yet-used
# "sd"-prefixed folder inside `parent` — confirm against file_services.
parent = Path(constants.TWITTER_OUTPUT_RAW_PATH) / 'sent_drop' / "staging"
sent_drop_path = file_services.create_unique_folder_name(parent, prefix="sd")

# Write the Dask dataframe as snappy-compressed parquet using the pyarrow engine.
ddf.to_parquet(
    path=str(sent_drop_path),
    engine="pyarrow",
    compression="snappy",
)