In [74]:
from pathlib import Path
from typing import Dict

import findspark
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from ams.config import constants
from ams.services import file_services
from ams.services import spark_service

In [75]:
input_path = Path(constants.TWITTER_OUTPUT_RAW_PATH, "deduped", "main")
files = file_services.list_files(str(input_path), ends_with=".parquet", use_dir_recursion=True)

In [92]:
%%time
from pyspark.sql import functions as F
import gc
import pandas as pd
import time
import dask
import dask.dataframe as dd
from dask.dataframe import from_pandas

dask.config.set(scheduler='processes')

all_ddf = []
for f in files:
    pdf = pd.read_parquet(f)
    print("Converting Pandas dataframe to Dask DF ...")
    ddf = from_pandas(pdf, npartitions=3)
    all_ddf.append(ddf)

print("Concatenating Dask DFs ...")
ddf = dd.concat(all_ddf,axis=0)

ddf.columns

Converting Pandas dataframe to Dask DF ...
Converting Pandas dataframe to Dask DF ...
Converting Pandas dataframe to Dask DF ...


MemoryError: 

In [89]:
from typing import List
analyzer = SentimentIntensityAnalyzer()

def add_senti(text) -> List[str]:
    result = analyzer.polarity_scores(text)
    return [result["neg"], result["neu"], result["pos"], result["compound"]]

ddf = ddf.assign(sent_list = ddf.nlp_text.map(lambda x: add_senti(x)))
ddf = ddf.assign(f22_sentiment_neg = ddf.sent_list.map(lambda x: x[0]))
ddf = ddf.assign(f22_sentiment_neu = ddf.sent_list.map(lambda x: x[1]))
ddf = ddf.assign(f22_sentiment_pos = ddf.sent_list.map(lambda x: x[2]))
ddf = ddf.assign(f22_sentiment_compound = ddf.sent_list.map(lambda x: x[-1]))
ddf.drop("sent_list", axis=1)

start = time.time()
ddf.compute()
end = time.time()
print("Elapsed: " + str(end - start))

Elapsed: 2762.4644091129303


In [87]:
print(ddf.shape[0].compute())

8064206


In [90]:
parent = Path(constants.TWITTER_OUTPUT_RAW_PATH, 'sent_drop', "staging")
sent_drop_path = file_services.create_unique_folder_name(parent, prefix="sd")

ddf.to_parquet(path=str(sent_drop_path), engine="pyarrow", compression="snappy")