In [1]:
import initialize_notebook

In [2]:
import sys
from pathlib import Path
from typing import Dict

from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, Row, StructField, DoubleType, ArrayType, StringType, FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from ams.config import constants
from ams.services import file_services
from ams.services import spark_service
from ams.utils import sentiment
import findspark

# Locate the local Spark installation and make it visible to this process.
# NOTE(review): the pyspark imports above have already executed by this point;
# findspark.init() is normally called *before* importing pyspark — confirm the
# ordering is intentional before relying on it in a fresh environment.
findspark.init()
# Session shared by every cell below; the string argument is passed to the
# project's spark_service helper (presumably the Spark app name — TODO confirm).
spark = spark_service.get_or_create('assign_sentiment_and_stock')

Setting up logging...
Will use logging path: C:\Users\Chris\workspaces\data\logs\alpha_media_signal


In [3]:
# Sub-folder (under constants.DATA_PATH) that holds all twitter artefacts.
twitter_folder = 'twitter'

# Input for this notebook: one specific timestamped drop of flattened tweets.
flat_tweet_path = (
    Path(constants.DATA_PATH)
    / twitter_folder
    / 'flattened_drop'
    / "tweets_flat_2020-10-25_16-36-14-314.4"
)

In [4]:
from pyspark.sql import functions as F
import time

# Time the load of the flattened tweets. The count() call is a Spark action,
# so it is included in the measured interval.
start = time.time()
df_red = spark.read.csv(path=str(flat_tweet_path), header=True)
loaded_rows = df_red.count()
print(f'df_red count: {loaded_rows}')
end = time.time()

print(f'Total load time; {end - start} seconds')

df_red count: 930308
Total load time; 8.728724241256714 seconds


In [5]:
# Sanity check: number of loaded tweets tagged with the DKNG ticker.
df_red.filter(F.col("f22_ticker") == "DKNG").count()

0

In [6]:
%%time

analyzer = SentimentIntensityAnalyzer()


def get_sentiment_intensity_score(text: str) -> Dict:
    """Score ``text`` with the module-level VADER ``analyzer``.

    Returns whatever ``analyzer.polarity_scores`` returns; the UDF schema
    below reads its ``neg``, ``neu``, ``pos`` and ``compound`` entries.
    """
    return analyzer.polarity_scores(text)


# The struct fields are declared as strings, so the f22_sentiment_* columns
# derived below are string-typed.
sent_udf = udf(get_sentiment_intensity_score, StructType(fields=[StructField("neg", StringType()),
                                                                           StructField("neu", StringType()),
                                                                           StructField("pos", StringType()),
                                                                           StructField("compound", StringType())
                                                                           ]))

start = time.time()
# Score every tweet that has text, then flatten the struct into one column per
# score. Each f22_sentiment_* column is taken from the struct field of the SAME
# name: previously 'pos' was filled from 'neg' and 'neg' from 'pos', which
# inverted the polarity labels in the persisted data.
df_sent = df_red.where(F.col('text').isNotNull()).withColumn('tmp_sentiment', sent_udf(F.col('text')))\
                .withColumn('f22_sentiment_pos', F.col('tmp_sentiment.pos'))\
                .withColumn('f22_sentiment_neu', F.col('tmp_sentiment.neu'))\
                .withColumn('f22_sentiment_neg', F.col('tmp_sentiment.neg'))\
                .withColumn('f22_sentiment_compound', F.col('tmp_sentiment.compound'))\
                .drop('tmp_sentiment')

df_final = df_sent
# NOTE(review): this count() does not read any sentiment column, so Spark may
# be able to skip the UDF entirely; the rows-per-second figure printed below
# may therefore not reflect the real scoring cost — confirm before quoting it.
count = df_sent.count()
end = time.time()

sent_per_sec = count / (end - start)

print(f'{sent_per_sec} per second.')

# Keep only rows for which a compound score was produced.
df_tmp = df_sent.where(F.col("f22_sentiment_compound").isNotNull())

print(f'Not null found: {df_tmp.count()} rows')

690139.4404860908 per second.
Not null found: 930308 rows
Wall time: 15.2 s


In [7]:
%%time

from ams.services import dataframe_services

# Destination folder for the sentiment-scored tweets.
sent_drop_path = Path(constants.DATA_PATH) / twitter_folder / 'sent_drop'

# Write the scored, non-null rows out as CSV via the project helper.
dataframe_services.persist_dataframe_as_csv(
    df=df_tmp,
    output_drop_folder_path=sent_drop_path,
    prefix='tweet_sent',
)

C:\Users\Chris\workspaces\data\twitter\sent_drop\tweet_sent_2020-10-25_17-01-21-632.26
Wall time: 1min 1s
