In [1]:
import sys

# Extra directories that must be importable from this notebook.
# NOTE(review): presumably the project root that contains the `ams` package - confirm.
paths_to_add = ['/home/jovyan/work']

for extra_path in paths_to_add:
    # Append only when missing so re-running the cell never duplicates entries.
    already_listed = extra_path in sys.path
    if not already_listed:
        sys.path.append(extra_path)

print(sys.path)

['', '/home/jovyan/work/ams/notebooks/twitter', '/usr/local/spark/python', '/usr/local/spark/python/lib/py4j-0.10.7-src.zip', '/opt/conda/lib/python37.zip', '/opt/conda/lib/python3.7', '/opt/conda/lib/python3.7/lib-dynload', '/opt/conda/lib/python3.7/site-packages', '/opt/conda/lib/python3.7/site-packages/IPython/extensions', '/home/jovyan/.ipython', '/home/jovyan/work']


In [2]:
from ams.services import spark_service
from pyspark.sql import SparkSession, udf
from pyspark.sql.functions import udf, struct
from ams.services import twitter_service
from pyspark.sql import functions as F
from pathlib import Path
from pyspark.sql.types import StringType, StructType, StructField, BooleanType, MapType, ArrayType, Row
import json
from typing import Dict, List
import re

# Obtain the Spark session from the project's spark_service helper, passing the
# application name used for this notebook.
spark = spark_service.get_or_create(app_name='twitter_flatten')

# Reach the JVM-side log4j package through the Spark context's `_jvm` gateway
# (a private attribute) and create a logger named after this module.
sc = spark.sparkContext
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
LOGGER.info("pyspark script logger initialized")

Setting up logging...
Will use logging path: /home/jovyan/work/data/logs/alpha_media_signal


In [3]:
# Root directory for the data files used by this notebook.
data_path = Path('/home/jovyan/work/data/')

# Sub-folder of data_path to process; the commented-out line below switches to
# an alternative folder name.
twitter_folder = 'twitter'
# twitter_folder = 'twitter_test'

# Directory the raw tweet files are read from (used by the spark.read.json call).
file_path = Path(data_path, twitter_folder, 'raw_drop', 'stage')

In [4]:
from ams.services import schema_service

# Ask schema_service for the Twitter schema, pointing it at a sample tweet file.
# NOTE(review): tweet_schema is not referenced by any later cell visible in this
# notebook (spark.read.json is called without a schema) - confirm whether it
# should be passed to the reader.
sample_tweet_path = Path("/home/jovyan/work/resources/sample_tweet.json")
tweet_schema = schema_service.get_twitter_schema(spark=spark, twitter_sample_path=sample_tweet_path)

In [5]:
import re

# HTML entity that stands in for a literal comma inside cleaned text.
entity_comma = '&#44;'
# Matches a single carriage-return or line-feed character.
line_ending_pattern = re.compile("[\r\n]")


def clean_text(text:str):
    """Strip line breaks from ``text`` and entity-encode its commas.

    Args:
        text: The value to clean; may be ``None`` or empty.

    Returns:
        ``text`` unchanged when it is ``None`` or empty; otherwise a copy with
        every CR/LF character removed and every ``,`` replaced by
        ``entity_comma``.
    """
    # Nothing to clean: hand the input back untouched.
    if text is None or len(text) == 0:
        return text

    without_breaks = line_ending_pattern.sub('', text)
    return without_breaks.replace(",", entity_comma)
# Spark UDF wrapper around clean_text, declared to return a string column.
clean_text_udf = udf(clean_text, StringType())

def get_cashtag_info(ticker: str, has_cashtag: bool, ticker_in_text: bool) -> Dict:
    """Bundle the three per-ticker match attributes into a dictionary.

    Args:
        ticker: Ticker symbol that was matched.
        has_cashtag: Flag stored under the ``has_cashtag`` key.
        ticker_in_text: Flag stored under the ``ticker_in_text`` key.

    Returns:
        A new dict with the keys ``ticker``, ``has_cashtag`` and
        ``ticker_in_text`` (in that order).
    """
    info = dict(ticker=ticker, has_cashtag=has_cashtag, ticker_in_text=ticker_in_text)
    return info


In [7]:
# Read every *.txt file under file_path as JSON; no explicit schema is passed.
df_init = spark.read.json(str(file_path) + "/*.txt")

# df_init = df_init.limit(100).repartition(25)
# count() is an action, so this line runs a Spark job just to report the size.
print(f'Number of rows: {df_init.count()}')

Number of rows: 775817


In [8]:
# Ticker search data supplied by twitter_service; the cashtag UDF later indexes
# each entry as s[0] (ticker) and s[1] (name).
search_tuples = twitter_service.get_ticker_searchable_tuples()

print(f'number of search tuples: {len(search_tuples)}')

number of search tuples: 8601


In [9]:
# Keep a single row per tweet id.
df_unduped = df_init.dropDuplicates(['id'])
# count() forces evaluation; printed to show how many rows remain.
print(df_unduped.count())

773369


In [10]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType

# F.col('place.full_name').alias('place_full_name').cast(StringType()),

# Columns kept when flattening a tweet. Top-level fields are selected by name;
# nested fields (entities.*, metadata.*, user.*, place.*) are pulled up to the
# top level and aliased with a prefix. Only array elements 0-3 of
# entities.user_mentions and entities.urls are kept, each cast to a string.
sel_columns = ['created_at',
'id',
'text',
'truncated',
'source',
'in_reply_to_status_id',
'in_reply_to_user_id',
'in_reply_to_screen_name',
'contributors',
'is_quote_status',
'retweet_count',
'favorite_count',
'retweeted',
'possibly_sensitive',
'lang',
F.col('entities.user_mentions')[0].alias('entities_user_mentions_0').cast(StringType()),
F.col('entities.user_mentions')[1].alias('entities_user_mentions_1').cast(StringType()),
F.col('entities.user_mentions')[2].alias('entities_user_mentions_2').cast(StringType()),
F.col('entities.user_mentions')[3].alias('entities_user_mentions_3').cast(StringType()),
F.col('entities.urls')[0].alias('entities_urls_0').cast(StringType()),
F.col('entities.urls')[1].alias('entities_urls_1').cast(StringType()),
F.col('entities.urls')[2].alias('entities_urls_2').cast(StringType()),
F.col('entities.urls')[3].alias('entities_urls_3').cast(StringType()),
F.col('metadata.iso_language_code').alias('metadata_iso_language_code'),
F.col('metadata.result_type').alias('metadata_result_type'),
F.col('user.id').alias('user_id'),
F.col('user.name').alias('user_name'),
F.col('user.screen_name').alias('user_screen_name'),
F.col('user.location').alias('user_location'),
F.col('user.description').alias('user_description'),
F.col('user.url').alias('user_url'),
F.col('user.protected').alias('user_protected'),
F.col('user.followers_count').alias('user_followers_count').cast(IntegerType()),
F.col('user.friends_count').alias('user_friends_count').cast(IntegerType()),
F.col('user.listed_count').alias('user_listed_count'),
F.col('user.created_at').alias('user_created_at'),
F.col('user.favourites_count').alias('user_favourites_count').cast(IntegerType()),
F.col('user.utc_offset').alias('user_utc_offset'),
F.col('user.time_zone').alias('user_time_zone'),
F.col('user.geo_enabled').alias('user_geo_enabled'),
F.col('user.verified').alias('user_verified'),
F.col('user.statuses_count').alias('user_statuses_count').cast(IntegerType()),
F.col('user.lang').alias('user_lang'),
F.col('user.contributors_enabled').alias('user_contributors_enabled'),
F.col('user.is_translator').alias('user_is_translator'),
F.col('user.is_translation_enabled').alias('user_is_translation_enabled'),
F.col('user.profile_background_color').alias('user_profile_background_color'),
F.col('user.profile_background_image_url').alias('user_profile_background_image_url'),
F.col('user.profile_background_image_url_https').alias('user_profile_background_image_url_https'),
F.col('user.profile_background_tile').alias('user_profile_background_tile'),
F.col('user.profile_image_url').alias('user_profile_image_url'),
F.col('user.profile_image_url_https').alias('user_profile_image_url_https'),
F.col('user.profile_banner_url').alias('user_profile_banner_url'),
F.col('user.profile_link_color').alias('user_profile_link_color'),
F.col('user.profile_sidebar_border_color').alias('user_profile_sidebar_border_color'),
F.col('user.profile_sidebar_fill_color').alias('user_profile_sidebar_fill_color'),
F.col('user.profile_text_color').alias('user_profile_text_color'),
F.col('user.profile_use_background_image').alias('user_profile_use_background_image'),
F.col('user.has_extended_profile').alias('user_has_extended_profile'),
F.col('user.default_profile').alias('user_default_profile'),
F.col('user.default_profile_image').alias('user_default_profile_image'),
F.col('user.following').alias('user_following'),
F.col('user.follow_request_sent').alias('user_follow_request_sent'),
F.col('user.notifications').alias('user_notifications'),
F.col('user.translator_type').alias('user_translator_type'),
F.col('place.country').alias('place_country').cast(StringType()),
F.col('place.name').alias('place_name').cast(StringType())
]

# print(df_init.columns)

# Project the de-duplicated tweets onto the flat column list above.
df_flat = df_unduped.select(*sel_columns)
# NOTE(review): 'user', 'metadata' and 'entities' are not among the column names
# produced by the select above, so this drop appears to be a no-op - confirm.
df_thin = df_flat.drop(*['user', 'metadata', 'entities'])

# df_thin.select(*['id', 'place_country', 'place_name']).toPandas().head(17)

In [11]:
# Free-text columns that are passed through clean_text_udf, in application order.
columns_to_clean = [
    "text",
    "user_name",
    "user_screen_name",
    "user_location",
    "user_description",
    "entities_user_mentions_0",
    "entities_user_mentions_1",
    "entities_user_mentions_2",
    "entities_user_mentions_3",
    "entities_urls_0",
    "entities_urls_1",
    "entities_urls_2",
    "entities_urls_3",
    "place_name",
    "place_country",
    "user_url",
    "user_profile_background_image_url",
    "source",
    "in_reply_to_screen_name",
]

# Replace each listed column with its cleaned version, one withColumn per name.
df_clean = df_thin
for column_name in columns_to_clean:
    df_clean = df_clean.withColumn(column_name, clean_text_udf(F.col(column_name)))

# Keep a single row per tweet id.
df_clean = df_clean.dropDuplicates(['id'])

# Preview up to 17 cleaned tweet texts as a pandas frame.
df_clean.select(*['text']).limit(17).toPandas().head(17)

Unnamed: 0,text
0,RT @boujeebx: *whistle* 🖐🏼😯🤚🏼 ‘fore I go broke...
1,RT @quiIlcy: There's literally not even one vl...
2,RT @fussybabybitch: Stealthily squirting a fis...
3,RT @mooniiez: 🌙 CHASE (THE BOYZ) GROUP ORDER: ...
4,@Marc_crescendo @BriannaPulido2 She is young. ...
5,RT @financial_kf: Waste Management $WMClorox $...
6,RT @elynam_demigod: 😂😂😂😂#BlazeDem He say. wa...
7,RT @sam_aroha: #AROHA let's get 1M soon on off...
8,RT @hancyxo: big flex
9,United Natural Foods to Release Fiscal 2020 Fo...


In [12]:
# NOTE(review): identical to the call in cell In [8]; search_tuples is simply
# fetched a second time here.
search_tuples = twitter_service.get_ticker_searchable_tuples()

In [13]:
# Columns whose contents are scanned for tickers and company names.
columns_to_search = ['text', 'source', 'entities_user_mentions_0', 'entities_user_mentions_1', 'entities_user_mentions_2', 'entities_user_mentions_3', 'entities_urls_0', 'entities_urls_1', 'entities_urls_2', 'entities_urls_3', 'user_description', 'user_url']

# Name of the lower-cased twin (suffix "_lc") of every searched column.
lc_cols = [f'{searched}_lc' for searched in columns_to_search]

# Add each lower-cased twin to df_clean alongside its source column.
for searched, lowered in zip(columns_to_search, lc_cols):
    df_clean = df_clean.withColumn(lowered, F.lower(F.col(searched)))


print(df_clean.columns)

['created_at', 'id', 'text', 'truncated', 'source', 'in_reply_to_status_id', 'in_reply_to_user_id', 'in_reply_to_screen_name', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'retweeted', 'possibly_sensitive', 'lang', 'entities_user_mentions_0', 'entities_user_mentions_1', 'entities_user_mentions_2', 'entities_user_mentions_3', 'entities_urls_0', 'entities_urls_1', 'entities_urls_2', 'entities_urls_3', 'metadata_iso_language_code', 'metadata_result_type', 'user_id', 'user_name', 'user_screen_name', 'user_location', 'user_description', 'user_url', 'user_protected', 'user_followers_count', 'user_friends_count', 'user_listed_count', 'user_created_at', 'user_favourites_count', 'user_utc_offset', 'user_time_zone', 'user_geo_enabled', 'user_verified', 'user_statuses_count', 'user_lang', 'user_contributors_enabled', 'user_is_translator', 'user_is_translation_enabled', 'user_profile_background_color', 'user_profile_background_image_url', 'user_profile_background_image_url

In [14]:
from pyspark.sql.functions import explode
from pyspark.sql.types import IntegerType

def get_cashtags_row_wise(row: Row):
    """Find the tickers from ``search_tuples`` that one row mentions.

    Only the row's fields whose name ends in ``_lc`` are inspected. The
    ``text_lc`` value is placed first and the remaining ``*_lc`` values are
    appended to it (in field order) to form one search string.

    A ticker is reported when either
      * ``$<ticker>`` (lower-cased) occurs in the search string, or
      * both the lower-cased ticker and the lower-cased name (``s[1]``) occur.

    NOTE(review): matching is plain substring search, so a short ticker also
    matches inside a longer cashtag or word - confirm that this is intended.

    Args:
        row: A Row exposing ``asDict()``; expected to carry the ``*_lc`` fields.

    Returns:
        A list of dicts with the keys ``ticker``, ``has_cashtag``,
        ``ticker_in_text`` (True when the first match starts inside the
        ``text_lc`` portion) and ``num_other_tickers_in_tweet`` (number of
        reported tickers minus one).
    """
    row_dict = row.asDict()

    text = ''
    # Initialised up front so a row without a 'text_lc' field cannot leave
    # text_len unbound when it is read in the search loop below.
    text_len = 0
    other_cells = []
    for key, value in row_dict.items():
        if not key.endswith('_lc'):
            continue

        cell = '' if value is None else value
        # isinstance replaces a comparison of a type against the string 'str',
        # which was always true; strings are now left as they are.
        if not isinstance(cell, str):
            cell = str(cell)

        if key == 'text_lc':
            text = cell
            text_len = len(text)
        else:
            other_cells.append(cell)

    # The tweet text goes first so that "index < text_len" means "found in text".
    all_thing = text + ''.join(other_cells)

    cashtags_stock = []
    for s in search_tuples:
        ticker = s[0]
        ticker_lc = ticker.lower()
        name_lc = s[1].lower()

        index = all_thing.find(f'${ticker_lc}')
        if index > -1:
            ticker_in_text = index < text_len
            cashtags_stock.append(get_cashtag_info(ticker=ticker, has_cashtag=True, ticker_in_text=ticker_in_text))
        else:
            index_ticker = all_thing.find(ticker_lc)
            index_name = all_thing.find(name_lc)

            if index_ticker > -1 and index_name > -1:
                ticker_in_text = index_ticker < text_len
                cashtags_stock.append(get_cashtag_info(ticker=ticker, has_cashtag=False, ticker_in_text=ticker_in_text))

    # Stamped once after the search: the count depends only on the final list,
    # so repeating this pass for every search tuple was redundant work.
    num_other_tickers = len(cashtags_stock) - 1
    for tag in cashtags_stock:
        tag['num_other_tickers_in_tweet'] = num_other_tickers

    return cashtags_stock


# Return type of the UDF: an array of per-ticker structs mirroring the dicts
# produced by get_cashtags_row_wise.
schema = ArrayType(StructType(fields=[StructField('ticker', StringType()),
                                      StructField('has_cashtag', BooleanType()),
                                      StructField('ticker_in_text', BooleanType()),
                                      StructField('num_other_tickers_in_tweet', IntegerType())
                                     ]))
get_cashtags_row_wise_udf = udf(get_cashtags_row_wise, schema)

# df_tmp = df_clean.limit(10)

# get_cashtags_row_wise only reads fields whose name ends in "_lc", so hand the
# UDF just those columns (in their existing order) instead of serialising every
# column of every row to Python.
searched_lc_columns = [name for name in df_clean.columns if name.endswith('_lc')]
df_f22_flagged = df_clean.withColumn("f22", get_cashtags_row_wise_udf(struct([df_clean[name] for name in searched_lc_columns])))

# One output row per matched ticker; explode produces no row for an empty array.
df_f22_exploded = df_f22_flagged.withColumn('f22', explode(F.col('f22')))

# Keep the remaining columns in DataFrame order. Building this list from a set
# difference made the column order of the persisted output vary between kernels.
# The helper "*_lc" columns and the raw f22 struct are left out; the struct's
# fields are surfaced as flat f22_* columns instead.
excluded_columns = set(lc_cols) | {'f22'}
se_columns = [name for name in df_f22_exploded.columns if name not in excluded_columns] + [
    F.col('f22.ticker').alias('f22_ticker'),
    F.col('f22.has_cashtag').alias('f22_has_cashtag'),
    F.col('f22.ticker_in_text').alias('f22_ticker_in_text'),
    F.col('f22.num_other_tickers_in_tweet').alias('f22_num_other_tickers_in_tweet'),
]

df_tickered = df_f22_exploded.select(*se_columns)

import time

# Time only the Spark action itself; printing happens after the clock is stopped.
start = time.perf_counter()
total = df_tickered.count()
elapsed_sec = time.perf_counter() - start
print(f'Count: {total}')

# rows / seconds is a throughput (records per second), so it is named and
# labelled as such rather than as seconds per record or as an elapsed time.
records_per_sec = total / elapsed_sec

print(f'Throughput: {records_per_sec} records per second.')

Count: 909703
Elapsed: 80.37263621755277 per second.


In [15]:
from ams.services import dataframe_services

# Destination folder and file-name prefix handed to the persistence helper.
flat_drop_path = Path(data_path, twitter_folder, 'flattened_drop')
prefix = "tweets_flat"

# Write df_tickered out as CSV through the project's dataframe_services helper.
dataframe_services.persist_dataframe_as_csv(df=df_tickered, output_drop_folder_path=flat_drop_path, prefix=prefix)

/home/jovyan/work/data/twitter/flattened_drop/tweets_flat_2020-09-20_05-46-31-96.9


In [None]:
# Read CSV files (with a header row) back into Spark.
# NOTE(review): output_folder_path is not defined in any cell visible in this
# notebook, so this raises NameError on a fresh kernel - confirm which path
# (e.g. the folder written by persist_dataframe_as_csv) was intended.
df_read = spark.read.csv(str(output_folder_path), header=True)

In [None]:
# df_read.select(*['text']).toPandas().head(17)
# Display the number of rows in df_clean (count() runs a Spark job).
df_clean.count()

In [None]:
import pandas as pd
from ams.services import file_services

# Collect the ".csv" files found by the project's file_services helper.
# NOTE(review): output_folder_path is not defined in any cell visible in this
# notebook - confirm where it is supposed to come from.
csv_list = list(file_services.list_files(output_folder_path, ends_with=".csv"))

import csv

# Load each non-empty CSV with pandas and print its row count.
for c in csv_list:
    # Zero-byte files are skipped.
    if c.stat().st_size > 0:
        # Every column is read as text (dtype='unicode').
        # NOTE(review): error_bad_lines is deprecated in newer pandas releases in
        # favour of on_bad_lines - confirm against the pinned pandas version.
        df_read = pd.read_csv(str(c), dialect=csv.unix_dialect(), error_bad_lines=False, index_col=False, dtype='unicode')
        print(df_read.shape[0])