In [1]:
import sys

paths_to_add = ['/home/jovyan/work']

for p in paths_to_add:
    if p not in sys.path:
        sys.path.append(p)

print(sys.path)

['', '/home/jovyan/work/alpha_media_signal/notebooks/twitter', '/usr/local/spark/python', '/usr/local/spark/python/lib/py4j-0.10.7-src.zip', '/opt/conda/lib/python37.zip', '/opt/conda/lib/python3.7', '/opt/conda/lib/python3.7/lib-dynload', '/opt/conda/lib/python3.7/site-packages', '/opt/conda/lib/python3.7/site-packages/IPython/extensions', '/home/jovyan/.ipython', '/home/jovyan/work']


In [2]:
from alpha_media_signal.services import spark_service
from pyspark.sql import SparkSession, udf
from pyspark.sql.functions import udf
from alpha_media_signal.services import twitter_service
from pyspark.sql import functions as F
from pathlib import Path
from pyspark.sql.types import StringType, StructType, StructField, BooleanType, MapType, ArrayType
import json
from typing import Dict, List
import re

spark = spark_service.get_or_create(app_name='twitter_flatten')

Setting up logging...
Will use logging path: /home/jovyan/work/data/logs/alpha_media_signal


In [3]:
data_path = Path('/home/jovyan/work/data/')

# twitter_folder = 'twitter'
twitter_folder = 'twitter_test'

file_path = Path(data_path, twitter_folder, 'raw_drop')

In [70]:
df_sample = (spark
  .read 
  .option("multiLine", True)
  .json("/home/jovyan/work/resources/sample_tweet.json"))
    
sample_tweet = json.loads(df_sample.toJSON().collect()[0])

tweet_schema = df_sample.schema

with open("/home/jovyan/work/resources/tweet_schema.json", '+w') as w:
    w.writelines(tweet_schema.json())

In [71]:
with open('/home/jovyan/work/resources/tweet_schema.json', 'r+') as r:
    json_str = r.readline()
    thing = json.loads(json_str)
    
    tweet_schema = StructType.fromJson(thing)
    
df = spark.read.json(str(file_path), schema=tweet_schema)
df.count()

194

In [72]:
df_text = spark.read.text(str(file_path))

df = df_text.limit(2)

search_tuples = twitter_service.get_ticker_searchable_tuples()

print(f'number of search tuples: {len(search_tuples)}')

# print(tweet_schema)

number of search tuples: 8601


In [73]:
sc = spark.sparkContext
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
LOGGER.info("pyspark script logger initialized")

In [74]:
def get_cashtag_info(ticker: str, has_cashtag: bool, ticker_in_text: bool) -> Dict:
    return {"ticker": ticker, "has_cashtag": has_cashtag, "ticker_in_text": ticker_in_text}


def get_cashtags(raw_line: str):
    tweet = json.loads(raw_line)
    text = tweet['text']
    cashtags_stock = []
    for s in search_tuples:
        ticker = s[0].strip()
        name = s[1].strip()

        if re.search(f'${ticker}', text) and re.search(name, text, re.IGNORECASE):
            cashtags_stock.append(get_cashtag_info(ticker=ticker, has_cashtag=True, ticker_in_text=True))

    if len(cashtags_stock) == 0:
        for s in search_tuples:
            ticker = s[0].strip()
            name = s[1].strip()

            if re.search(ticker, text) and re.search(name, text, re.IGNORECASE):
                cashtags_stock.append(get_cashtag_info(ticker=ticker, has_cashtag=True, ticker_in_text=True))

    if len(cashtags_stock) == 0:
        for s in search_tuples:
            ticker = s[0]
            name = s[1]
            if re.search(ticker, raw_line) and re.search(name, raw_line, re.IGNORECASE):
                cashtags_stock.append(get_cashtag_info(ticker=ticker, has_cashtag=True, ticker_in_text=True))

    return cashtags_stock

In [75]:
schema = ArrayType(StructType(fields=[StructField('ticker', StringType()),
                          StructField('has_cashtag', BooleanType()),
                          StructField('ticker_in_text', BooleanType())]))
get_cashtags_udf = udf(get_cashtags, schema)

df_renamed = df.withColumnRenamed('value', 'raw_json')

df_f22 = df_renamed.withColumn('f22_flagged_stocks', get_cashtags_udf(F.col('raw_json')))
df_exploded = df_f22.withColumn('f22_flagged_stock', F.explode(F.col('f22_flagged_stocks')))
df_f22_short = df_exploded.select(*['*', 
                                    'f22_flagged_stock.ticker', 
                                    'f22_flagged_stock.has_cashtag', 
                                    'f22_flagged_stock.ticker_in_text'
                                   ]).drop(*['f22_flagged_stocks', 'f22_flagged_stock'])

def transform_to_dict(raw_json):
    print(sample_tweet)
    return sample_tweet

transform_to_dict_udf = udf(transform_to_dict, tweet_schema)

df_blown = df_f22_short.withColumn('foo', transform_to_dict_udf(F.col('raw_json')))

df_blown.toPandas().head()

Unnamed: 0,raw_json,ticker,has_cashtag,ticker_in_text,foo
0,"{""created_at"": ""Sun Aug 16 19:53:29 +0000 2020...",AUTL,True,True,"(None, ([-71.2289641, 42.4430372], Point), Thu..."
1,"{""created_at"": ""Sun Aug 16 19:51:47 +0000 2020...",AUTL,True,True,"(None, ([-71.2289641, 42.4430372], Point), Thu..."


In [51]:
from pyspark.sql import functions as F
from pyspark.sql.types import  StringType

# F.col('place.full_name').alias('place_full_name').cast(StringType()),

sel_columns = ['created_at',
'id',
'text',
'truncated',
'source',
'in_reply_to_status_id',
'in_reply_to_user_id',
'in_reply_to_screen_name',
'place',
'contributors',
'is_quote_status',
'retweet_count',
'favorite_count',
'retweeted',
'possibly_sensitive',
'lang',
F.col('entities.user_mentions')[0].alias('entities_user_mentions_0').cast(StringType()),
F.col('entities.user_mentions')[1].alias('entities_user_mentions_1').cast(StringType()),
F.col('entities.user_mentions')[2].alias('entities_user_mentions_2').cast(StringType()),
F.col('entities.user_mentions')[3].alias('entities_user_mentions_3').cast(StringType()),
F.col('entities.urls')[0].alias('entities_urls_0').cast(StringType()),
F.col('entities.urls')[1].alias('entities_urls_1').cast(StringType()),
F.col('entities.urls')[2].alias('entities_urls_2').cast(StringType()),
F.col('entities.urls')[3].alias('entities_urls_3').cast(StringType()),
F.col('place.full_name').alias('place_full_name'),
F.col('place.country').alias('place_country'),
F.col('metadata.iso_language_code').alias('metadata_iso_language_code'),
F.col('metadata.result_type').alias('metadata_result_type'),
F.col('user.id').alias('user_id'),
F.col('user.name').alias('user_name'),
F.col('user.screen_name').alias('user_screen_name'),
F.col('user.location').alias('user_location'),
F.col('user.description').alias('user_description'),
F.col('user.url').alias('user_url'),
F.col('user.protected').alias('user_protected'),
F.col('user.followers_count').alias('user_followers_count'),
F.col('user.friends_count').alias('user_friends_count'),
F.col('user.listed_count').alias('user_listed_count'),
F.col('user.created_at').alias('user_created_at'),
F.col('user.favourites_count').alias('user_favourites_count'),
F.col('user.utc_offset').alias('user_utc_offset'),
F.col('user.time_zone').alias('user_time_zone'),
F.col('user.geo_enabled').alias('user_geo_enabled'),
F.col('user.verified').alias('user_verified'),
F.col('user.statuses_count').alias('user_statuses_count'),
F.col('user.lang').alias('user_lang'),
F.col('user.contributors_enabled').alias('user_contributors_enabled'),
F.col('user.is_translator').alias('user_is_translator'),
F.col('user.is_translation_enabled').alias('user_is_translation_enabled'),
F.col('user.profile_background_color').alias('user_profile_background_color'),
F.col('user.profile_background_image_url').alias('user_profile_background_image_url'),
F.col('user.profile_background_image_url_https').alias('user_profile_background_image_url_https'),
F.col('user.profile_background_tile').alias('user_profile_background_tile'),
F.col('user.profile_image_url').alias('user_profile_image_url'),
F.col('user.profile_image_url_https').alias('user_profile_image_url_https'),
F.col('user.profile_banner_url').alias('user_profile_banner_url'),
F.col('user.profile_link_color').alias('user_profile_link_color'),
F.col('user.profile_sidebar_border_color').alias('user_profile_sidebar_border_color'),
F.col('user.profile_sidebar_fill_color').alias('user_profile_sidebar_fill_color'),
F.col('user.profile_text_color').alias('user_profile_text_color'),
F.col('user.profile_use_background_image').alias('user_profile_use_background_image'),
F.col('user.has_extended_profile').alias('user_has_extended_profile'),
F.col('user.default_profile').alias('user_default_profile'),
F.col('user.default_profile_image').alias('user_default_profile_image'),
F.col('user.following').alias('user_following'),
F.col('user.follow_request_sent').alias('user_follow_request_sent'),
F.col('user.notifications').alias('user_notifications'),
F.col('user.translator_type').alias('user_translator_type')
]

df_flat = df.select(*sel_columns)
df_thin = df_flat.drop(*['user', 'entities', 'metadata', 'id_str', 'place'])

# df_thin.select(*['text']).toPandas().head(17)

AnalysisException: "cannot resolve '`created_at`' given input columns: [value];;\n'Project ['created_at, 'id, 'text, 'truncated, 'source, 'in_reply_to_status_id, 'in_reply_to_user_id, 'in_reply_to_screen_name, 'place, 'contributors, 'is_quote_status, 'retweet_count, 'favorite_count, 'retweeted, 'possibly_sensitive, 'lang, unresolvedalias(cast('entities.user_mentions[0] as string), None), unresolvedalias(cast('entities.user_mentions[1] as string), None), unresolvedalias(cast('entities.user_mentions[2] as string), None), unresolvedalias(cast('entities.user_mentions[3] as string), None), unresolvedalias(cast('entities.urls[0] as string), None), unresolvedalias(cast('entities.urls[1] as string), None), unresolvedalias(cast('entities.urls[2] as string), None), unresolvedalias(cast('entities.urls[3] as string), None), ... 44 more fields]\n+- GlobalLimit 2\n   +- LocalLimit 2\n      +- Relation[value#528] text\n"

In [None]:
import re

entity_comma = '&#44;'

line_ending_pattern = re.compile("[\r\n]")
def clean_text(text:str):
    result = text
    if text is not None and len(text) > 0:
        result = re.sub(line_ending_pattern, '', text)
        result = re.sub(",", entity_comma, result)
    return result
    
clean_text_udf = udf(clean_text, StringType())

df_clean = df_thin.withColumn("text", clean_text_udf(F.col("text")))\
    .withColumn("user_name", clean_text_udf(F.col("user_name")))\
    .withColumn("user_screen_name", clean_text_udf(F.col("user_screen_name")))\
    .withColumn("user_location", clean_text_udf(F.col("user_location")))\
    .withColumn("user_description", clean_text_udf(F.col("user_description")))\
    .withColumn("entities_user_mentions_0", clean_text_udf(F.col("entities_user_mentions_0")))\
    .withColumn("entities_user_mentions_1", clean_text_udf(F.col("entities_user_mentions_1")))\
    .withColumn("entities_user_mentions_2", clean_text_udf(F.col("entities_user_mentions_2")))\
    .withColumn("entities_user_mentions_3", clean_text_udf(F.col("entities_user_mentions_3")))\
    .withColumn("entities_urls_0", clean_text_udf(F.col("entities_urls_0")))\
    .withColumn("entities_urls_1", clean_text_udf(F.col("entities_urls_1")))\
    .withColumn("entities_urls_2", clean_text_udf(F.col("entities_urls_2")))\
    .withColumn("entities_urls_3", clean_text_udf(F.col("entities_urls_3")))\
    .withColumn("place_full_name", clean_text_udf(F.col("place_full_name")))\
    .withColumn("place_country", clean_text_udf(F.col("place_country")))\
    .withColumn("user_url", clean_text_udf(F.col("user_url")))\
    .withColumn("user_profile_background_image_url", clean_text_udf(F.col("user_profile_background_image_url")))\
    .withColumn("source", clean_text_udf(F.col("source")))\
    .withColumn("in_reply_to_screen_name", clean_text_udf(F.col("in_reply_to_screen_name")))\
    .dropDuplicates(['id'])

# df_clean.select(*['text']).toPandas().head(17)

In [None]:
print(df_clean.columns)

In [None]:
from alpha_media_signal.services import file_services, twitter_service

flat_drop_path = Path(data_path, twitter_folder, 'flattened_drop')

output_folder_path = file_services.get_unique_folder(str(flat_drop_path), prefix='tweets_flat', ensure_exists=False)

df_clean.repartition(5).write.option("header",True).option("quoteAll", True).csv(str(output_folder_path))

print(str(output_folder_path))


In [None]:
# df_read = spark.read.csv(str(output_folder_path), header=True)

In [None]:
# df_read.select(*['text']).toPandas().head(17)
df_clean.count()

In [None]:
import pandas as pd
from alpha_media_signal.services import file_services

csv_list = list(file_services.list_files(output_folder_path, ends_with=".csv"))

import csv

for c in csv_list:
    df_read = pd.read_csv(str(c), dialect=csv.unix_dialect(), error_bad_lines=False, index_col=False, dtype='unicode')
    print(df_read.shape[0])