In [1]:
import findspark, pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Create the Spark session

# Let findspark locate the Spark installation before the session is built.
findspark.init()
findspark.find()

spark = (
    SparkSession.builder
    # NOTE(review): this app name looks carried over from another project; it is
    # left unchanged here because it is only a label — confirm and rename if wanted.
    .appName("HomeCreditDefaultRisk")
    .config("spark.sql.shuffle.partitions", 6)
    .config("spark.driver.memory", "12G")
    # The documented key is camel-cased (`eagerEval`); the previous all-lowercase
    # spelling did not match it, so eager evaluation was never actually enabled.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/19 20:11:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Inject CSS that forces `white-space: pre` on <pre> elements, i.e. no line
# wrapping (presumably so wide `show()` tables stay aligned — confirm).
# `HTML` and `display` are imported from the public `IPython.display` API rather
# than the internal `IPython.core.display` module / the implicit notebook global.
from IPython.display import HTML, display

display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [6]:
# Where the JSON input is read from and where the Parquet output is written to.
data_dir, output_dir = (
    "../Datasets/financial-reports-sec/data/",
    "../Datasets/financial-reports-sec/parquet/",
)

# Every size is combined with every split: <dir>/<size>/<split>.
data_sizes, data_types = ["small", "large"], ["test", "train", "validate"]

In [7]:
# Dotted paths of the report sections that are concatenated into `full_text`,
# generated from the section suffixes in the order they are joined.
sections = [
    f"filing.report.section_{suffix}"
    for suffix in (
        "1", "1A", "1B", "2", "3", "4", "5", "6", "7", "7A",
        "8", "9", "9A", "9B", "10", "11", "12", "13", "14", "15",
    )
]

In [8]:
def concat_sections(col_list):
  """Build one Column that merges the given section columns into a single string.

  Each name in ``col_list`` is wrapped in ``F.concat_ws(" ", ...)`` (presumably
  the sections are arrays of strings — confirm against the input schema) and
  aliased back to its own name. The per-section results are then joined with a
  single space, and the resulting Column is aliased ``full_text``.
  """
  per_section = []
  for name in col_list:
    flattened = F.concat_ws(" ", F.col(name))
    per_section.append(flattened.alias(name))

  merged = F.concat_ws(" ", *per_section)
  return merged.alias("full_text")

In [9]:
def df_transform(df):
  """Flatten a DataFrame holding a ``filings`` array into one row per filing.

  Steps, in order:
    1. explode ``filings`` into a ``filing`` column and drop ``filings``;
    2. add ``full_text`` from ``concat_sections(sections)``;
    3. copy the nested label, return and date fields of ``filing`` into
       top-level columns (``labels_<h>``, ``returns_<h>_<field>``, ...);
    4. drop the nested ``filing`` column.

  Returns the resulting DataFrame.
  """
  horizons = ("1d", "30d", "5d")
  return_fields = ("closePriceEndDate", "closePriceStartDate", "endDate", "ret", "startDate")
  filing_fields = ("acceptanceDateTime", "filingDate", "reportDate", "form")

  # (new column name, dotted source path) pairs, in the order the columns are appended
  extracted = [(f"labels_{h}", f"filing.labels.{h}") for h in horizons]
  extracted += [
      (f"returns_{h}_{field}", f"filing.returns.{h}.{field}")
      for h in horizons
      for field in return_fields
  ]
  extracted += [(field, f"filing.{field}") for field in filing_fields]

  # One row per filing
  flat = df.withColumn("filing", F.explode("filings")).drop("filings")

  # All report sections merged into a single text field
  flat = flat.withColumn("full_text", concat_sections(sections))

  # Promote the nested fields to top-level columns
  for new_name, source_path in extracted:
    flat = flat.withColumn(new_name, F.col(source_path))

  return flat.drop("filing")

In [10]:
# Convert every (size, split) JSON dataset to Parquet, printing a preview of the
# text lengths and the row count for each one as a sanity check.
for size in data_sizes:
  # `data_type`, not `type`: the loop variable stays bound in the kernel namespace
  # after this cell finishes, where `type` would shadow the builtin.
  for data_type in data_types:
    filename = f"{data_dir}{size}/{data_type}"
    output = f"{output_dir}{size}/{data_type}.parquet"

    df = df_transform(spark.read.json(filename))

    print(size + "." + data_type)
    # The length column is only needed for this preview, so it is computed in the
    # projection instead of being added to `df` and dropped again before the write.
    df.select(
        "cik",
        F.length(F.col("full_text")).alias("full_text_length"),
    ).show(truncate=False)

    print(df.count())
    df.write.mode("overwrite").parquet(output)

# df.write.format("json").mode("overwrite").save("tmp/json_data")

small.test
+----------+----------------+
|cik       |full_text_length|
+----------+----------------+
|0001602658|387409          |
|0001602658|430892          |
|0001602658|436873          |
|0001602658|400906          |
|0001602658|423870          |
|0001602658|363616          |
|0001602658|370596          |
|0001603145|268967          |
|0001603145|300016          |
|0001603145|91466           |
|0001602813|44500           |
|0001602813|49774           |
|0001602813|51027           |
|0001602813|60283           |
+----------+----------------+

14


24/05/19 20:12:06 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

small.train
+----------+----------------+
|cik       |full_text_length|
+----------+----------------+
|0000002488|264393          |
|0000002488|164104          |
|0000002488|308715          |
|0000002488|367277          |
|0000002488|350526          |
|0000002488|323278          |
|0000002488|311993          |
|0000002488|294435          |
|0000002488|297597          |
|0000002488|338345          |
|0000002488|379762          |
|0000002488|396306          |
|0000002488|492154          |
|0000002488|540034          |
|0000002488|538078          |
|0000002488|440732          |
|0000002488|416625          |
|0000002488|304806          |
|0000002488|250897          |
|0000002488|71367           |
+----------+----------------+
only showing top 20 rows

188
small.validate
+----------+----------------+
|cik       |full_text_length|
+----------+----------------+
|0001699150|399171          |
|0001699150|366235          |
|0001699150|375938          |
|0001699150|307374          |
|0001699136|3

                                                                                

large.train
+----------+----------------+
|cik       |full_text_length|
+----------+----------------+
|0000001961|113500          |
|0000001961|105416          |
|0000001961|103792          |
|0000001961|96718           |
|0000001961|105987          |
|0000001961|68409           |
|0000001961|66589           |
|0000001961|64647           |
|0000001961|56478           |
|0000001961|222507          |
|0000001961|105681          |
|0000001961|104978          |
|0000001961|79863           |
|0000003570|317495          |
|0000003570|350513          |
|0000003570|330979          |
|0000003570|338654          |
|0000003570|336623          |
|0000003570|311212          |
|0000003570|317898          |
+----------+----------------+
only showing top 20 rows



                                                                                

52663


                                                                                

large.validate
+----------+----------------+
|cik       |full_text_length|
+----------+----------------+
|0001699150|399171          |
|0001699150|366235          |
|0001699150|375938          |
|0001699150|500846          |
|0001701732|386310          |
|0001701732|406218          |
|0001701732|407962          |
|0001701732|359322          |
|0001703073|114735          |
|0001703073|74832           |
|0001705682|356882          |
|0001705682|349997          |
|0001705682|323060          |
|0001705682|343486          |
|0001707925|228973          |
|0001707925|294159          |
|0001707925|287521          |
|0001707925|104090          |
|0001709164|191527          |
|0001709164|168253          |
+----------+----------------+
only showing top 20 rows

866


In [11]:
spark.stop()