In [36]:
import findspark, pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [37]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a blanket "ignore" filter: no warning of any category is shown from here on.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below —
# confirm that is intended before relying on a clean output.
warnings.filterwarnings("ignore")

In [38]:
# Create the Spark session.

# Let findspark locate the Spark installation before the session is built.
# The path returned by find() is not used; the call is kept as in the original cell.
findspark.init()
findspark.find()

spark = (
    SparkSession.builder
    .appName("ADBFinacialReportsSecDataAnalysis")
    .config("spark.sql.shuffle.partitions", 200)
    .config("spark.driver.memory", "16G")
    # Spark configuration keys are case-sensitive. The documented key is spelled
    # "eagerEval"; the previous all-lowercase spelling was stored but never read,
    # so eager DataFrame rendering in the notebook was not actually switched on.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)


In [39]:
# Import from the public IPython.display module rather than the internal
# IPython.core.display, and bring `display` in explicitly so this cell does not
# depend on the name being pre-injected by the notebook front end.
from IPython.display import HTML, display

# Inject CSS so <pre> blocks are never wrapped: wide text output (such as the
# tables printed by .show()) stays on one line per row.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [40]:
# Directory containing the parquet files read below, given relative to the
# notebook's working directory. It ends with a slash so a file name can be
# appended to it directly.
data_dir = "../Datasets/financial-reports-sec/parquet/large/"

In [41]:
# Load the two dataset splits. Paths are built with os.path.join instead of
# string concatenation, so the result is correct whether or not `data_dir`
# ends with a path separator.
df_test = spark.read.parquet(os.path.join(data_dir, "test.parquet"))
df_train = spark.read.parquet(os.path.join(data_dir, "train.parquet"))

In [42]:
def _with_report_stats(frame):
    """Return `frame` with two columns derived from its `report` column.

    report_length     -- F.length of the report text.
    report_word_count -- number of pieces obtained by splitting the text on a
                         single space character.
    """
    report = F.col("report")
    return (
        frame
        .withColumn("report_length", F.length(report))
        .withColumn("report_word_count", F.size(F.split(report, ' ')))
    )


# Apply the same derivation to both splits.
df_test = _with_report_stats(df_test)
df_train = _with_report_stats(df_train)

In [43]:
# Report the number of rows in each split; every count() call runs a Spark job.
print(f"Test Size: {df_test.count()}")
print(f"Train Size: {df_train.count()}")


Test Size: 5690
Train Size: 22007


In [44]:
# Print the column names, types and nullability of the test split, including
# the two derived report_* columns added earlier.
df_test.printSchema()

root
 |-- name: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- report: string (nullable = true)
 |-- report_length: integer (nullable = true)
 |-- report_word_count: integer (nullable = false)



In [45]:
# Number of reports per label, sorted by label ascending; test split first, then train.
for split_df in (df_test, df_train):
    per_label = split_df.groupBy("label").count()
    per_label.orderBy(F.asc("label")).show()

+-----+-----+
|label|count|
+-----+-----+
|    0| 1709|
|    1|  360|
|    2|  267|
|    3|  518|
|    4|  780|
|    5|  253|
|    6|  439|
|    7| 1364|
+-----+-----+

+-----+-----+
|label|count|
+-----+-----+
|    0| 6448|
|    1| 1395|
|    2|  951|
|    3| 2084|
|    4| 2956|
|    5|  996|
|    6| 1723|
|    7| 5454|
+-----+-----+



In [46]:
# Number of reports per company name, most frequent first; test split first, then train.
for split_df in (df_test, df_train):
    per_name = split_df.groupBy("name").count()
    per_name.orderBy(F.desc("count")).show()

+--------------------+-----+
|                name|count|
+--------------------+-----+
|         ABIOMED INC|    9|
|HELIX ENERGY SOLU...|    8|
|SUPERIOR GROUP OF...|    8|
| TTEC Holdings, Inc.|    8|
|SECURITY NATIONAL...|    8|
|PROGRESSIVE CORP/OH/|    8|
|STANDARD MOTOR PR...|    7|
|              HP INC|    7|
|   ENTERGY CORP /DE/|    7|
|         ALICO, INC.|    7|
|          NIKE, Inc.|    6|
|      FORMFACTOR INC|    6|
|         BELDEN INC.|    6|
|     BCB BANCORP INC|    6|
|        TELEFLEX INC|    6|
|         Yellow Corp|    6|
| EMERSON ELECTRIC CO|    6|
|     VISTA GOLD CORP|    6|
|BASSETT FURNITURE...|    6|
|QUAINT OAK BANCOR...|    6|
+--------------------+-----+
only showing top 20 rows

+--------------------+-----+
|                name|count|
+--------------------+-----+
|     E.W. SCRIPPS Co|   19|
|   EVERSOURCE ENERGY|   18|
|          INTEL CORP|   18|
|VISHAY INTERTECHN...|   18|
|LINCOLN NATIONAL ...|   18|
|   OLYMPIC STEEL INC|   17|
|          CABOT 

In [48]:
# Longest reports by character count (show() prints the top 20 rows); test split first, then train.
for split_df in (df_test, df_train):
    by_length = split_df.select("name", "label", "report_length")
    by_length.orderBy(F.desc("report_length")).show()

                                                                                

+--------------------+-----+-------------+
|                name|label|report_length|
+--------------------+-----+-------------+
|            PPL Corp|    0|      1291267|
|    OGE ENERGY CORP.|    4|      1200404|
|ICAHN ENTERPRISES...|    3|      1118668|
|Santander Holding...|    0|       996139|
|   ENTERGY CORP /DE/|    6|       992684|
|Spectrum Brands H...|    0|       977127|
|  FIRST BANCORP /PR/|    4|       934741|
|Encompass Health ...|    0|       871905|
|  FIRST BANCORP /PR/|    0|       869399|
|Ares Management Corp|    3|       821444|
|ASSURED GUARANTY LTD|    4|       819222|
|       BeiGene, Ltd.|    0|       808937|
|Nabriva Therapeut...|    0|       761072|
|           KBR, INC.|    1|       753114|
|OCULAR THERAPEUTI...|    1|       728845|
| Ladder Capital Corp|    0|       717484|
|      Zymeworks Inc.|    7|       714457|
|     Sesen Bio, Inc.|    4|       701976|
|           VISA INC.|    7|       688992|
|CRISPR Therapeuti...|    0|       686282|
+----------



+--------------------+-----+-------------+
|                name|label|report_length|
+--------------------+-----+-------------+
|Spectrum Brands H...|    6|      1353829|
|Spectrum Brands H...|    0|      1329109|
| PILGRIMS PRIDE CORP|    3|      1205252|
|ICAHN ENTERPRISES...|    4|      1137655|
|         EXELON CORP|    7|      1124186|
|LAUREATE EDUCATIO...|    3|      1113577|
|ICAHN ENTERPRISES...|    7|      1103391|
|ICAHN ENTERPRISES...|    0|      1089811|
|      YUM BRANDS INC|    4|      1049034|
|Santander Holding...|    1|      1025449|
|GENWORTH FINANCIA...|    0|       918216|
|GENWORTH FINANCIA...|    1|       913750|
|ENTERGY MISSISSIP...|    7|       910612|
|   General Motors Co|    0|       893327|
|Cerevel Therapeut...|    0|       890453|
|Ares Management Corp|    7|       889773|
|  FIRST BANCORP /PR/|    4|       880972|
|EDISON INTERNATIONAL|    4|       873150|
|  FIRST BANCORP /PR/|    1|       870625|
|Nabriva Therapeut...|    3|       846518|
+----------

                                                                                

In [49]:
# Longest reports by space-split word count (show() prints the top 20 rows); test split first, then train.
for split_df in (df_test, df_train):
    by_words = split_df.select("name", "label", "report_word_count")
    by_words.orderBy(F.desc("report_word_count")).show()

                                                                                

+--------------------+-----+-----------------+
|                name|label|report_word_count|
+--------------------+-----+-----------------+
|            PPL Corp|    0|           198892|
|    OGE ENERGY CORP.|    4|           178923|
|ICAHN ENTERPRISES...|    3|           169582|
|   ENTERGY CORP /DE/|    6|           154733|
|Santander Holding...|    0|           152979|
|Spectrum Brands H...|    0|           147769|
|  FIRST BANCORP /PR/|    4|           141586|
|Encompass Health ...|    0|           131726|
|  FIRST BANCORP /PR/|    0|           130861|
|Ares Management Corp|    3|           126643|
|ASSURED GUARANTY LTD|    4|           124645|
|       BeiGene, Ltd.|    0|           122163|
|Nabriva Therapeut...|    0|           116534|
|           KBR, INC.|    1|           115290|
| Ladder Capital Corp|    0|           112081|
|OCULAR THERAPEUTI...|    1|           110879|
|      Zymeworks Inc.|    7|           107619|
|     Sesen Bio, Inc.|    4|           107223|
|           V



+--------------------+-----+-----------------+
|                name|label|report_word_count|
+--------------------+-----+-----------------+
|Spectrum Brands H...|    6|           204200|
|Spectrum Brands H...|    0|           201475|
| PILGRIMS PRIDE CORP|    3|           195979|
|ICAHN ENTERPRISES...|    4|           172959|
|LAUREATE EDUCATIO...|    3|           170723|
|         EXELON CORP|    7|           169022|
|ICAHN ENTERPRISES...|    7|           166764|
|ICAHN ENTERPRISES...|    0|           164617|
|      YUM BRANDS INC|    4|           161784|
|Santander Holding...|    1|           157264|
|GENWORTH FINANCIA...|    0|           138369|
|ENTERGY MISSISSIP...|    7|           138264|
|   General Motors Co|    0|           137807|
|Ares Management Corp|    7|           137365|
|GENWORTH FINANCIA...|    1|           136737|
|Cerevel Therapeut...|    0|           134816|
|EDISON INTERNATIONAL|    4|           134688|
|  FIRST BANCORP /PR/|    4|           132941|
|    OGE ENER

                                                                                