**We filter and analyze our datasets based on the following criteria:**

- we have meta information about the ticker
- we limit our analysis to stocks in euro
- we focus on the close market value
    - *once analysts process events*



In [None]:
import findspark
# Locate the local Spark installation and make `pyspark` importable; this has
# to run before the `pyspark` imports in the following cell.
findspark.init()

In [2]:
import pyspark.sql.functions as F

from pyspark.sql import SparkSession

In [3]:
# Local Spark session shared by every cell below.
#
# The driver gets a large heap and result-size cap because the filtered tables
# are later collected to the driver with `toPandas()`.
session = (SparkSession
           .builder
           .master('local[*]')
           .appName('analysis')
           .config('spark.driver.memory', '12g')
           .config('spark.executor.memory', '12g')
           .config('spark.driver.maxResultSize', '12g')
           # The split/dividend/quote cells truncate timestamps to midnight with
           # from_unixtime/unix_timestamp, which use the session time zone. Pin it
           # so the truncation does not depend on the machine running the
           # notebook. The recorded outputs (e.g. 1409176800 = 2014-08-28 00:00
           # CEST, 1521068400 = 2018-03-15 00:00 CET) are Central European time.
           .config('spark.sql.session.timeZone', 'Europe/Berlin')
           .getOrCreate())

# Extras

In [4]:
# Ticker metadata: keep only EUR-denominated rows, one row per ticker, and the
# four descriptive columns used by the rest of the notebook.
extras = (
    session.read
    .csv('../datasets/extras.csv.gz', header=True, inferSchema=True)
    .where('currency == "EUR"')
    .dropDuplicates(['ticker'])  # which of the duplicate rows survives is not specified
    .select(
        'ticker',
        'name',
        'country',
        F.col('category name').alias('category'),
    )
    .cache()
)

# Number of distinct EUR tickers with metadata.
print(extras.count())

extras.show()

26969
+-------+--------------------+--------------+--------+
| ticker|                name|       country|category|
+-------+--------------------+--------------+--------+
| 0BN.BE|HABIT RESTAURANT....|       Germany|    null|
| 0O2W.L| GFT Technologies SE|United Kingdom|    null|
|  13J.F|Jinmao (China) Ho...|        France|    null|
| 1TT.BE|   FACTOR THERAPEUT.|       Germany|    null|
|  233.F|Springland Intern...|        France|    null|
|2CDA.BE|CONTR.VUEL.CO. AD...|       Germany|    null|
| 3NEN.F|Brisio Innovation...|        France|    null|
| 44C.SG|Civeo Corp (Canad...|       Germany|    null|
|48CA.DU|   CAIXABANK S.A. EO|       Germany|    null|
| 4FO.MU|INDS PENOLES S.A....|       Germany|    null|
| 4HP.DU|ISETAN MITSUKOSHI...|       Germany|    null|
|  51S.F|Cynata Therapeuti...|        France|    null|
| 5CF.SG|Calfrac Well Serv...|       Germany|    null|
| 5IX.BE|INTERXION HLDG NV...|       Germany|    null|
|5N91.BE|AGUIA RES LTD AD-,20|       Germany|    null|
|  6

In [5]:
# Per-column statistics, spelled out (these are the statistics summary()
# computes when called without arguments). For text columns such as `ticker`,
# the mean/stddev/percentile rows only cover values that happen to parse as numbers.
extras.summary('count', 'mean', 'stddev', 'min', '25%', '50%', '75%', 'max').show()

+-------+------------------+--------------------+--------------+--------------------+
|summary|            ticker|                name|       country|            category|
+-------+------------------+--------------------+--------------+--------------------+
|  count|             26969|               26836|         26767|                5091|
|   mean|236.97560975609755|                null|          null|                null|
| stddev| 194.2619993468715|                null|          null|                null|
|    min|           013A.BE|"""Surgutneftegas...|       Austria|Accident & Health...|
|    25%|             109.0|                null|          null|                null|
|    50%|             210.0|                null|          null|                null|
|    75%|             304.0|                null|          null|                null|
|    max|           ZZMS.SG|üstra Hannoversch...|United Kingdom|Wireless Communic...|
+-------+------------------+--------------------+-----

In [6]:
# Ten most frequent `country` values (missing countries form their own group).
(extras
 .groupBy('country')
 .count()
 .orderBy(F.desc('count'))
 .limit(10)
 .show())

+--------------+-----+
|       country|count|
+--------------+-----+
|       Germany|16608|
|        France| 8665|
|         Italy|  394|
|United Kingdom|  239|
|          null|  202|
|        Greece|  173|
|         Spain|  147|
|       Belgium|  136|
|       Finland|  106|
|   Netherlands|   95|
+--------------+-----+



In [7]:
# Ten most frequent `category` values (missing categories form their own group).
(extras
 .groupBy('category')
 .count()
 .orderBy(F.desc('count'))
 .limit(10)
 .show())

+--------------------+-----+
|            category|count|
+--------------------+-----+
|                null|21878|
|Diversified Machi...|  304|
|  Money Center Banks|  224|
|Wireless Communic...|  190|
|    Asset Management|  158|
|       Biotechnology|  158|
|Diversified Utili...|  150|
|Technical & Syste...|  140|
|Information Techn...|  129|
|   Business Services|  120|
+--------------------+-----+



In [8]:
# Collect the filtered metadata to the driver as a pandas frame and write it
# out without the index; pandas infers gzip compression from the `.gz` suffix.
(extras
 .toPandas()
 .to_csv(path_or_buf='../datasets/extras_eur.csv.gz', index=False))

# Splits

In [20]:
# Stock splits restricted to the tickers present in `extras`.
# `date` is snapped to the start of its calendar day (in the session time zone)
# by formatting it as yyyy-MM-dd and parsing that string back to epoch seconds.
splits = (
    session.read
    .csv('../datasets/splits.csv.gz', header=True, inferSchema=True)
    .select(
        F.unix_timestamp(F.from_unixtime('date', 'yyyy-MM-dd'), 'yyyy-MM-dd').alias('unixtime'),
        'ticker',
        (F.col('numerator') / F.col('denominator')).alias('ratio'),
    )
    .join(extras, on='ticker', how='leftsemi')  # filter only; no columns taken from extras
    .orderBy('unixtime')
    .cache()
)

# Number of split events kept.
print(splits.count())

splits.show()

1995
+-------+----------+------------------+
| ticker|  unixtime|             ratio|
+-------+----------+------------------+
|  1WR.F|1409176800|               2.0|
| S7F1.F|1409608800|               5.0|
|UCA1.MU|1409781600|               0.1|
|UCA1.DU|1409781600|               0.1|
|UCA1.BE|1409781600|               0.1|
| UCA1.F|1409781600|               0.1|
|UCA1.HM|1409781600|               0.1|
| 3U6.BE|1410127200|               1.5|
|  3U6.F|1410127200|               1.5|
|  CH5.F|1410213600|               1.5|
|KE0A.BE|1410386400|               4.0|
|  VGR.F|1410386400|               1.0|
| FII.PA|1410472800|               5.0|
|KE0A.BE|1410472800|               4.0|
| HIN.BE|1410472800|             0.125|
|  LI3.F|1410472800|               5.0|
| LI3.SG|1410472800|               5.0|
| HIN.MU|1410472800|             0.125|
|  HIN.F|1410472800|             0.125|
|L1OA.DE|1411336800|0.3333333333333333|
+-------+----------+------------------+
only showing top 20 rows



In [21]:
# Print the column names and types of the cached `splits` frame.
splits.printSchema()

root
 |-- ticker: string (nullable = true)
 |-- unixtime: long (nullable = true)
 |-- ratio: double (nullable = true)



In [22]:
# Per-column statistics, spelled out (the same set summary() computes by
# default), rendered as a pandas frame.
# NOTE(review): the recorded output shows ratio min 0.0 and max 100000.0, which
# look like data errors - TODO confirm before using the ratios.
splits.summary('count', 'mean', 'stddev', 'min', '25%', '50%', '75%', 'max').toPandas()

Unnamed: 0,summary,ticker,unixtime,ratio
0,count,1995,1995.0,1995.0
1,mean,,1485416719.3984962,54.266863869879785
2,stddev,,44188535.10617533,2239.2273773890897
3,min,01T.DU,1409176800.0,0.0
4,25%,,1442527200.0,1.0
5,50%,,1493848800.0,1.3
6,75%,,1521068400.0,2.0
7,max,ZU1.F,1566252000.0,100000.0


In [23]:
# Collect the filtered splits to the driver as a pandas frame and write them
# out without the index; pandas infers gzip compression from the `.gz` suffix.
(splits
 .toPandas()
 .to_csv(path_or_buf='../datasets/splits_eur.csv.gz', index=False))

# Dividends

In [13]:
# Dividend payments restricted to the tickers present in `extras`.
# `date` is snapped to the start of its calendar day (in the session time zone)
# by formatting it as yyyy-MM-dd and parsing that string back to epoch seconds.
divs = (
    session.read
    .csv('../datasets/dividends.csv.gz', header=True, inferSchema=True)
    .select(
        F.unix_timestamp(F.from_unixtime('date', 'yyyy-MM-dd'), 'yyyy-MM-dd').alias('unixtime'),
        'ticker',
        'amount',
    )
    .join(extras, on='ticker', how='leftsemi')  # filter only; no columns taken from extras
    .orderBy('unixtime')
    .cache()
)

# Number of dividend events kept.
print(divs.count())

# Preview the earliest few rows as a pandas frame.
divs.limit(5).toPandas()

13950


Unnamed: 0,ticker,unixtime,amount
0,ASF.F,1409176800,0.19
1,ASF.BE,1409176800,0.19
2,BL8.SG,1409176800,0.13
3,BL8.DU,1409176800,0.13
4,BL8.BE,1409176800,0.13


In [14]:
# Print the column names and types of the cached `divs` frame.
divs.printSchema()

root
 |-- ticker: string (nullable = true)
 |-- unixtime: long (nullable = true)
 |-- amount: double (nullable = true)



In [15]:
# Per-column statistics, spelled out (the same set summary() computes by
# default), rendered as a pandas frame.
divs.summary('count', 'mean', 'stddev', 'min', '25%', '50%', '75%', 'max').toPandas()

Unnamed: 0,summary,ticker,unixtime,amount
0,count,13950,13950.0,13950.0
1,mean,,1490413743.483871,8.299220277491038
2,stddev,,45893962.8144985,42.5627935860989
3,min,01T.DU,1409176800.0,0.0
4,25%,,1450652400.0,0.132
5,50%,,1491516000.0,0.353979
6,75%,,1529013600.0,3.0
7,max,ZU1.F,1566856800.0,2225.0


In [16]:
# Collect the filtered dividends to the driver as a pandas frame and write them
# out without the index; pandas infers gzip compression from the `.gz` suffix.
(divs
 .toPandas()
 .to_csv(path_or_buf='../datasets/dividends_eur.csv.gz', index=False))

# Quotes

In [17]:
# Daily close quotes, with `date` snapped to the start of its calendar day (in
# the session time zone) via a yyyy-MM-dd format/parse round trip.
# NOTE(review): the semi-join is against `divs`, not `extras` as in the splits
# and dividends cells, so only tickers with at least one dividend row are kept.
# Confirm this narrower filter is intended.
quotes = (
    session.read
    .csv('../datasets/quotes.csv.gz', header=True, inferSchema=True)
    .select(
        F.unix_timestamp(F.from_unixtime('date', 'yyyy-MM-dd'), 'yyyy-MM-dd').alias('unixtime'),
        'ticker',
        'close',
    )
    .join(divs, on='ticker', how='leftsemi')  # filter only; no columns taken from divs
    .orderBy('unixtime')
    .cache()
)

# Number of quote rows kept.
print(quotes.count())

quotes.show()

1960393
+-------+----------+------------------+
| ticker|  unixtime|             close|
+-------+----------+------------------+
|  MT.AS|1409176800|22.260799407958984|
|BAYA.DE|1409176800|25.512500762939453|
| AZM.MI|1409176800|18.391599655151367|
| AGN.AS|1409176800|  5.96999979019165|
|BKIA.MC|1409176800| 5.863999843597412|
| SAB.MC|1409176800| 1.938789963722229|
|  PUM.F|1409176800|18.737199783325195|
| NEM.DE|1409176800| 6.581669807434082|
| MDF.MC|1409176800|25.820199966430664|
|  EZQ.F|1409176800|1.6920000314712524|
| DIC.DE|1409176800| 6.800000190734863|
| CPR.MI|1409176800|2.9049999713897705|
| COK.DE|1409176800| 16.63249969482422|
|BIO3.DE|1409176800|28.450000762939453|
| V3S.DE|1409176800|  2.34975004196167|
| UBK.HM|1409176800| 9.858050346374512|
|SRT3.DU|1409176800|22.165000915527344|
| NEM.DU|1409176800| 6.516670227050781|
|  MUM.F|1409176800| 5.814620018005371|
| MUM.DE|1409176800| 5.716060161590576|
+-------+----------+------------------+
only showing top 20 rows



In [18]:
# Per-column statistics, spelled out (the same set summary() computes by
# default). In the recorded output the `close` count is lower than the row
# count, i.e. some quotes have no close value.
quotes.summary('count', 'mean', 'stddev', 'min', '25%', '50%', '75%', 'max').show()

+-------+-------+-------------------+--------------------+
|summary| ticker|           unixtime|               close|
+-------+-------+-------------------+--------------------+
|  count|1960393|            1960393|             1949872|
|   mean|   null|1.488570007751915E9|   27.45143757900416|
| stddev|   null|4.547710065372337E7|  102.15297854231297|
|    min| 01T.DU|         1409176800|9.999999747378752E-5|
|    25%|   null|         1449010800|   5.960999965667725|
|    50%|   null|         1488754800|   17.30500030517578|
|    75%|   null|         1528063200|  31.170000076293945|
|    max|  ZU1.F|         1566943200|   3925.699951171875|
+-------+-------+-------------------+--------------------+



In [19]:
# Collect the filtered quotes (about 1.96M rows in the recorded run) to the
# driver as a pandas frame and write them out without the index; pandas infers
# gzip compression from the `.gz` suffix.
(quotes
 .toPandas()
 .to_csv(path_or_buf='../datasets/quotes_eur.csv.gz', index=False))