**We filter and analyze our datasets based on the following criteria:**

- we have meta information about the ticker
- we limit our analysis to stocks in euro
- we focus on the closing market value
    - *i.e. the value once analysts have processed events*



In [1]:
# Make the local Spark installation importable. This has to run before the
# `pyspark` imports in the next cell.
import findspark
findspark.init()

In [2]:
import pandas as pd

import pyspark.sql.functions as F

from pyspark.sql import SparkSession

In [3]:
# Never truncate the column list when a pandas DataFrame is rendered.
pd.set_option('display.max_columns', None)

In [5]:
# Local Spark session named 'analysis' that uses every available core.
# Later cells pull whole DataFrames onto the driver with `toPandas()`, which
# is presumably why the memory and result-size limits are raised to 12g.
builder = SparkSession.builder.master('local[*]').appName('analysis')
builder = builder.config('spark.driver.memory', '12g')
builder = builder.config('spark.executor.memory', '12g')
builder = builder.config('spark.driver.maxResultSize', '12g')

session = builder.getOrCreate()

# Extras

In [6]:
# Ticker metadata, restricted to EUR-quoted instruments with one row per ticker.
extras_raw = session.read.csv(
    '../datasets/extras.csv.gz',
    header=True,
    inferSchema=True,
)

extras = (
    extras_raw
    .where('currency == "EUR"')
    # Keeps a single row per ticker; which duplicate survives is not controlled here.
    .dropDuplicates(['ticker'])
    .select(
        'ticker',
        'name',
        'country',
        F.col('category name').alias('category'),
    )
    .cache()
)

# Row count first, then a five-row preview rendered through pandas.
print(extras.count())

extras.limit(5).toPandas()

26976


Unnamed: 0,ticker,name,country,category
0,0BN.BE,"HABIT RESTAURANT.A DL-,01",Germany,
1,0O2W.L,GFT Technologies SE,United Kingdom,
2,13J.F,Jinmao (China) Hotel Investments and Managemen...,France,
3,1TT.BE,FACTOR THERAPEUT.,Germany,
4,233.F,Springland International Holdings Limited,France,


In [7]:
# Collect the metadata to pandas and summarise every column, numeric or not
# (count / unique / top / freq for the text columns).
extras.toPandas().describe(include='all')

Unnamed: 0,ticker,name,country,category
count,26976,26843,26774,5090
unique,26976,19071,16,131
top,QIA.DU,Unilever N.V.,Germany,Diversified Machinery
freq,1,8,16610,304


In [8]:
# Number of tickers per `country` value, most frequent first (top 20).
(
    extras
    .groupBy('country')
    .count()
    .orderBy(F.desc('count'))
    .limit(20)
    .toPandas()
)

Unnamed: 0,country,count
0,Germany,16610
1,France,8668
2,Italy,396
3,United Kingdom,239
4,,202
5,Greece,173
6,Spain,147
7,Belgium,136
8,Finland,106
9,Netherlands,95


In [9]:
# Number of tickers per `category` value, most frequent first (top 20).
(
    extras
    .groupBy('category')
    .count()
    .orderBy(F.desc('count'))
    .limit(20)
    .toPandas()
)

Unnamed: 0,category,count
0,,21886
1,Diversified Machinery,304
2,Money Center Banks,224
3,Wireless Communications,190
4,Biotechnology,158
5,Asset Management,158
6,Diversified Utilities,150
7,Technical & System Software,140
8,Information Technology Services,129
9,Business Services,120


In [10]:
# Persist the EUR-only metadata for later use, without the pandas index column.
(
    extras
    .toPandas()
    .to_csv(path_or_buf='../datasets/extras_eur.csv.gz', index=False)
)

# Dividends

In [11]:
# Dividend records for the tickers retained in `extras`.
divs_raw = session.read.csv(
    '../datasets/dividends.csv.gz',
    header=True,
    inferSchema=True,
)

divs = (
    divs_raw
    # `from_unixtime` produces a formatted string column, not a timestamp.
    .withColumn('datetime', F.from_unixtime(F.col('date')))
    # Semi join: keep rows whose ticker appears in `extras`, adding no columns.
    .join(extras, on='ticker', how='leftsemi')
    .drop('_c0', 'date')
    .cache()
)

# Row count first, then a five-row preview rendered through pandas.
print(divs.count())

divs.limit(5).toPandas()

152916


Unnamed: 0,ticker,amount,datetime
0,ADS.DE,2.6,2018-05-10 09:00:00
1,ADS.DE,1.5,2015-05-08 09:00:00
2,ADS.DE,1.6,2016-05-13 09:00:00
3,ADS.DE,2.0,2017-05-12 09:00:00
4,ADS.DE,2.6,2018-05-11 09:00:00


In [12]:
# Print the column names and types of `divs`; note that `datetime` is a
# string column because it was built with `from_unixtime`.
divs.printSchema()

root
 |-- ticker: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- datetime: string (nullable = true)



In [13]:
# Per-column count / mean / stddev / min / max, collected to pandas for display.
# The mean and stddev reported for the text column `ticker` carry no meaning.
divs.describe().toPandas()

Unnamed: 0,summary,ticker,amount,datetime
0,count,152916,152916.0,152916
1,mean,219.67045454545453,7.05966664177752,
2,stddev,232.7538109108129,132.45180480982657,
3,min,016.BE,2.5e-05,2014-08-26 08:00:00
4,max,ZZMS.SG,10560.0,2019-08-26 09:00:00


In [14]:
# Persist the EUR-only dividends for later use, without the pandas index column.
(
    divs
    .toPandas()
    .to_csv(path_or_buf='../datasets/dividends_eur.csv.gz', index=False)
)

# Indicators

In [15]:
# Quotes for the tickers retained in `extras`; only the close value is kept.
quotes = (session
          .read.csv('../datasets/quotes.csv.gz', header=True, inferSchema=True)
          # `from_unixtime` produces a formatted string column, not a timestamp.
          .withColumn('datetime', F.from_unixtime('date'))
          .select('ticker', 'close', 'datetime')
          # Semi join: keep rows whose ticker appears in `extras`, adding no columns.
          .join(extras, 'ticker', 'leftsemi')
          # Cached like `extras` and `divs`: this frame feeds several actions
          # (count, preview, describe, export). Without the cache each action
          # would re-read and re-join the compressed CSV from scratch.
          .cache())

# Row count first (this also fills the cache), then a five-row preview.
print(quotes.count())

quotes.limit(5).toPandas()

20406739


Unnamed: 0,ticker,close,datetime
0,ADS.DE,59.450001,2014-08-26 09:00:00
1,ADS.DE,58.880001,2014-08-27 09:00:00
2,ADS.DE,57.419998,2014-08-28 09:00:00
3,ADS.DE,57.040001,2014-08-29 09:00:00
4,ADS.DE,57.360001,2014-09-01 09:00:00


In [16]:
# Per-column count / mean / stddev / min / max, collected to pandas for display.
# The mean and stddev reported for the text column `ticker` carry no meaning.
quotes.describe().toPandas()

Unnamed: 0,summary,ticker,close,datetime
0,count,20406739,20240509.0,20406739
1,mean,239.90963511972635,66.36726410930955,
2,stddev,193.58065405941625,1019.1260207068376,
3,min,016.BE,0.0,2014-08-26 08:00:00
4,max,ZZMS.SG,75600.0,2019-08-26 16:40:22


In [18]:
# Persist the EUR-only quotes for later use, without the pandas index column.
# `toPandas()` collects every row of `quotes` onto the driver before writing.
(
    quotes
    .toPandas()
    .to_csv(path_or_buf='../datasets/quotes_eur.csv.gz', index=False)
)