# Exam example 1

## MCQ

1. [b] - since the blocks cannot contain pieces of different files, the 524 MB file needs to be divided into 2 blocks
2. [a] - 2 actions are performed on logsRDD (1 is actually on one of its descendants)

## Part 2

### Ex01

Structure of `prices.txt`:

- Sampling rate: 5 minutes
- Line format: `stockID,date,hour:minute,price`

Analyze behavior for year 2016 - how many times each stock appeared with a price > 10€?

Count in how many distinct dates each stock was > 10€; only keep the ones which appear >5 times.

Output file structure:

    stockID,n_dates_more_10

In [1]:
# file path
##### actually: 
# in_file = sys.argv[1]
in_file = '/data/students/bigdata_internet/exam_examples_data/example1_data/Prices.txt'

# out_file = sys.argv[2]
out_file = '/user/s315054/exam_sim/example1/p01'

In [2]:
!hdfs dfs -rm -r /user/s315054/exam_sim/example1/p01/.

23/02/02 20:57:11 INFO fs.TrashPolicyDefault: Moved: 'hdfs://BigDataHA/user/s315054/exam_sim/example1/p01' to trash at: hdfs://BigDataHA/user/s315054/.Trash/Current/user/s315054/exam_sim/example1/p01


In [3]:
in_DF = spark.read.load(in_file, format='csv', sep=',', header=False, inferSchema=True)\
        .withColumnRenamed('_c0', 'stockID')\
        .withColumnRenamed('_c1', 'date')\
        .withColumnRenamed('_c2', 'time')\
        .withColumnRenamed('_c3', 'price')
in_DF.show(5)

+-------+----------+-----+-----+
|stockID|      date| time|price|
+-------+----------+-----+-----+
|   FCAU|2016/06/20|16:10|10.43|
|   FCAU|2016/06/20|16:15|10.45|
|   FCAU|2016/06/20|16:20|10.53|
|   FCAU|2016/06/20|16:25| 11.0|
|   FCAU|2016/06/20|16:30|10.99|
+-------+----------+-----+-----+
only showing top 5 rows



In [4]:
# Remove elements which are not in 2016:
spark.udf.register('getYear', lambda date: date.split('/')[0])

df2016 = in_DF.select('stockID', 'date', 'price')\
                .filter('getYear(date) == 2016 AND price > 10')\
                .select('stockID', 'date')\
                .distinct()
# Only id, date and price of 2016 records with price > 10

In [5]:
df_more_10 = df2016.groupBy('stockID').agg({'*':'count'}).withColumnRenamed('count(1)', 'n_date_more_10')

In [6]:
df_more_10.write.csv(out_file, header=False)

                                                                                

In [7]:
df_more_10.show()

+-------+--------------+
|stockID|n_date_more_10|
+-------+--------------+
|   GOOG|             8|
|   FCAU|             7|
|   AMZN|             1|
+-------+--------------+



### Ex02

Select the highest price for each pair (stock, date) in 2016.

In [8]:
out_1 = '/user/s315054/exam_sim/example1/p02_a'
out_2 = '/user/s315054/exam_sim/example1/p02_b'

In [9]:
!hdfs dfs -rm -r /user/s315054/exam_sim/example1/p02_a

23/02/02 21:33:22 INFO fs.TrashPolicyDefault: Moved: 'hdfs://BigDataHA/user/s315054/exam_sim/example1/p02_a' to trash at: hdfs://BigDataHA/user/s315054/.Trash/Current/user/s315054/exam_sim/example1/p02_a


In [10]:
!hdfs dfs -rm -r /user/s315054/exam_sim/example1/p02_b

23/02/02 21:33:25 INFO fs.TrashPolicyDefault: Moved: 'hdfs://BigDataHA/user/s315054/exam_sim/example1/p02_b' to trash at: hdfs://BigDataHA/user/s315054/.Trash/Current/user/s315054/exam_sim/example1/p02_b


#### Part a

In [11]:
df_2016_only = in_DF.selectExpr('stockID', 'date', 'CAST(price AS float)')\
                    .filter('getYear(date) == 2016')

In [12]:
# Compute for each date the highest price, then sort
# Note: no need to transform date format - it works correctly
max_price_DF = df_2016_only.groupBy('stockID', 'date').agg({'price':'max'}).sort('stockID', 'date')

In [13]:
max_price_DF.write.csv(out_1, header=False)

In [14]:
max_price_DF.show()

+-------+----------+----------+
|stockID|      date|max(price)|
+-------+----------+----------+
|   AMZN|2016/01/04|      14.0|
|   FCAU|2016/06/20|      11.0|
|   FCAU|2016/06/21|      12.0|
|   FCAU|2016/07/20|      11.0|
|   FCAU|2016/07/21|      12.0|
|   FCAU|2016/12/01|      9.95|
|   FCAU|2016/12/02|       9.9|
|   FCAU|2016/12/03|      10.1|
|   FCAU|2016/12/04|      10.5|
|   FCAU|2016/12/05|     10.53|
|   GOOG|2016/01/01|      43.0|
|   GOOG|2016/01/02|      45.0|
|   GOOG|2016/01/03|      46.0|
|   GOOG|2016/01/04|      40.0|
|   GOOG|2016/06/20|      51.0|
|   GOOG|2016/06/21|     51.99|
|   GOOG|2016/07/20|     54.99|
|   GOOG|2016/07/21|     51.99|
+-------+----------+----------+



#### Part b

Positive weekly trend: difference between highest stock price of last day and the one of the first one is > 0.

In [115]:
# Start from the 2016 dataframe with max prices
from datetime import datetime

# Goal: stock IDs of ones having at least 'NW' p.w.t.

#NW = sys.argv[3]
NW = 1

# Add week numbers
spark.udf.register('getWeek', lambda date: datetime.strptime(date, '%Y/%m/%d').strftime('%V'))

week_DF = max_price_DF.withColumnRenamed('max(price)', 'max_price')\
            .selectExpr('stockID', 'date', 'getWeek(date) AS week', 'max_price')

23/01/24 11:54:53 WARN analysis.SimpleFunctionRegistry: The function getweek replaced a previously registered function.


In [113]:
# This part is NOT EFFICIENT
####################################################
# Get 1st day of the week by taking the min 'date' in each week
first_days_DF = week_DF.groupBy('stockID', 'week').agg({'date':'min'}).withColumnRenamed('min(date)', 'first')

# Get last day of the week by taking the max 'date' in each week
last_days_DF = week_DF.groupBy('stockID', 'week').agg({'date':'max'}).withColumnRenamed('max(date)', 'last')

# Now to get the prices join on the date and stockID
max_and_min_DF = first_days_DF.join(last_days_DF, ['stockID', 'week'], 'inner')

max_and_min_DF.show(5)
####################################################

+-------+----+----------+----------+
|stockID|week|     first|      last|
+-------+----+----------+----------+
|   FCAU|  25|2016/06/20|2016/06/21|
|   GOOG|  01|2016/01/04|2016/01/04|
|   FCAU|  49|2016/12/05|2016/12/05|
|   FCAU|  48|2016/12/01|2016/12/04|
|   GOOG|  53|2016/01/01|2016/01/03|
+-------+----+----------+----------+
only showing top 5 rows



In [116]:
week_DF.createOrReplaceTempView('table_weeks')

max_and_min_DF = spark.sql("""
                        SELECT stockID, week, max(date) AS last, min(date) AS first
                        FROM table_weeks
                        GROUP BY stockID, week""")

max_and_min_DF.show(5)

+-------+----+----------+----------+
|stockID|week|      last|     first|
+-------+----+----------+----------+
|   FCAU|  25|2016/06/21|2016/06/20|
|   GOOG|  01|2016/01/04|2016/01/04|
|   FCAU|  49|2016/12/05|2016/12/05|
|   FCAU|  48|2016/12/04|2016/12/01|
|   GOOG|  53|2016/01/03|2016/01/01|
+-------+----+----------+----------+
only showing top 5 rows



In [117]:
#### MY SOLUTION

max_and_min_prices = max_and_min_DF.join(week_DF.selectExpr('stockID', 'date AS first', 'max_price AS price_first'), ['stockID', 'first'])\
.join(week_DF.selectExpr('stockID', 'date AS last', 'max_price AS price_last'), ['stockID', 'last'])

max_and_min_prices.show(10)

+-------+----------+----------+----+-----------+----------+
|stockID|      last|     first|week|price_first|price_last|
+-------+----------+----------+----+-----------+----------+
|   AMZN|2016/01/04|2016/01/04|  01|       14.0|      14.0|
|   FCAU|2016/06/21|2016/06/20|  25|       11.0|      12.0|
|   FCAU|2016/07/21|2016/07/20|  29|       11.0|      12.0|
|   FCAU|2016/12/04|2016/12/01|  48|       9.95|      10.5|
|   FCAU|2016/12/05|2016/12/05|  49|      10.53|     10.53|
|   GOOG|2016/01/03|2016/01/01|  53|       43.0|      46.0|
|   GOOG|2016/01/04|2016/01/04|  01|       40.0|      40.0|
|   GOOG|2016/06/21|2016/06/20|  25|       51.0|     51.99|
|   GOOG|2016/07/21|2016/07/20|  29|      54.99|     51.99|
+-------+----------+----------+----+-----------+----------+



                                                                                

In [127]:
#### ACTUAL SOLUTION
max_and_min_DF.createOrReplaceTempView('max_min')

max_and_min_prices_2 = spark.sql(""" 
                                SELECT max_min.stockID, max_min.week, table_weeks.max_price AS price_first, max_min.last, max_min.first
                                FROM max_min, table_weeks
                                WHERE max_min.stockID == table_weeks.stockID AND max_min.first == table_weeks.date
                                """)

max_and_min_prices_2.createOrReplaceTempView('t1')

max_and_min_prices = spark.sql("""
                            SELECT t1.stockID, t1.week, table_weeks.max_price AS price_last, t1.price_first, t1.last, t1.first
                            FROM t1, table_weeks
                            WHERE t1.stockID == table_weeks.stockID AND t1.last == table_weeks.date
                            """)


max_and_min_prices.show(10)


+-------+----+----------+-----------+----------+----------+
|stockID|week|price_last|price_first|      last|     first|
+-------+----+----------+-----------+----------+----------+
|   AMZN|  01|      14.0|       14.0|2016/01/04|2016/01/04|
|   FCAU|  25|      12.0|       11.0|2016/06/21|2016/06/20|
|   FCAU|  29|      12.0|       11.0|2016/07/21|2016/07/20|
|   FCAU|  48|      10.5|       9.95|2016/12/04|2016/12/01|
|   FCAU|  49|     10.53|      10.53|2016/12/05|2016/12/05|
|   GOOG|  53|      46.0|       43.0|2016/01/03|2016/01/01|
|   GOOG|  01|      40.0|       40.0|2016/01/04|2016/01/04|
|   GOOG|  25|     51.99|       51.0|2016/06/21|2016/06/20|
|   GOOG|  29|     51.99|      54.99|2016/07/21|2016/07/20|
+-------+----+----------+-----------+----------+----------+



In [128]:
# Isolate records for pos. week. trends:
pwt_DF = max_and_min_prices.filter("price_first < price_last")

pwt_DF.show()

+-------+----+----------+-----------+----------+----------+
|stockID|week|price_last|price_first|      last|     first|
+-------+----+----------+-----------+----------+----------+
|   FCAU|  25|      12.0|       11.0|2016/06/21|2016/06/20|
|   FCAU|  29|      12.0|       11.0|2016/07/21|2016/07/20|
|   FCAU|  48|      10.5|       9.95|2016/12/04|2016/12/01|
|   GOOG|  53|      46.0|       43.0|2016/01/03|2016/01/01|
|   GOOG|  25|     51.99|       51.0|2016/06/21|2016/06/20|
+-------+----+----------+-----------+----------+----------+



In [129]:
# Count the n. of pwt for each stock, filter directly
selected_DF = pwt_DF.groupBy('stockID').agg({'*':'count'}).filter(f'count(1) >= {NW}')

In [130]:
selected_DF.show()

+-------+--------+
|stockID|count(1)|
+-------+--------+
|   GOOG|       2|
|   FCAU|       3|
+-------+--------+



In [110]:
selected_DF.select('stockID').write.csv(out_2, header=False)

                                                                                

In [111]:
### ok