In [1]:
# Spark could not be installed in the local PyCharm environment,
# so this notebook runs on Google Colab, as suggested in the lecture.
# Mount Google Drive at /content/drive so the files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install Java 8 (headless OpenJDK) and PySpark in the Colab runtime.
# Refresh the apt package index before installing anything.
!apt-get update
# Install the JDK quietly (-qq) and discard its standard output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
# Point JAVA_HOME at the Java 8 installation directory.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Select the Java 8 binary as the default `java`, then print the active version.
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version
# Install the PySpark package (no version is pinned here).
!pip install pyspark

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.180% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.180% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Waiting for h                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Waiting for h                                                                               Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [3 InRelease 1                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/u

In [3]:
# Sanity check that PySpark is usable: get the running SparkContext, or create one.
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
# Bare expression on the last line so the notebook displays the context.
sc

In [7]:
# Load the dollar dataset from Google Drive and print every raw line for inspection.
dollar_dataset_path = "/content/drive/MyDrive/Colab Notebooks/BDP/Datasets-20210410/DollarDataset.txt"
dlRDD = sc.textFile(dollar_dataset_path)
raw_lines = dlRDD.collect()
print(raw_lines)

# After inspecting the data, we see that the fields in each line are separated by a tab (\t) and
# that some values are missing. We assume these values are missing only for days on which the market was closed (weekends, the Ramadan feast, etc.).
# So we can drop them and continue with the calculation.

# Note that at the beginning of 2005, six zeros were dropped from the Turkish lira. This causes a huge one-off decrease in the dollar value, not an increase.
# Thus, I do not adjust the dollar data for these extra zeros.

def split_and_process_values(line):
  """Parse one tab-separated dataset line into a (date, dollar_value) pair.

  The line is expected to look like "<row id>\t<date>\t<value>", where the value
  uses "." as the thousands separator and "," as the decimal separator
  (e.g. "276.210,00").

  Args:
    line: One raw text line of the dataset.

  Returns:
    (date, value) with value converted to float, or (date, "None") when the
    value is missing: the third field is absent, empty, or only whitespace.
    The "None" string is the sentinel that callers filter on. If the line has
    no date field at all, date is returned as an empty string.

  Raises:
    ValueError: If a non-empty value cannot be converted to a float.
  """
  fields = line.split("\t")
  # Guard against short lines (e.g. a dropped trailing tab or an empty line).
  date = fields[1] if len(fields) > 1 else ""
  # Strip so that a stray "\r" or spaces are not mistaken for a real value.
  raw_value = fields[2].strip() if len(fields) > 2 else ""
  if not raw_value:
    return (date, "None")
  # float("276.210,00") fails: drop the thousands separators first, then turn
  # the decimal comma into a dot, giving float("276210.00").
  normalized = raw_value.replace(".", "").replace(",", ".")
  return (date, float(normalized))

# Parse every line into (date, value) and drop the days whose value is missing
# (the parser marks those with the "None" string sentinel).
dlRDD = dlRDD.map(split_and_process_values)
dlRDD = dlRDD.filter(lambda x: x[1] != "None")
print(dlRDD.take(105))

# Number the remaining records once; both keyed views below reuse this RDD
# instead of running zipWithIndex twice.
indexedRDD = dlRDD.zipWithIndex()

# Key each record by its own index i: (i, (date, value)).
dlRDD1 = indexedRDD.map(lambda x: (x[1], x[0]))
print(dlRDD1.take(10))

# Key each record by i + 1, so record i lines up with record i + 1 in the join.
dlRDD2 = indexedRDD.map(lambda x: (x[1] + 1, x[0]))
print(dlRDD2.take(10))

# Joining on the key yields (i, (record_i, record_(i-1))), i.e. each day paired
# with the previous available day. The first record has no predecessor and drops out.
combRDD = dlRDD1.join(dlRDD2)
combRDD = combRDD.sortBy(lambda x: x[0], ascending=True) # sort by index
print(combRDD.take(5))

['1\t02-01-1950\t2,80', '2\t03-01-1950\t2,80', '3\t04-01-1950\t2,80', '4\t05-01-1950\t2,80', '5\t06-01-1950\t2,80', '6\t09-01-1950\t2,80', '7\t10-01-1950\t2,80', '8\t11-01-1950\t2,80', '9\t12-01-1950\t2,80', '10\t13-01-1950\t2,80', '11\t16-01-1950\t2,80', '12\t17-01-1950\t2,80', '13\t18-01-1950\t2,80', '14\t19-01-1950\t2,80', '15\t20-01-1950\t2,80', '16\t23-01-1950\t2,80', '17\t24-01-1950\t2,80', '18\t25-01-1950\t2,80', '19\t26-01-1950\t2,80', '20\t27-01-1950\t2,80', '21\t30-01-1950\t2,80', '22\t31-01-1950\t2,80', '23\t01-02-1950\t2,80', '24\t02-02-1950\t2,80', '25\t03-02-1950\t2,80', '26\t06-02-1950\t2,80', '27\t07-02-1950\t2,80', '28\t08-02-1950\t2,80', '29\t09-02-1950\t2,80', '30\t10-02-1950\t2,80', '31\t13-02-1950\t2,80', '32\t14-02-1950\t2,80', '33\t15-02-1950\t2,80', '34\t16-02-1950\t2,80', '35\t17-02-1950\t2,80', '36\t20-02-1950\t2,80', '37\t21-02-1950\t2,80', '38\t22-02-1950\t2,80', '39\t23-02-1950\t2,80', '40\t24-02-1950\t2,80', '41\t27-02-1950\t2,80', '42\t28-02-1950\t2,80', 

In [8]:
# Each combRDD element is (index, ((date, value), (previous_date, previous_value))).
# Daily percentage increase = 100 * (value - previous_value) / previous_value.
# The results are stored under new names instead of being reassigned to combRDD,
# so this cell can be re-run without first rebuilding combRDD.
dailyIncreaseRDD = combRDD.map(lambda x: (x[1][0][0], 100*(x[1][0][1] - x[1][1][1])/x[1][1][1] ) ) # (date, percentage daily increase)
sortedIncreaseRDD = dailyIncreaseRDD.sortBy(lambda x: x[1], ascending=False) # largest increase first
print("Top 5 greatest daily increase (by percentage) in dollar between 1950 and 2018:")
print(sortedIncreaseRDD.take(5))

Top 5 greatest daily increase (by percentage) in dollar between 1950 and 2018:
[('22-08-1960', 221.42857142857144), ('25-01-1980', 100.0), ('10-08-1970', 65.0), ('23-02-2001', 39.75657690281898), ('06-04-1994', 38.88985856101813)]


In [None]:
# Professor, I think it may be more accurate to run this analysis only from 25 January 1980 onward, the date Turkey moved to a free-market economy, because until then the dollar rate was set by the state in discrete steps.