In [1]:
import sys
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession, DataFrame
import pyspark.sql.functions as F
from pyspark.sql.window import Window 
from pyspark.sql.types import DateType

os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["YARN_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf/"

In [2]:
spark = SparkSession.builder \
                    .master("yarn") \
                    .appName("Project_7_2") \
                    .getOrCreate()

24/09/03 10:50:03 WARN Utils: Your hostname, fhmugce59tbv43vsp0ua resolves to a loopback address: 127.0.1.1; using 172.16.0.3 instead (on interface eth0)
24/09/03 10:50:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/03 10:50:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/03 10:50:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/03 10:50:06 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
# функция для чтения файла "/user/denis19/data/geo/cities/actual/geo.csv" и переобразования
def geo_transform(geo_path: str, sql) -> DataFrame:
    geo_transform_df = (sql.read.option("header", True)
            .option("delimiter", ";")
            .csv(geo_path)
            .withColumn("lat_g", F.regexp_replace("lat", ",", ".").cast('float'))
            .withColumn("lng_g", F.regexp_replace("lng", ",", ".").cast('float'))
            .drop("lat", "lng")
            .persist()
            )
    return geo_transform_df

In [4]:
# Test
geo_transform_df = geo_transform("/user/denis19/data/geo/cities/actual/geo.csv", spark)
geo_transform_df.show()

                                                                                

+---+----------+--------+--------+
| id|      city|   lat_g|   lng_g|
+---+----------+--------+--------+
|  1|    Sydney| -33.865|151.2094|
|  2| Melbourne|-37.8136|144.9631|
|  3|  Brisbane|-27.4678|153.0281|
|  4|     Perth|-31.9522|115.8589|
|  5|  Adelaide|-34.9289|138.6011|
|  6|Gold Coast|-28.0167|   153.4|
|  7|Cranbourne|-38.0996|145.2834|
|  8|  Canberra|-35.2931|149.1269|
|  9| Newcastle|-32.9167|  151.75|
| 10|Wollongong|-34.4331|150.8831|
| 11|   Geelong|  -38.15|  144.35|
| 12|    Hobart|-42.8806| 147.325|
| 13|Townsville|-19.2564|146.8183|
| 14|   Ipswich|-27.6167|152.7667|
| 15|    Cairns|-16.9303|145.7703|
| 16| Toowoomba|-27.5667|  151.95|
| 17|    Darwin|-12.4381|130.8411|
| 18|  Ballarat|  -37.55|  143.85|
| 19|   Bendigo|  -36.75|144.2667|
| 20|Launceston|-41.4419| 147.145|
+---+----------+--------+--------+
only showing top 20 rows



In [5]:
# функция чтения "/user/master/data/geo/events" с переименованием стобцов lat на lat_e и lon на lon_e
def events_transform(events_path: str, sql) -> DataFrame:
    events_transform_df = (sql
                  .read.parquet(f'{events_path}')
                  .where('event_type = "message"')
                  .select("event.message_id", "event.message_from","event_type", "lat", "lon", "date")
                  .where('lat IS NOT NULL and lon IS NOT NULL')
                  .withColumnRenamed("lat", "lat_e")  # переименование lat на lat_e
                  .withColumnRenamed("lon", "lon_e")  # переименование lon на lon_e
                  .persist()
                  )
    return events_transform_df

In [6]:
# Test
events_transform_df = events_transform("/user/master/data/geo/events", spark)
events_transform_df.show()

24/09/03 10:50:59 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
[Stage 6:>                                                          (0 + 1) / 1]

+----------+------------+----------+-------------------+------------------+----------+
|message_id|message_from|event_type|              lat_e|             lon_e|      date|
+----------+------------+----------+-------------------+------------------+----------+
|   1124422|       69134|   message|-16.046503973624315|145.82512282136813|2022-05-30|
|   1092067|      131912|   message|-36.858935432847716|145.43211674632613|2022-05-29|
|    643541|      117616|   message| -34.81927333434335| 138.8583508497529|2022-05-27|
|   1109058|      133551|   message| -34.70560960193741|149.36954006694884|2022-05-25|
|   1115074|       54215|   message| -16.71130585448509|146.15829631028848|2022-05-22|
|   1131184|      103235|   message| -33.94742527623598|151.32387878072961|2022-05-21|
|   1131046|      107808|   message|-11.539551427762717|131.17148068495302|2022-05-21|
|    311869|       85381|   message|-20.394696238744967|150.08215379975078|2022-05-20|
|    817590|       87739|   message|-27.626

                                                                                

In [7]:
def events_with_geo(events_transform_df: DataFrame, geo_transform_df: DataFrame) -> DataFrame:
    events_with_geo_df = (
        events_transform_df
        # при помощи кросс джоин происходит объединение двух датафреймов "events_df" и "geo_df"
        .crossJoin(geo_transform_df)
        # вычисление расстояния между координатами событий и географическими координатами и именуеься столбцом "distance"
        .withColumn("distance", F.lit(2) * F.lit(6371) * F.asin(
        F.sqrt(
            F.pow(F.sin((F.col("lat_e") - F.col("lat_g"))/F.lit(2)),2)
            + F.cos(F.col("lat_g"))*F.cos(F.col("lat_e"))*
            F.pow(F.sin((F.col("lon_e") - F.col("lng_g"))/F.lit(2)),2)
        )))
        .drop("lat_e","lon_e", "lat_g", "lng_g"))
    # при помощи оконной функции происходит выбор ближайщего географического объекта для каждого события, основываясь на минемальном расстоянии
    window = Window().partitionBy("message_id").orderBy(F.col("distance").asc())
    events_with_geo_df = (
        events_with_geo_df
        .withColumn("row_number", F.row_number().over(window))
        .filter(F.col("row_number")==1)
        .drop("row_number", "distance")
        # добавляется уникальный идентификатор "event_id" для каждого события
        .withColumn("event_id", F.monotonically_increasing_id())
        # выбор и переименование столбцов для удобства дальнейшей обработки
        .selectExpr("message_id", "message_from as user_id", "event_id", "event_type", "id as zone_id", "city", "date")
        .persist()
    )
    
    return events_with_geo_df

In [8]:
# Test
events_with_geo_df = events_with_geo(events_transform_df, geo_transform_df)
events_with_geo_df.show()



+----------+-------+--------+----------+-------+-----------+----------+
|message_id|user_id|event_id|event_type|zone_id|       city|      date|
+----------+-------+--------+----------+-------+-----------+----------+
|        26|  59169|       0|   message|     21|     Mackay|2022-01-20|
|       964|   1381|       1|   message|     19|    Bendigo|2022-04-11|
|      1950|  98393|       2|   message|      8|   Canberra|2022-01-24|
|      2453|  29427|       3|   message|      9|  Newcastle|2022-02-26|
|      2529|  19029|       4|   message|      3|   Brisbane|2022-02-21|
|      3091| 143033|       5|   message|      8|   Canberra|2022-01-07|
|      3506|  27162|       6|   message|     20| Launceston|2022-04-05|
|      3764|  26202|       7|   message|     19|    Bendigo|2022-01-04|
|      4590| 162869|       8|   message|     22|Rockhampton|2022-04-18|
|      4823|  43453|       9|   message|     19|    Bendigo|2022-01-31|
|      5385| 152640|      10|   message|      9|  Newcastle|2022

                                                                                

In [9]:
# Зоздать новую функцию
#def travel_calc(events_with_geo_df: DataFrame):
 # группировка данных по "user_id", считая количество путешествий "travel_count" и список городов "travel_array", которые посетил пользователь
    #travel_calc_df = (
        #events_with_geo_df
        #.groupBy("user_id")
        #.agg(
            #F.count("*").alias("travel_count"),
            #F.collect_list("city").alias("travel_array")
        #)
        # объединение агрегированных данных с исходным датафрейм
        #.join(events_with_geo_df, "user_id")
        #.selectExpr("user_id", "travel_count", "travel_array")
        #.persist()
    #)
    
    #return travel_calc_df 
    
def travel_calc(events_with_geo_df: DataFrame) -> DataFrame:
    # Группировка данных по "user_id", считая количество путешествий "travel_count" и список городов "travel_array", которые посетил пользователь
    travel_calc_df = (
        events_with_geo_df
        .groupBy("user_id")
        .agg(
            F.count("*").alias("travel_count"),
            F.collect_list("city").alias("travel_array")
        )
    )
    
    return travel_calc_df

In [10]:
# Test
travel_calc_df = travel_calc(events_with_geo_df)
travel_calc_df.show()



+-------+------------+--------------------+
|user_id|travel_count|        travel_array|
+-------+------------+--------------------+
|      1|           2|[Bunbury, Rockham...|
|      5|           6|[Hobart, Hobart, ...|
|      7|           2|[Melbourne, Melbo...|
|      9|           4|[Hobart, Cairns, ...|
|     17|           1|          [Canberra]|
|     25|           6|[Maitland, Maitla...|
|     26|           2|[Wollongong, Woll...|
|     27|           5|[Cranbourne, Woll...|
|     28|           1|            [Sydney]|
|     29|           1|        [Wollongong]|
|     31|           3|[Bunbury, Bunbury...|
|     32|           1|           [Geelong]|
|     34|           6|[Newcastle, Newca...|
|     43|           3|[Cranbourne, Cran...|
|     44|           2|   [Bendigo, Mackay]|
|     48|          95|[Maitland, Perth,...|
|     50|           1|            [Darwin]|
|     51|           1|           [Bendigo]|
|     52|           4|[Newcastle, Newca...|
|     54|           4|[Bendigo, 

                                                                                

In [35]:
def actual_geo(events_with_geo_df: DataFrame) -> DataFrame:
    # при помощи оконной функции происходит группировка строй по "user_id" и упорядочивает их по "data" в порядке убывания
    window = Window().partitionBy("user_id").orderBy(F.col("date").desc())
    actual_geo_df = (events_with_geo_df
            # добавление нового столбца "row_number", который присваивает каждой строке номер в пределах окна
            .withColumn("row_number", F.row_number().over(window))
            # оставляем строки "row_number" только с номером 1, то есть последние события для каждого пользователя
            .filter(F.col("row_number") == 1)
            .selectExpr("message_id", "user_id", "city", "zone_id") # убрал "travel_count", "travel_array"
            .persist()
           )
    return actual_geo_df

In [36]:
# Test
actual_geo_df = actual_geo(events_with_geo_df)
actual_geo_df.show()

+----------+-------+-----------+-------+
|message_id|user_id|       city|zone_id|
+----------+-------+-----------+-------+
|    693079|     26| Wollongong|     10|
|    727911|     29| Wollongong|     10|
|      4260|    474|  Toowoomba|     16|
|   1108716|    964|     Hobart|     12|
|   1133013|   1677|Rockhampton|     22|
|    806829|   1697|  Toowoomba|     16|
|    999257|   1806|     Mackay|     21|
|    718259|   2040|  Toowoomba|     16|
|    641269|   2214|   Canberra|      8|
|   1023768|   2250|    Geelong|     11|
|    128885|   2453|   Adelaide|      5|
|     51154|   2509| Cranbourne|      7|
|    324844|   2529|     Mackay|     21|
|    755145|   2927| Cranbourne|      7|
|    714293|   3506|     Sydney|      1|
|   1127802|   3764|     Hobart|     12|
|    384146|   5385|  Melbourne|      2|
|    667362|   5409|     Sydney|      1|
|   1119470|   6721|    Bendigo|     19|
|   1090923|   7279| Townsville|     13|
+----------+-------+-----------+-------+
only showing top

24/09/03 11:51:45 WARN CacheManager: Asked to cache already cached data.


In [37]:
def travel_geo(events_with_geo_df: DataFrame) -> DataFrame:
    # при омощи оконной функции группируеи строки "user_id" и "message_id", упорядочивая их по "data"
    window = Window().partitionBy("user_id", "message_id").orderBy(F.col("date"))
    travel_geo_df = (
        events_with_geo_df
        # добавляем новый столбец "dense_rank", который присваивает каждой строке ранг в пределах окна
        .withColumn("dense_rank", F.dense_rank().over(window))
        # добавляем новый столбец "date_diff", который вычисляет разницу в днях между датой события и преобразованным значением "dense_rank"
        .withColumn("date_diff", F.datediff(
            F.col('date').cast(DateType()), F.to_date(F.col("dense_rank").cast("string"), "d")
            )
        )
        .selectExpr("date_diff", "user_id as user", "date", "message_id", "zone_id")
        # группируем строки по "user", "date_diff", "zone_id", посчитывая количество событий "cnt_city" для каждой группы
        .groupBy("user", "date_diff", "zone_id")
        .agg(F.count(F.col("date")).alias("cnt_city"))
        .persist()
        )
    return travel_geo_df

In [38]:
# Test
travel_geo_df = travel_geo(events_with_geo_df)
travel_geo_df.orderBy(F.col("cnt_city").desc()).show()

24/09/03 11:52:07 WARN CacheManager: Asked to cache already cached data.

+-----+---------+-------+--------+
| user|date_diff|zone_id|cnt_city|
+-----+---------+-------+--------+
|62449|    19002|      8|     550|
|36000|    19032|      5|     464|
|62449|    19003|      8|     437|
|62449|    19004|      8|     425|
|14764|    19088|      8|     418|
|82914|    19008|      9|     376|
|36000|    19033|      5|     365|
|62449|    19005|      8|     363|
|14764|    19089|      8|     335|
|82914|    19009|      9|     323|
|36000|    19034|      5|     322|
|82914|    19007|      9|     312|
|14764|    19090|      8|     301|
|36000|    19035|      5|     284|
|62449|    19007|      8|     278|
|62449|    19006|      8|     262|
|82914|    19010|      9|     249|
|14764|    19091|      8|     248|
|36000|    19036|      5|     246|
|23807|    19056|     21|     240|
+-----+---------+-------+--------+
only showing top 20 rows



                                                                                

In [39]:
def home_geo(travel_geo_df: DataFrame) -> DataFrame:
    home_city_df = (
        travel_geo_df
        # оставляем строки, где "cnt_city" больше 27
        .filter((F.col("cnt_city") > 27))
        # добавляем новый столбец "max_dt", который содержит максимальное значение "date_diff" для каждого пользователя   
        .withColumn("max_dt", F.max(F.col("date_diff")).over(Window().partitionBy("user"))) 
        # оставляем только те строки, где "data_diff" равен "max_dt"
        .filter(F.col("date_diff") == F.col("max_dt"))       
        .persist()
    )
    return home_city_df

In [40]:
# Test
home_geo_df = home_geo(travel_geo_df)
home_geo_df.show()

24/09/03 11:52:25 WARN CacheManager: Asked to cache already cached data.


+------+---------+-------+--------+------+
|  user|date_diff|zone_id|cnt_city|max_dt|
+------+---------+-------+--------+------+
| 40989|    19005|     24|      48| 19005|
| 60039|    19079|      5|      37| 19079|
|117522|    19113|     23|      36| 19113|
| 74378|    19056|      1|      34| 19056|
| 12029|    19017|     21|      30| 19017|
| 21174|    19101|     19|      41| 19101|
|125598|    19079|      5|      29| 19079|
|   624|    19102|      7|      40| 19102|
| 83308|    19032|     22|      32| 19032|
| 21395|    19114|      9|      35| 19114|
| 33168|    19039|     21|      30| 19039|
|  4278|    19074|     21|      31| 19074|
| 84783|    19032|     10|      31| 19032|
|126334|    19120|     19|      32| 19120|
| 46233|    19116|     20|      31| 19116|
|110188|    19000|     21|      30| 19000|
| 57678|    18995|     22|      36| 18995|
| 52110|    19083|      2|      39| 19083|
| 15380|    19011|     19|      35| 19011|
| 62333|    19012|      8|      28| 19012|
+------+---

In [41]:
def merge_actual_and_home_geo(actual_geo_df: DataFrame, home_geo_df: DataFrame, geo_transform_df: DataFrame) -> DataFrame:
    geo_transform_df = geo_transform_df.select("id", "city")
    home_geo_df = (
        # объединяем "home_geo_df" с "geo_transform_df" по "zone_id" и "id", что бы получить название домашнего города "home_city". Затем выбираем столбцы "user" и "home_city"
        home_geo_df
        .join(geo_transform_df, home_geo_df.zone_id == geo_transform_df.id, "inner")
        .selectExpr("user", "city as home_city")
    )

    merge_actual_and_home_geo_df = (
    # объединяем "actual_geo_df" c "home_geo_df" по "user_id" и "user" соотвественно с использованем полного внешнего соединения. Затем выбираем нужные столбцы и переименовываем
       actual_geo_df
       .join(home_geo_df, actual_geo_df.user_id == home_geo_df.user, "fullouter")
       .selectExpr("user_id", "city as act_city", "home_city")  # Убрал поля "travel_count", "travel_array"
       .persist()
    )
    return merge_actual_and_home_geo_df

In [45]:
# Test
merge_actual_and_home_geo_df = merge_actual_and_home_geo(actual_geo_df, home_geo_df, geo_transform_df)
merge_actual_and_home_geo_df.filter(F.col("home_city").isNotNull()).show()

24/09/03 11:57:35 WARN CacheManager: Asked to cache already cached data.


+-------+-----------+-----------+
|user_id|   act_city|  home_city|
+-------+-----------+-----------+
|  40989|  Newcastle|    Bunbury|
|  60039|   Adelaide|   Adelaide|
| 117522|   Maitland|   Maitland|
|  74378|     Sydney|     Sydney|
|  12029|  Newcastle|     Mackay|
|  21174|    Bendigo|    Bendigo|
| 125598|   Adelaide|   Adelaide|
|    624| Cranbourne| Cranbourne|
|  83308|Rockhampton|Rockhampton|
|  21395| Wollongong|  Newcastle|
|  33168|    Geelong|     Mackay|
|   4278|     Mackay|     Mackay|
|  84783|    Geelong| Wollongong|
| 126334|    Bendigo|    Bendigo|
|  46233| Launceston| Launceston|
| 110188|     Mackay|     Mackay|
|  57678|  Newcastle|Rockhampton|
|  52110|  Melbourne|  Melbourne|
|  15380|    Bendigo|    Bendigo|
|  62333|   Canberra|   Canberra|
+-------+-----------+-----------+
only showing top 20 rows



In [52]:
def mart_users_cities(events_path: DataFrame, merge_actual_and_home_geo_df: DataFrame, travel_calc: DataFrame, sql) -> DataFrame:
    times = (
        # загружаем данные из паркет файла, фильтруем собятия типа "message" и выбираем нужные поля
        sql.read.parquet(f'{events_path}')
        .where('event_type = "message"')
        .selectExpr("event.message_from as user_id", "event.datetime", "event.message_id")
        .where("datetime IS NOT NULL")
    )
    # при помощи оконной функции группируем строки по "user_id" и упорядочиваем их по "datetime" в порядке убывания
    window = Window().partitionBy('user_id').orderBy(F.col('datetime').desc())

    times_w = (times
            # добавляем столбец "row_number", который присваивает каждой строки номер в пределах окна
            .withColumn("row_number", F.row_number().over(window))
            # оставляем только те строки с "row_number" равным 1 (последнее событие для каждого пользователя)
            .filter(F.col("row_number")==1)
            # преобразуем поле "datetime" в тип "Timestamp" и отбираем столбцы "user" и "Time"
            .withColumn("TIME", F.col("datetime").cast("Timestamp"))
            .selectExpr("user_id as user", "Time")
    )

    mart_users_cities_df = (
        # объединяем "merge_actual_and_home_geo_df" с "times_w" по "user_id" и "user" соотвественно
        merge_actual_and_home_geo_df
        .join(times_w, merge_actual_and_home_geo_df.user_id == times_w.user, "left").drop("user")
        # добавляем часовой пояс "timezone" при помощи объединения "Australia/" с названием фактического города "act_city"
        .withColumn("timezone",F.concat(F.lit("Australia/"),F.col("act_city")))
        # столбец с текущей меткой времени
        .withColumn("processed_dttm", F.current_timestamp())
        # добавляем столбец "local_time", который вычисляет локальное время. 
	    # Так как функция "from_utc_timestamp" содержит ограниченный список городов, вычисления местного времени будут проводиться не для всех городов.
        .withColumn("local_time", 
                    F.when( F.col("act_city")
                    .isin("Sydney", "Melbourne", "Brisbane", "Perth", "Adelaide", "Canberra", "Hobart", "Darwin"), 
                    F.from_utc_timestamp(
                    F.col("processed_dttm"), 
                    F.concat(F.lit("Australia/"), 
                    F.col("act_city"))))
                    .otherwise(None))
        .join(travel_calc, merge_actual_and_home_geo_df.user_id == travel_calc.user_id, "left")  # добавил соединение "travel_calc"
        .select(merge_actual_and_home_geo_df["user_id"], "act_city", "home_city", "travel_count", "travel_array", "local_time")
        .persist()
        )
    
    return mart_users_cities_df

In [53]:
# Test
mart_users_cities_df = mart_users_cities("/user/master/data/geo/events", merge_actual_and_home_geo_df, travel_calc_df, spark)
mart_users_cities_df.orderBy(F.col("home_city").desc()).show()



+-------+-----------+-----------+------------+--------------------+--------------------+
|user_id|   act_city|  home_city|travel_count|        travel_array|          local_time|
+-------+-----------+-----------+------------+--------------------+--------------------+
|  84783|    Geelong| Wollongong|         321|[Melbourne, Wollo...|                NULL|
|  33581| Wollongong| Wollongong|         273|[Wollongong, Woll...|                NULL|
| 152054|    Bendigo| Wollongong|         339|[Wollongong, Woll...|                NULL|
| 119895|Rockhampton| Wollongong|         432|[Wollongong, Woll...|                NULL|
| 136526| Wollongong| Wollongong|         249|[Adelaide, Wollon...|                NULL|
|   8523| Wollongong| Wollongong|         255|[Newcastle, Wollo...|                NULL|
|  10279| Townsville| Townsville|         806|[Townsville, Town...|                NULL|
| 131760|   Canberra| Townsville|         283|[Townsville, Town...|2024-09-03 22:11:...|
|  62337|  Newcastle|

24/09/03 13:18:05 WARN HeartbeatReceiver: Removing executor 2 with no recent heartbeats: 129430 ms exceeds timeout 120000 ms
24/09/03 13:18:05 ERROR YarnScheduler: Lost executor 2 on rc1a-dataproc-d-5w4oxa2s5b8foehs.mdb.yandexcloud.net: Executor heartbeat timed out after 129430 ms
24/09/03 13:18:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_28_341 !
24/09/03 13:18:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_126_169 !
24/09/03 13:18:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_92_100 !
24/09/03 13:18:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_147_36 !
24/09/03 13:18:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_76_32 !
24/09/03 13:18:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_28_566 !
24/09/03 13:18:05 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_28_1158 !
24/09/03 13:18:05 WARN BlockManagerMasterEndpoint: N

In [None]:
def main() -> None:
    events_path = sys.argv[1]
    geo_path = sys.argv[2]
    output_path = sys.argv[3]
    #events_path = "/user/master/data/geo/events/"
    #geo_path = "/user/denis19/data/geo/cities/actual/geo.csv"
    #output_path = "/user/denis19/analytics/showcase_by_users"

    conf = (SparkConf()
        .setAppName("showcase_recommendations_to_friends")
        .set("spark.executor.memory", "4g")
        .set("spark.driver.memory", "4g"))
    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)

    geo_transform_df = geo_transform(geo_path, sql)
    events_transform_df = events_transform(events_path, sql)
    events_with_geo_df = events_with_geo(events_transform_df, geo_transform_df)
    actual_geo_df = actual_geo(events_with_geo_df)
    travel_geo_df = travel_geo(events_with_geo_df)
    home_geo_df = home_geo(travel_geo_df)
    merge_actual_and_home_geo_df = merge_actual_and_home_geo(actual_geo_df, home_geo_df, geo_transform_df)
    mart_users_cities_df = mart_users_cities(events_path, merge_actual_and_home_geo_df, sql)
    write = mart_users_cities_df.write.mode("overwrite").parquet(f'{output_path}')

    return write

In [None]:
if __name__ == "__main__":
        main()