In [3]:
import numpy as np
import pandas as pd
import plotly.express as px
from datetime import datetime
import matplotlib.pyplot as plt

from pyspark import SparkConf
from pyspark.sql import SparkSession # Spark SQL
from pyspark.sql.functions import date_format, to_date
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, TimestampType

import pyspark.sql.functions as F
from pyspark.sql.functions import udf

# Spark runs in local mode; the bracketed number in the master URL is the
# number of worker threads.
master = "local[10]"
app_name = "Parallel Join"

spark_conf = (
    SparkConf()
    .setMaster(master)
    .setAppName(app_name)
    # .set("spark.executor.memory", "6g")
)
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

# Never truncate long string cells when pandas frames are displayed.
pd.options.display.max_colwidth = None

In [6]:
# --- Hamburg tweets: tab-separated file with a header row ---
file = "../datasets/hamburg/omm_export_tweets_01-06-2022_v2.csv"
data = spark.read.options(delimiter="\t", header=True).csv(file)

# --- CARDS predictions: pipe-separated, no header, explicit schema ---
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id_cards", StringType()),
        ("cards_pred", StringType()),
        ("cards_pred_score", FloatType()),
    )
])

file = "../datasets/predictions"
predictions = spark.read.options(delimiter="|", header=False).csv(file, schema=schema)

# --- Waterloo+CARDS predictions: same layout, different column names ---
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id_waterloo_cards", StringType()),
        ("waterloo_cards_pred", StringType()),
        ("waterloo_cards_score", FloatType()),
    )
])

file = "../datasets/predictions_waterloo_cards"
new_predictions = spark.read.options(delimiter="|", header=False).csv(file, schema=schema)

# Keep only tweets that have a row in BOTH prediction sets (inner joins),
# then drop the prediction-side id columns used as join keys.
data = (
    data
    .join(predictions, on=data.id == predictions.id_cards, how="inner")
    .join(new_predictions, on=data.id == new_predictions.id_waterloo_cards, how="inner")
    .drop("id_cards", "id_waterloo_cards")
)

@udf(returnType=TimestampType())
def generateDate(year, month, day):
    """Combine separate year, month and day strings into one timestamp.

    The three parts are joined with "-" and parsed with the format
    "%Y-%m-%d", so the resulting datetime carries no time-of-day information
    (it is midnight of that day).

    Args:
        year: Year as a string.
        month: Month as a string.
        day: Day of month as a string.

    Returns:
        A ``datetime`` for the given day, or ``None`` when any part is
        missing/empty, is not a string, or the parts do not form a valid date.
    """
    # Null or empty parts cannot form a date; yield a null timestamp.
    if not year or not month or not day:
        return None
    try:
        date = "-".join((year, month, day))
        return datetime.strptime(date, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: a part is not a string; ValueError: not a valid date.
        # Only these are treated as "bad row" -- anything else (including
        # KeyboardInterrupt/SystemExit) is allowed to propagate.
        return None

# Make sure the three date parts are strings before they reach the UDF.
data = (
    data
    .withColumn("year", data["year"].cast("string"))
    .withColumn("month", data["month"].cast("string"))
    .withColumn("day", data["day"].cast("string"))
)

# Build the timestamp from the parts, then render it as a formatted string.
# The UDF step is a separate assignment so its column references resolve
# against the frame produced by the casts above.
data = data.withColumn(
    "date", generateDate(data["year"], data["month"], data["day"])
).withColumn(
    "date", date_format("date", "yyyy-MM-dd HH:mm:ss")
)
data.printSchema()

root
 |-- id: string (nullable = true)
 |-- username: string (nullable = true)
 |-- fulltext: string (nullable = true)
 |-- 140_char_text: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- url: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- retweet_count: string (nullable = true)
 |-- favorite_count: string (nullable = true)
 |-- language: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lon: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_location: string (nullable = true)
 |-- user_time_zone: string (nullable = true)
 |-- user_follower_count: string (nullable = true)
 |-- user_favorite_count: string (nullable = true)
 |-- user_tweet_count: string (nullable = true)
 |-- user_description: string (nullable = true)
 |-- media: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: 