# Data Pipeline

# Goal
Create a data pipeline for the Twitter challenge.

# Methodology
Derive additional features from the original data.

## Sections
1. [**Requirements**](#Requirements)
2. [**Functions**](#Functions)
3. [**Inputs**](#Inputs)
4. [**Pipeline**](#Pipeline)
    - [**Indicators**](#Indicators)
    - [**Intention_features**](#Intention_features)
    - [**TopicEncodings**](#TopicEncodings)
    - [**EngagingFollowsEngaged**](#EngagingFollowsEngaged)
    - [**Hashtags**](#Hashtags)
    - [**Domain**](#Domain)
    - [**Language**](#Language)
    - [**Media**](#Media)
    - [**Links**](#Links)
    - [**Tweet_type**](#Tweet_type)
    - [**Timestamp_features**](#Timestamp_features)
    - [**Followers_and_Followings_features**](#Followers_and_Followings_features)
    - [**Quantile_Discretizer**](#Quantile_Discretizer)
    - [**Intentions_join**](#Intentions_join)
5. [**FeatureSelection**](#FeatureSelection)
6. [**Imputation**](#Imputation)
7. [**Validation**](#Validation)
8. [**Saving_df**](#Saving_df)

# Requirements

In [None]:
#installing packages
sc.install_pypi_package("pandas")
sc.install_pypi_package("boto3")

In [2]:
#reconfiguring SparkContext
sc.setCheckpointDir('hdfs:///twitter/checkpoints')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
import time
import os
import boto3
import gc
import sys
import numpy as np
import pandas as pd
import pickle
import pyspark
import subprocess
from pyspark.sql import SparkSession
from pyspark.sql.types import (FloatType, DateType, StructType, StructField, StringType, LongType, 
    IntegerType, ArrayType, BooleanType, DoubleType, DecimalType)
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StandardScaler, QuantileDiscretizer
gc.enable()

spark = SparkSession.builder.config("spark.sql.shuffle.partitions", 1000).appName("twitter").getOrCreate()
print(spark.sparkContext.getConf().get('spark.driver.memory'))
print(spark.sparkContext.getConf().get("spark.sql.shuffle.partitions"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2048M
1000

# Functions

## Preprocessing

In [4]:
def validator(df):
    """Return ``{column_name: null_count}`` for every column of ``df`` with nulls.

    All null counts are computed in a single aggregation pass over ``df``
    instead of launching one ``count()`` job per column.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        DataFrame whose columns are checked.

    Returns
    -------
    dict
        Maps column name to its number of null values. Columns without
        nulls are omitted, so an empty dict means no nulls were found.
    """
    names = [field.name for field in df.schema]
    if not names:
        # Nothing to aggregate on a DataFrame without columns.
        return {}
    # F.when(...) without otherwise() yields NULL for non-matching rows and
    # F.count() skips NULLs, so each expression counts only the null cells.
    null_counters = [
        F.count(F.when(F.col(name).isNull(), 1)).alias(name) for name in names
    ]
    # A global aggregation always produces exactly one row.
    totals = df.agg(*null_counters).first()
    return {name: total for name, total in zip(names, totals) if total > 0}

def hdfs_exists(path):
    """Report whether ``path`` exists according to ``hadoop fs -test -e``.

    Runs the ``hadoop`` command-line tool, prints a one-line status message
    and returns ``True`` when the command exits with status 0, ``False``
    otherwise.
    """
    exit_code = subprocess.call(['hadoop', 'fs', '-test', '-e', path])
    found = exit_code == 0
    print(f"{path} exist" if found else f"{path} does not exist")
    return found
    
def reduce_dec(v):
    """Round every element of ``v`` to 4 decimals and return a plain list.

    Parameters
    ----------
    v : sequence of float or None
        Values to round. ``None`` is what a Python UDF receives for a null
        cell, so it is passed through instead of raising inside the UDF.

    Returns
    -------
    list of float or None
        Rounded values, or ``None`` when ``v`` is ``None``.
    """
    if v is None:
        return None
    return np.around(v, decimals=4).tolist()
# Spark UDF wrapping reduce_dec; its declared return type is array<double>.
reduce_dec_udf = F.udf(reduce_dec,  ArrayType(DoubleType()))

def to_array_(v):
    """Convert a vector to a list of floats rounded to 4 decimals.

    Parameters
    ----------
    v : object exposing ``toArray()`` or None
        Vector to convert. ``None`` is what a Python UDF receives for a null
        cell, so it is passed through instead of failing on ``None.toArray()``.

    Returns
    -------
    list of float or None
        Rounded vector values, or ``None`` when ``v`` is ``None``.
    """
    if v is None:
        return None
    return np.around(v.toArray(), decimals=4).tolist()
# Spark UDF wrapping to_array_; its declared return type is array<double>.
vect2array = F.udf(to_array_,  ArrayType(DoubleType()))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
def build_processed_schema(has_labels=True):
    """Build the Spark schema used to read the processed CSV datasets.

    The labelled and unlabelled schemas share every field; the labelled one
    additionally carries the five ``indicator_*`` columns, placed just before
    the two trailing ``*_bucket`` columns. Both variants are built from one
    field table so they cannot drift apart.

    Parameters
    ----------
    has_labels : bool, default True
        When true, include the ``indicator_*`` columns.

    Returns
    -------
    pyspark.sql.types.StructType
        Schema with fields in the order expected by the processed files.
    """
    feature_fields = [
        ('text_tokens_ors', StringType),
        ('tweet_id_id', StringType),
        ('engaged_with_user_id_id', StringType),
        ('engaged_with_user_is_verified_bool', BooleanType),
        ('engaging_user_id_id', StringType),
        ('engaging_user_is_verified_bool', BooleanType),
        ('engagee_follows_engager_bool', BooleanType),
        ('hashtagEncoded_unors', StringType),
        ('hashtagSumCount_ss_num', DoubleType),
        ('hashtagCount_ss_num', DoubleType),
        ('domainEncoded_unors', StringType),
        ('domainCount_ss_num', DoubleType),
        ('tweetEncoded_cat', IntegerType),
        ('languageEncoded_cat', StringType),
        ('tweet_timestamp_day_of_week_cat', StringType),
        ('tweet_timestamp_week_of_month_cat', StringType),
        ('tweet_timestamp_hour_cat', StringType),
        ('tweet_timestamp_to_engagee_account_creation_ss_num', DoubleType),
        ('tweet_timestamp_to_engaging_account_creation_ss_num', DoubleType),
        ('engaged_with_vs_engaging_follower_diff_log_ss_num', DoubleType),
        ('engaged_with_vs_engaging_following_diff_log_ss_num', DoubleType),
        ('engaged_follow_diff_log_ss_num', DoubleType),
        ('engaging_follow_diff_log_ss_num', DoubleType),
        ('engaged_follower_diff_engaging_following_log_ss_num', DoubleType),
        ('engaged_following_diff_engaging_follower_log_ss_num', DoubleType),
        ('engaged_with_user_follower_count_log_ss_num', DoubleType),
        ('engaging_user_follower_count_log_ss_num', DoubleType),
        ('engaged_with_user_following_count_log_ss_num', DoubleType),
        ('engaging_user_following_count_log_ss_num', DoubleType),
        ('PhotoCount_ss_num', DoubleType),
        ('VideoCount_ss_num', DoubleType),
        ('GIFCount_ss_num', DoubleType),
        ('linkCount_ss_num', DoubleType),
        ('engaged_with_user_follower_count_q_cat', DoubleType),
        ('engaged_with_user_following_count_q_cat', DoubleType),
        ('engaged_with_user_account_creation_q_cat', DoubleType),
        ('engaging_user_follower_count_q_cat', DoubleType),
        ('engaging_user_following_count_q_cat', DoubleType),
        ('engaging_user_account_creation_q_cat', DoubleType),
        ('total_appearance_ss_num', DoubleType),
        ('perc_n_interactions_ss_num', DoubleType),
        ('perc_n_commented_ss_num', DoubleType),
        ('perc_n_liked_ss_num', DoubleType),
        ('perc_n_replied_ss_num', DoubleType),
        ('perc_n_retweeted_ss_num', DoubleType),
    ]
    # Present only in labelled (train / validation) data.
    label_fields = [
        ('indicator_reply', IntegerType),
        ('indicator_retweet', IntegerType),
        ('indicator_retweet_with_comment', IntegerType),
        ('indicator_like', IntegerType),
        ('indicator_interaction', IntegerType),
    ]
    # Always the last two columns, after the labels when those are present.
    bucket_fields = [
        ('engaged_with_user_id_bucket', IntegerType),
        ('engaging_user_id_bucket', IntegerType),
    ]

    ordered_fields = feature_fields + (label_fields if has_labels else []) + bucket_fields
    return StructType([StructField(name, dtype()) for name, dtype in ordered_fields])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Inputs

In [6]:
# Validation / training sizes keyed by sample suffix (see suffix_sample below).
dictionary_size={"final-complete": {"val_size": 500000, 
                                    "train_size": "all"}}

# Dataset selector used by the later cells: `submission` is checked first,
# then `test`; otherwise `training` picks train (True) or validation (False).
training = False
submission = False
test = True

# S3 bucket name and boto3 resource handle.
bucket='bucket-name'
s3_resource = boto3.resource('s3')
# NOTE(review): the top_k_* values are not used in the visible cells;
# presumably vocabulary cut-offs from an earlier pipeline stage - confirm.
top_k_languages = 30
top_k_domains = 3000
top_k_hashtags = 13000

# Embeddings
# Modulus applied to the tweet-id hash when building the hash partition columns.
num_partitions=1000

# Buckets
partition_per_cluster = 100

# Sample name; it is the key into dictionary_size and a suffix in path names.
suffix_sample = "final-complete" #"full", "small", "medium", "sub_medium"
data_path = "final-data"
object_paths = "final-artifacts"

val_size = dictionary_size[suffix_sample]["val_size"]
train_size = dictionary_size[suffix_sample]["train_size"]

bucket_s3 = s3_resource.Bucket(bucket)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

**Paths**

In [7]:
#S3
# NOTE(review): every path below is built with os.path.join, which uses the
# host OS separator; this yields valid s3a:// URIs only where that is "/".
twitter_bucket_s3 = "s3a://bucket-name"
# NOTE(review): `trainining_path` is misspelled; the name is left unchanged
# here because other cells may reference it.
trainining_path = os.path.join(twitter_bucket_s3, "data", "raw", "final", "training.tsv")
submission_path = os.path.join(twitter_bucket_s3, "data", "raw", "final", "submission.tsv")
test_path = os.path.join(twitter_bucket_s3, "data", "raw", "final", "test.tsv")

# Train/validation split paths
train_path = os.path.join(twitter_bucket_s3, data_path, "train-"+suffix_sample)
val_path = os.path.join(twitter_bucket_s3, data_path, "val-"+suffix_sample)

# Processed datasets (plain, with embeddings, with topics), one per split
processed_train_path = os.path.join(twitter_bucket_s3, data_path, "processed", "train-"+suffix_sample)
processed_val_path = os.path.join(twitter_bucket_s3, data_path, "processed", "val-"+suffix_sample)
processed_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed", "submission-"+suffix_sample)
processed_test_path = os.path.join(twitter_bucket_s3, data_path, "processed", "test-"+suffix_sample)
processed_emb_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                        "train-"+suffix_sample)
processed_emb_val_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                      "val-"+suffix_sample)
processed_emb_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                         "submission-"+suffix_sample)
processed_emb_test_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                         "test-"+suffix_sample)
processed_top_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                        "train-"+suffix_sample)
processed_top_val_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                      "val-"+suffix_sample)
processed_top_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                             "submission-"+suffix_sample)
processed_top_test_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                             "test-"+suffix_sample)
# Resources
engaging_users_training_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-training")
engaging_users_submission_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-submission")
engaging_users_test_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-test")
intentions_path = os.path.join(twitter_bucket_s3, data_path, "intentions-"+suffix_sample)
map_user_bucket_path = os.path.join(twitter_bucket_s3, data_path, "map_user_bucket")

topic_encodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "user_topics")
users_intime_path = os.path.join(twitter_bucket_s3, data_path, "users_intime-"+suffix_sample)

# Object keys of pickled artifacts (relative keys, no s3a:// prefix)
key_hashtag_mapping = os.path.join(object_paths, f'hashtag_mapping_{suffix_sample}.pkl')
key_domain_mapping = os.path.join(object_paths, f'domain_mapping_{suffix_sample}.pkl')
key_language_mapping = os.path.join(object_paths, f'language_mapping_{suffix_sample}.pkl')
key_hashtag_count = os.path.join(object_paths, f'hashtag_count_{suffix_sample}.pkl')
key_domain_count = os.path.join(object_paths, f'domain_count_{suffix_sample}.pkl')
key_scaling_features = os.path.join(object_paths, f'scaling_dictionary_{suffix_sample}.pkl')
key_diff_min = os.path.join(object_paths, f'diff_min_{suffix_sample}.pkl')
key_impute_perc = os.path.join(object_paths, f'dict_mean_perc_{suffix_sample}.pkl')

# s3+keys
# One full s3a:// path per listed column, stored in qds_paths.
# NOTE(review): presumably these hold fitted QuantileDiscretizer models - confirm.
# `columns` is reassigned by a later cell, and `col` stays bound after the loop.
columns = ["engaged_with_user_follower_count", "engaged_with_user_following_count",
           "engaged_with_user_account_creation", "engaging_user_follower_count",
           "engaging_user_following_count", "engaging_user_account_creation"]
qds_paths = {}
for col in columns:
    qds_paths[col] = os.path.join(twitter_bucket_s3, object_paths, f"qs_{suffix_sample}_" + col)
    
# Bucket pipeline
users_buckets = os.path.join(twitter_bucket_s3, data_path, "users_buckets") #
users_buckets_part_2 = os.path.join(twitter_bucket_s3, data_path, "users_buckets_part_2") #

pipeline_kmeans_path = os.path.join(twitter_bucket_s3, object_paths, "pipeline_id_encoding")
cluster_map_path = os.path.join(twitter_bucket_s3, data_path, "cluster_map")

# Embeddings (raw tweet encodings read as parquet by the staging cell)
bert_embeddings_train = os.path.join(twitter_bucket_s3, "data", "textEncodings", "tweets_extended")
submission_rawTweetEncodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "submission-tweets-extended")
test_rawTweetEncodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "test-tweets-extended")

# Topics pipeline
reduced_topics_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "reducedTopics")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Load_data

hadoop fs -rm -r filename

In [8]:
current_path = "hdfs:///current-df"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Choose the processed dataset to load. `submission` takes precedence over
# `test`; otherwise `training` picks train or validation. Only the labelled
# datasets (train / validation) carry the indicator_* columns.
if submission:
    dataset_label, source_path, source_has_labels = "Submission", processed_submission_path, False
elif test:
    dataset_label, source_path, source_has_labels = "Test", processed_test_path, False
elif training:
    dataset_label, source_path, source_has_labels = "Train", processed_train_path, True
else:
    dataset_label, source_path, source_has_labels = "Valid", processed_val_path, True

print(dataset_label)
df = spark.read.option("header", "true").csv(
    source_path, schema=build_processed_schema(has_labels=source_has_labels))

# Bucket rows by a hash of the tweet id; F.abs folds negative remainders so
# the key always lies in [0, num_partitions).
df = df.withColumn("hash_tweet_id", F.abs(F.hash("tweet_id_id") % num_partitions))

# Stage the data on HDFS, one directory per hash bucket. A single repartition
# on the bucket key is enough before the partitioned write.
df.repartition("hash_tweet_id").write.option("header", "true").partitionBy("hash_tweet_id")\
            .mode("overwrite").csv(current_path)
del df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Test

In [10]:
# Re-read the staged copy from HDFS. No schema is passed here, so the data
# columns are loaded as strings (see the DataFrame repr printed further down).
df = spark.read.option("header","true").csv(current_path).repartition("hash_tweet_id")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
df.rdd.getNumPartitions()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1000

In [12]:
df.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

12434838

# Loading BERT Embeddings

In [13]:
test_path_hdfs = "hdfs:///test-embeddings"
submission_path_hdfs = "hdfs:///submission-embeddings"
tweet_embeddings_path_hdfs = "hdfs:///tweet-embeddings"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Pick the raw embeddings source and its HDFS staging location for the active
# dataset. `embeddings_label` is the name used in the log messages; the test
# branch previously logged "Creating submission_path_hdfs" by mistake.
if submission:
    raw_embeddings_path = submission_rawTweetEncodings_path
    embeddings_hdfs_path = submission_path_hdfs
    embeddings_label = "submission_path_hdfs"
elif test:
    raw_embeddings_path = test_rawTweetEncodings_path
    embeddings_hdfs_path = test_path_hdfs
    embeddings_label = "test_path_hdfs"
else:
    raw_embeddings_path = bert_embeddings_train  # 60899572
    embeddings_hdfs_path = tweet_embeddings_path_hdfs
    embeddings_label = "tweets_embeddings"

if hdfs_exists(embeddings_hdfs_path):
    print(f"Already exists {embeddings_label}")
else:
    print(f"Creating {embeddings_label}")
    staged_embeddings = spark.read.parquet(raw_embeddings_path)
    staged_embeddings = staged_embeddings.drop("tweet_hash", "pcs")
    # Vector -> rounded list of doubles -> string, so the column can be
    # written as CSV.
    staged_embeddings = staged_embeddings.withColumn("embedding", vect2array(F.col("embedding")))
    staged_embeddings = staged_embeddings.withColumn("embedding", F.col("embedding").cast(StringType()))
    # Same hash/modulus as the main dataframe's hash_tweet_id column.
    staged_embeddings = staged_embeddings.withColumn("hash_tweet_id_1",
                                                     F.abs(F.hash("tweet_id") % num_partitions))
    staged_embeddings.repartition(F.col("hash_tweet_id_1")).write.partitionBy("hash_tweet_id_1")\
                .mode("overwrite").csv(embeddings_hdfs_path)
    print(staged_embeddings)
    del staged_embeddings

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

hdfs:///test-embeddings does not exist
Creating submission_path_hdfs
DataFrame[tweet_id: string, embedding: string, cluster: int, hash_tweet_id_1: int]

In [15]:
# The staged embeddings have the same layout for every dataset; only the
# HDFS location (and the printed label) differs.
if submission:
    print("Submission")
    staged_embeddings_path = submission_path_hdfs
elif test:
    print("Test")
    staged_embeddings_path = test_path_hdfs
else:
    print("Training")
    staged_embeddings_path = tweet_embeddings_path_hdfs

# The staged CSV files are written without a header, so the column names and
# types are supplied here; hash_tweet_id_1 is the partition column.
schema = StructType([StructField('tweet_id', StringType()),
                     StructField('embedding', StringType()),
                     StructField("cluster", IntegerType()),
                     StructField('hash_tweet_id_1', IntegerType())])
tweets_embeddings = spark.read.csv(staged_embeddings_path,
                                   schema=schema).repartition("hash_tweet_id_1")
print(tweets_embeddings.rdd.getNumPartitions())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Test
1000

In [16]:
tweets_embeddings.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

8671930

## Join

In [17]:
# Attach the embedding columns to each row by tweet id. This is a left join:
# every row of df is kept, and rows without a matching embedding get nulls
# in the columns coming from tweets_embeddings.
df = df.join(tweets_embeddings, 
             (df.tweet_id_id==tweets_embeddings.tweet_id), 
             how="left")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
validator(df.select("cluster"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{}

In [19]:
df = df.drop("hash_tweet_id_1", "tweet_id")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
# Output column order. Labelled and unlabelled outputs share every column;
# the labelled one adds the indicator_* columns just before the trailing
# bucket / cluster columns. Built from shared lists so both stay in sync.
feature_columns = [
    'embedding', 'tweet_id_id', 'engaged_with_user_id_id', 'engaged_with_user_is_verified_bool',
    'engaging_user_id_id', 'engaging_user_is_verified_bool', 'engagee_follows_engager_bool',
    'hashtagEncoded_unors', 'hashtagSumCount_ss_num', 'hashtagCount_ss_num', 'domainEncoded_unors',
    'domainCount_ss_num', 'tweetEncoded_cat', 'languageEncoded_cat', 'tweet_timestamp_day_of_week_cat',
    'tweet_timestamp_hour_cat',
    'tweet_timestamp_to_engagee_account_creation_ss_num',
    'tweet_timestamp_to_engaging_account_creation_ss_num',
    'engaged_with_vs_engaging_follower_diff_log_ss_num',
    'engaged_with_vs_engaging_following_diff_log_ss_num', 'engaged_follow_diff_log_ss_num',
    'engaging_follow_diff_log_ss_num', 'engaged_follower_diff_engaging_following_log_ss_num',
    'engaged_following_diff_engaging_follower_log_ss_num', 'engaged_with_user_follower_count_log_ss_num',
    'engaging_user_follower_count_log_ss_num', 'engaged_with_user_following_count_log_ss_num',
    'engaging_user_following_count_log_ss_num', 'PhotoCount_ss_num', 'VideoCount_ss_num',
    'GIFCount_ss_num', 'linkCount_ss_num', 'engaged_with_user_follower_count_q_cat',
    'engaged_with_user_following_count_q_cat', 'engaged_with_user_account_creation_q_cat',
    'engaging_user_follower_count_q_cat', 'engaging_user_following_count_q_cat',
    'engaging_user_account_creation_q_cat', 'total_appearance_ss_num', 'perc_n_interactions_ss_num',
    'perc_n_commented_ss_num', 'perc_n_liked_ss_num', 'perc_n_replied_ss_num', 'perc_n_retweeted_ss_num',
]
# Present only in labelled (train / validation) data.
label_columns = [
    'indicator_reply', 'indicator_retweet', 'indicator_retweet_with_comment', 'indicator_like',
    'indicator_interaction',
]
trailing_columns = ['engaged_with_user_id_bucket', 'engaging_user_id_bucket', 'cluster']

if submission or test:
    print("Submission or Test")
    columns = feature_columns + trailing_columns
else:
    print("Training")
    columns = feature_columns + label_columns + trailing_columns

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Submission or Test

In [21]:
# Keep only the output columns, then rename the two joined columns so they
# carry the same kind of suffix as the rest of the features.
new_df = (
    df.select(columns)
    .withColumnRenamed("embedding", "embedding_ors")
    .withColumnRenamed("cluster", "cluster_cat")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
new_df.rdd.getNumPartitions()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1000

In [23]:
new_df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[embedding_ors: string, tweet_id_id: string, engaged_with_user_id_id: string, engaged_with_user_is_verified_bool: string, engaging_user_id_id: string, engaging_user_is_verified_bool: string, engagee_follows_engager_bool: string, hashtagEncoded_unors: string, hashtagSumCount_ss_num: string, hashtagCount_ss_num: string, domainEncoded_unors: string, domainCount_ss_num: string, tweetEncoded_cat: string, languageEncoded_cat: string, tweet_timestamp_day_of_week_cat: string, tweet_timestamp_hour_cat: string, tweet_timestamp_to_engagee_account_creation_ss_num: string, tweet_timestamp_to_engaging_account_creation_ss_num: string, engaged_with_vs_engaging_follower_diff_log_ss_num: string, engaged_with_vs_engaging_following_diff_log_ss_num: string, engaged_follow_diff_log_ss_num: string, engaging_follow_diff_log_ss_num: string, engaged_follower_diff_engaging_following_log_ss_num: string, engaged_following_diff_engaging_follower_log_ss_num: string, engaged_with_user_follower_count_log_ss_n

In [24]:
# Choose the HDFS destination for the active dataset; the write itself is
# the same for all four cases.
if submission:
    current_new_df_path, saved_label = "hdfs:///submission-embeddings-final", "Submission"
elif test:
    current_new_df_path, saved_label = "hdfs:///test-embeddings-final", "Test"
elif training:
    current_new_df_path, saved_label = "hdfs:///train-embeddings", "Train"
else:
    current_new_df_path, saved_label = "hdfs:///val-embeddings", "Valid"

new_df.write.option("header", "true").mode("overwrite").csv(current_new_df_path)
print(f"{saved_label} saved")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Test saved

### Copy to s3

------

In [4]:
train_df = spark.read.option("header","true").csv("hdfs:///train-embeddings").coalesce(1000)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
train_df.rdd.getNumPartitions()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1000

In [6]:
smaller_train = train_df.sample(withReplacement=False,
                                fraction=0.45,
                                seed=42)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
smaller_train.rdd.getNumPartitions()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1000

In [8]:
smaller_new_df_path = "hdfs:///smaller-train-embeddings"
smaller_train.write.option("header","true").mode("overwrite").csv(smaller_new_df_path)
print("Train small saved")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Train small saved