# Data Pipeline

# Goal
Create a data pipeline for the Twitter RecSys Challenge.

# Methodology
Build a set of derived features from the original data.

## Sections
1. [**Requirements**](#Requirements)
2. [**Functions**](#Functions)
3. [**Inputs**](#Inputs)
4. [**Pipeline**](#Pipeline)
    - [**Indicators**](#Indicators)
    - [**Intention_features**](#Intention_features)
    - [**TopicEncodings**](#TopicEncodings)
    - [**EngagingFollowsEngaged**](#EngagingFollowsEngaged)
    - [**Hashtags**](#Hashtags)
    - [**Domain**](#Domain)
    - [**Language**](#Language)
    - [**Media**](#Media)
    - [**Links**](#Links)
    - [**Tweet_type**](#Tweet_type)
    - [**Timestamp_features**](#Timestamp_features)
    - [**Followers_and_Followings_features**](#Followers_and_Followings_features)
    - [**Quantile_Discretizer**](#Quantile_Discretizer)
    - [**Intentions_join**](#Intentions_join)
5. [**FeatureSelection**](#FeatureSelection)
6. [**Imputation**](#Imputation)
7. [**Validation**](#Validation)
8. [**Saving_df**](#Saving_df)

# Requirements

In [None]:
# Install the extra PyPI packages used below through the SparkContext.
for package_name in ("pandas", "boto3"):
    sc.install_pypi_package(package_name)

In [2]:
# Point the SparkContext's checkpoint directory at a path on HDFS.
sc.setCheckpointDir('hdfs:///twitter/checkpoints')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
import time
import os
import boto3
import gc
import sys
import numpy as np
import pandas as pd
import pickle
import pyspark
import subprocess
from pyspark.sql import SparkSession
from pyspark.sql.types import (FloatType, DateType, StructType, StructField, StringType, LongType, 
    IntegerType, ArrayType, BooleanType, DoubleType)
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StandardScaler, QuantileDiscretizer
# Make sure automatic garbage collection is switched on.
gc.enable()

# Get (or create) the "twitter" session, requesting 1000 shuffle partitions.
session_builder = SparkSession.builder.config("spark.sql.shuffle.partitions", 1000)
spark = session_builder.appName("twitter").getOrCreate()

# Echo the driver memory and the shuffle-partition setting actually in effect.
for conf_key in ('spark.driver.memory', "spark.sql.shuffle.partitions"):
    print(spark.sparkContext.getConf().get(conf_key))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2048M
1000

# Functions

## Loading data

In [4]:
def parse_data(path='training.tsv', has_labels=True, schema='auto'):
    """
    Read a Twitter RecSys Challenge file into a Spark DataFrame.

    The file is read as '\\x01'-separated UTF-8 text with leading and
    trailing whitespace ignored. ``schema`` is passed to the reader as
    given, except for the default ``'auto'``, which is replaced by
    ``build_schema(has_labels)``. After loading, the tab-separated columns
    (tokens, hashtags, media, links, domains) are split into arrays.
    """
    session = SparkSession.builder.appName("twitter").getOrCreate()
    resolved_schema = build_schema(has_labels) if schema == 'auto' else schema
    frame = session.read.csv(path, schema=resolved_schema, sep='\x01', encoding='utf-8',
                             ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True)
    tab_separated = ('text_tokens', 'hashtags', 'present_media',
                     'present_links', 'present_domains')
    for column_name in tab_separated:
        frame = frame.withColumn(column_name, F.split(column_name, '\t'))
    return frame

def build_schema(has_labels=True):
    """
    Build the StructType describing one row of the challenge files.

    The 20 feature columns are always present, in file order. When
    ``has_labels`` is true the four engagement timestamp columns are
    appended after them.
    """
    feature_fields = [
        ('text_tokens', StringType),
        ('hashtags', StringType),
        ('tweet_id', StringType),
        ('present_media', StringType),
        ('present_links', StringType),
        ('present_domains', StringType),
        ('tweet_type', StringType),
        ('language', StringType),
        ('tweet_timestamp', LongType),
        ('engaged_with_user_id', StringType),
        ('engaged_with_user_follower_count', IntegerType),
        ('engaged_with_user_following_count', IntegerType),
        ('engaged_with_user_is_verified', BooleanType),
        ('engaged_with_user_account_creation', LongType),
        ('engaging_user_id', StringType),
        ('engaging_user_follower_count', IntegerType),
        ('engaging_user_following_count', IntegerType),
        ('engaging_user_is_verified', BooleanType),
        ('engaging_user_account_creation', LongType),
        ('engagee_follows_engager', BooleanType),
    ]
    label_fields = [
        ('reply_timestamp', LongType),
        ('retweet_timestamp', LongType),
        ('retweet_with_comment_timestamp', LongType),
        ('like_timestamp', LongType),
    ]
    selected = (feature_fields + label_fields) if has_labels else feature_fields
    return StructType([StructField(field_name, field_type())
                       for field_name, field_type in selected])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Preprocessing

In [5]:
def save_pkl_to_s3(obj, key_filename, bucket_name):
    """Pickle ``obj`` and upload the bytes to ``bucket_name`` under ``key_filename``."""
    payload = pickle.dumps(obj)
    boto3.client('s3').put_object(Bucket=bucket_name, Key=key_filename, Body=payload)
    
def hdfs_exists(path):
    """
    Report whether ``path`` exists according to ``hadoop fs -test -e``.

    Prints a one-line message with the outcome and returns True when the
    command exits with status 0, False otherwise.
    """
    completed = subprocess.run(['hadoop', 'fs', '-test', '-e', path])
    found = completed.returncode == 0
    print(f"{path} exist" if found else f"{path} does not exist")
    return found
    
def validator(df):
    """
    Return ``{column_name: null_count}`` for every column of ``df`` that
    contains at least one null; columns without nulls are omitted.

    All columns are counted in one aggregation, so the data is scanned a
    single time instead of once per column.
    """
    column_names = df.columns
    if not column_names:
        return {}
    # count() ignores nulls, and when() without otherwise() yields null for
    # non-matching rows, so each aggregate is exactly the number of nulls.
    null_counts = df.select([
        F.count(F.when(F.col(name).isNull(), 1)).alias(name)
        for name in column_names
    ]).first()
    return {name: total for name, total in zip(column_names, null_counts) if total > 0}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Inputs

In [168]:
# Per-sample sizes, keyed by the sample suffix chosen below.
dictionary_size={"final-complete": {"val_size": 500000, 
                                    "train_size": "all"}}

# Split selector flags. Later cells test them in the order
# training -> test -> submission and fall back to the validation split
# when all three are False.
training = False
smaller_train = False  # not read by any cell in this part of the notebook
submission = False
test = True

# S3 bucket name and the boto3 resource used to read/write pickled artifacts.
bucket='bucket-name'
s3_resource = boto3.resource('s3')
top_k_languages = 30
top_k_domains = 3000
top_k_hashtags = 13000

# Embeddings
num_partitions=1000

# Buckets
partition_per_cluster = 100

# suffix_sample must be a key of dictionary_size: the sizes are looked up with it below.
suffix_sample = "final-complete" #"full", "small", "medium", "sub_medium"
data_path = "final-data"
object_paths = "final-artifacts"

val_size = dictionary_size[suffix_sample]["val_size"]
train_size = dictionary_size[suffix_sample]["train_size"]

bucket_s3 = s3_resource.Bucket(bucket)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

**Paths**

In [169]:
# S3 root and the raw challenge files.
twitter_bucket_s3 = "s3a://bucket-name"
# NOTE(review): "trainining_path" is misspelled; kept as-is because other code may use the name.
trainining_path = os.path.join(twitter_bucket_s3, "data", "raw", "final", "training.tsv")
submission_path = os.path.join(twitter_bucket_s3, "data", "raw", "final", "submission.tsv")
test_path = os.path.join(twitter_bucket_s3, "data", "raw", "final", "test.tsv")

# Split paths
train_path = os.path.join(twitter_bucket_s3, data_path, "train-"+suffix_sample)
val_path = os.path.join(twitter_bucket_s3, data_path, "val-"+suffix_sample)

# Processed
# NOTE(review): processed_train_path points at the "smaller-train-" prefix — confirm this is intended.
processed_train_path = os.path.join(twitter_bucket_s3, data_path, "processed", "smaller-train-"+suffix_sample)
processed_val_path = os.path.join(twitter_bucket_s3, data_path, "processed", "val-"+suffix_sample)
processed_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed", "submission-"+suffix_sample)
processed_test_path = os.path.join(twitter_bucket_s3, data_path, "processed", "test-"+suffix_sample)
# Outputs under the "processed-embeddings-final" prefix, one per split.
processed_emb_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                        "train-"+suffix_sample)
processed_emb_smaller_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                        "smaller-train-"+suffix_sample)
processed_emb_val_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                      "val-"+suffix_sample)
processed_emb_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                         "submission-"+suffix_sample)
processed_emb_test_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                         "test-"+suffix_sample)
# Outputs under the "processed-topics-final" prefix; these are the inputs this notebook loads.
processed_top_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics-final", 
                                        "train-"+suffix_sample)
processed_top_val_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics-final", 
                                      "val-"+suffix_sample)
processed_top_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics-final", 
                                             "submission-"+suffix_sample)
processed_top_test_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics-final", 
                                             "test-"+suffix_sample)
# Resources
engaging_users_training_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-training")
engaging_users_submission_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-submission")
engaging_users_test_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-test")
intentions_path = os.path.join(twitter_bucket_s3, data_path, "intentions-"+suffix_sample)
map_user_bucket_path = os.path.join(twitter_bucket_s3, data_path, "map_user_bucket_new")

topic_encodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "user_topics")
users_intime_path = os.path.join(twitter_bucket_s3, data_path, "users_intime-"+suffix_sample)

# Object keys (relative to the bucket, no scheme) of the pickled artifacts.
key_hashtag_mapping = os.path.join(object_paths, f'hashtag_mapping_{suffix_sample}.pkl')
key_domain_mapping = os.path.join(object_paths, f'domain_mapping_{suffix_sample}.pkl')
key_language_mapping = os.path.join(object_paths, f'language_mapping_{suffix_sample}.pkl')
key_hashtag_count = os.path.join(object_paths, f'hashtag_count_{suffix_sample}.pkl')
key_domain_count = os.path.join(object_paths, f'domain_count_{suffix_sample}.pkl')
key_scaling_features = os.path.join(object_paths, f'scaling_dictionary_{suffix_sample}.pkl')
key_diff_min = os.path.join(object_paths, f'diff_min_{suffix_sample}.pkl')
key_impute_perc = os.path.join(object_paths, f'dict_mean_perc_{suffix_sample}.pkl')
key_topiccount = os.path.join(object_paths, f'topiccount_{suffix_sample}.pkl')
# Full S3 locations (bucket + artifact prefix), one per numeric column,
# named "qs_<suffix_sample>_<column>".
columns = [
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_account_creation",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "engaging_user_account_creation",
]
qds_prefix = f"qs_{suffix_sample}_"
qds_paths = {}
for col in columns:
    qds_paths[col] = os.path.join(twitter_bucket_s3, object_paths, qds_prefix + col)
    
# Bucket pipeline: user-bucket data plus the saved pipeline and cluster map locations.
users_buckets = os.path.join(twitter_bucket_s3, data_path, "users_buckets") #
users_buckets_part_2 = os.path.join(twitter_bucket_s3, data_path, "users_buckets_part_2") #

pipeline_kmeans_path = os.path.join(twitter_bucket_s3, object_paths, "pipeline_id_encoding")
cluster_map_path = os.path.join(twitter_bucket_s3, data_path, "cluster_map")

# Embeddings: locations under data/textEncodings for the train, submission and test tweets.
bert_embeddings_train = os.path.join(twitter_bucket_s3, "data", "textEncodings", "tweets_extended")
submission_rawTweetEncodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", 
                                                 "submission-tweets-extended")
test_rawTweetEncodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "test-tweets-extended")

# Topics pipeline
reduced_topics_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "reducedTopics")

# Graph Features: directory holding the per-split graph CSV files read later in this notebook.
graph_features_path = os.path.join(twitter_bucket_s3, data_path, "processed-graph")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Load final topics data

In [170]:
# Pick the split from the flags (training, then test, then submission,
# otherwise validation): label to print, HDFS output path, input path.
if training:
    split_choice = ("Train", "hdfs:///train-complete-cleaned", processed_top_train_path)
elif test:
    split_choice = ("Test", "hdfs:///test-complete-cleaned", processed_top_test_path)
elif submission:
    split_choice = ("Submission", "hdfs:///submission-complete-cleaned", processed_top_submission_path)
else:
    split_choice = ("Valid", "hdfs:///val-complete-cleaned", processed_top_val_path)

split_label, hdfs_original_path, split_source_path = split_choice
print(split_label)
# No schema is given, so every column is loaded as a string.
df = spark.read.option("header", "true").csv(split_source_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Test

## Cleaning data

In [171]:
# Keep every column whose name contains neither "topicprop" nor "perc_".
excluded_markers = ("topicprop", "perc_")
keep_columns = [
    name for name in df.columns
    if not any(marker in name for marker in excluded_markers)
]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [172]:
# Restrict the DataFrame to the columns collected in keep_columns.
df = df.select(keep_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## New features

In [173]:
# Object key of the pickled {feature: {'mean': ..., 'std': ...}} dictionary
# for the features created in this notebook.
key_scaling_new_features = os.path.join(object_paths, f'scaling_new_features_{suffix_sample}.pkl')
if training:
    # Training starts empty and fills the dictionary as each feature is scaled.
    scaling_dict = dict()
else:
    # Other splits reuse the statistics saved by the training run.
    # NOTE: pickle.loads can execute arbitrary code; this is acceptable only
    # because the object is written by this pipeline's own training run.
    scaling_dict = pickle.loads(s3_resource.Bucket(bucket).Object(key_scaling_new_features).get()['Body'].read())
    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if not isinstance(scaling_dict, dict):
        raise TypeError(
            f"Expected a dict in {key_scaling_new_features}, got {type(scaling_dict).__name__}")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

**author aggregates**

In [174]:
# Per-author aggregates, computed only in training and only when the HDFS
# output is not already there.
author_agg_path = "hdfs:///author_agg"
if training and hdfs_exists(author_agg_path):
    print("Already exists")
elif training:
    # Number of rows per author and number of distinct tweets per author.
    appearances = F.count("engaged_with_user_id_id").alias("n_appearances_author")
    distinct_tweets = F.countDistinct(F.col("tweet_id_id")).alias("n_tweets")
    author_agg = df.groupby("engaged_with_user_id_id").agg(appearances, distinct_tweets)
    author_agg.write.option("header", "true").mode("overwrite").csv(author_agg_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [175]:
# Re-point author_agg_path at an S3 location and load the aggregates with an explicit schema.
# NOTE(review): the training cell writes its output with a header row, while this
# read does not set the "header" option. If these S3 files are a copy of that
# output, each header line is parsed as a data row — confirm.
author_agg_path = "s3a://bucket-name/final-data/author-agg"
schema = StructType([StructField('engaged_with_user_id_id', StringType()),
                     StructField('n_appearances_author', IntegerType()),
                     StructField('n_tweets', IntegerType())])
author_agg = spark.read.csv(author_agg_path, schema=schema)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [176]:
# Attach the author aggregates; authors without a match get 1 for both counts.
df = df.join(author_agg, on="engaged_with_user_id_id", how="left")
for count_name in ("n_appearances_author", "n_tweets"):
    df = df.withColumn(count_name, F.coalesce(F.col(count_name), F.lit(1)))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [177]:
# Standardise n_appearances_author into n_appearances_author_ss.
scaled_feature = 'n_appearances_author'
if training:
    # Training measures the statistics and records them for the other splits.
    feature_stats = df.select(F.mean(scaled_feature), F.stddev(scaled_feature)).first()
    scaling_dict[scaled_feature] = {'mean': feature_stats[0], 'std': feature_stats[1]}

mean_col = scaling_dict[scaled_feature]['mean']
sttdev_col = scaling_dict[scaled_feature]['std']

# Centre on the mean and divide by two standard deviations.
centered = F.col(scaled_feature) - mean_col
df = df.withColumn(scaled_feature + '_ss', centered / (2*sttdev_col))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [178]:
# Standardise n_tweets into n_tweets_ss.
scaled_feature = 'n_tweets'
if training:
    # Training measures the statistics and records them for the other splits.
    feature_stats = df.select(F.mean(scaled_feature), F.stddev(scaled_feature)).first()
    scaling_dict[scaled_feature] = {'mean': feature_stats[0], 'std': feature_stats[1]}

mean_col = scaling_dict[scaled_feature]['mean']
sttdev_col = scaling_dict[scaled_feature]['std']

# Centre on the mean and divide by two standard deviations.
centered = F.col(scaled_feature) - mean_col
df = df.withColumn(scaled_feature + '_ss', centered / (2*sttdev_col))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [179]:
# The raw counts are no longer needed once their scaled "_ss" versions exist.
df = df.drop("n_appearances_author", "n_tweets")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

**Couples**

In [180]:
# Interaction counts per pair of users, computed only in training and only
# when the HDFS output does not exist yet.
couple_path = "hdfs:///couple_total_count"
if training:
    if hdfs_exists(couple_path):
        print("Already exists")
    else:
        # Rows per ordered pair (user_1 = author of the tweet, user_2 = engaging user).
        couple_count = df.groupby(F.col("engaged_with_user_id_id").alias("user_1"),
                                  F.col("engaging_user_id_id").alias("user_2")).count()
        # Renamed copy of the same counts, used as the right side of the self-join.
        couple_count_2 = couple_count.withColumnRenamed("user_1", "user_1_")
        couple_count_2 = couple_count_2.withColumnRenamed("user_2", "user_2_")
        couple_count_2 = couple_count_2.withColumnRenamed("count", "count_")
        # Match each pair (a, b) with the mirrored pair (b, a). The outer join
        # keeps unmatched rows from both sides.
        couple_total = couple_count.join(couple_count_2,
                                         (couple_count.user_1==couple_count_2.user_2_)&\
                                         (couple_count.user_2==couple_count_2.user_1_), 
                                         how="outer")
        # Total for the pair: whichever side is present, or the sum when both are.
        couple_total = couple_total.withColumn("total_count_couple",
                                               F.when(F.col("count_").isNull(), 
                                                      F.col("count"))\
                                               .otherwise(F.when(F.col("count").isNull(), 
                                                                 F.col("count_"))\
                                                          .otherwise(F.col("count") + F.col("count_"))))
        # Take the ids from the left side when the right side is missing,
        # otherwise from the (swapped) right side.
        couple_total = couple_total.withColumn("user_id_1",
                                               F.when(F.col("count_").isNull(), 
                                                      F.col("user_1"))\
                                               .otherwise(F.col("user_2_")))
        couple_total = couple_total.withColumn("user_id_2",
                                               F.when(F.col("count_").isNull(), 
                                                      F.col("user_2"))\
                                               .otherwise(F.col("user_1_")))
        # Only total_count_couple, user_id_1 and user_id_2 remain after this drop.
        couple_total = couple_total.drop("user_1", "user_2", "count", "user_1_", "user_2_", "count_")
        # No write mode is set; the hdfs_exists guard above is what avoids writing over existing output.
        couple_total.write.option("header", "true").csv(couple_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [181]:
# Re-point couple_path at an S3 location and load the pair totals with an explicit schema.
# NOTE(review): this path uses the "s3://" scheme while the other S3 paths in the
# notebook use "s3a://" — confirm both resolve on the cluster.
couple_path = "s3://bucket-name/final-data/couple-total-count"
schema = StructType([StructField('total_count_couple', IntegerType()),
                     StructField('user_id_1', StringType()),
                     StructField('user_id_2', StringType())])
couple_total = spark.read.option("header", "true").csv(couple_path, schema=schema)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [182]:
# Attach the pair total to each row (author -> user_id_1, engaging user -> user_id_2);
# pairs without a match get a total of 1.
couple_match = ((df.engaged_with_user_id_id == couple_total.user_id_1)
                & (df.engaging_user_id_id == couple_total.user_id_2))
df = df.join(couple_total, couple_match, how="left").drop("user_id_1", "user_id_2")
df = df.withColumn("total_count_couple",
                   F.coalesce(F.col("total_count_couple"), F.lit(1)))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [183]:
# Standardise total_count_couple into total_count_couple_ss.
scaled_feature = 'total_count_couple'
if training:
    # Training measures the statistics and records them for the other splits.
    feature_stats = df.select(F.mean(scaled_feature), F.stddev(scaled_feature)).first()
    scaling_dict[scaled_feature] = {'mean': feature_stats[0], 'std': feature_stats[1]}

mean_col = scaling_dict[scaled_feature]['mean']
sttdev_col = scaling_dict[scaled_feature]['std']

# Centre on the mean and divide by two standard deviations.
centered = F.col(scaled_feature) - mean_col
df = df.withColumn(scaled_feature + '_ss', centered / (2*sttdev_col))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [184]:
# The raw total is no longer needed once total_count_couple_ss exists.
df = df.drop("total_count_couple")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

----

# Join with graph features

In [185]:
# Pick the graph-feature file for the active split (training, then test,
# then submission, otherwise validation).
if training:
    graph_label, graph_file = "Train", "graph_train.csv"
elif test:
    graph_label, graph_file = "Test", "graph_test.csv"
elif submission:
    graph_label, graph_file = "Submission", "graph_sub.csv"
else:
    graph_label, graph_file = "Valid", "graph_val.csv"

print(graph_label)
graph_data_path = os.path.join(graph_features_path, graph_file)
graph_df = spark.read.option("header", "true").csv(graph_data_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Test

In [186]:
# Drop the listed columns, then suffix the two join keys with "_" so they
# stay distinguishable from df's columns of the same name.
graph_df = (
    graph_df
    .drop("engaged_with_user_id_id", "indicator_interaction", "n_engaged_with_id", "n_engaging_id")
    .withColumnRenamed("tweet_id_id", "tweet_id_id_")
    .withColumnRenamed("engaging_user_id_id", "engaging_user_id_id_")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [187]:
# Left-join the graph features on (tweet, engaging user) and remove the
# duplicated key columns afterwards.
graph_match = ((df.tweet_id_id == graph_df.tweet_id_id_)
               & (df.engaging_user_id_id == graph_df.engaging_user_id_id_))
df = df.join(graph_df, graph_match, how="left").drop("tweet_id_id_", "engaging_user_id_id_")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [188]:
# Graph feature columns that are scaled and then dropped in the following cells.
graph_columns = ["jaccard", 'len1', 'len2', 'union', 'intersec']

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [189]:
# Standardise every graph feature into a "<name>_ss" column.
for graph_col in graph_columns:
    if training:
        # Training measures the statistics and records them for the other splits.
        graph_stats = df.select(F.mean(graph_col), F.stddev(graph_col)).first()
        scaling_dict[graph_col] = {'mean': graph_stats[0], 'std': graph_stats[1]}

    mean_col = scaling_dict[graph_col]['mean']
    sttdev_col = scaling_dict[graph_col]['std']

    # Centre on the mean and divide by two standard deviations.
    centered = F.col(graph_col) - mean_col
    df = df.withColumn(graph_col + '_ss', centered / (2*sttdev_col))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [190]:
# The unscaled graph features are no longer needed once their "_ss" versions exist.
df = df.drop(*graph_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [191]:
df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[engaged_with_user_id_id: string, embedding_ors: string, tweet_id_id: string, engaged_with_user_is_verified_bool: string, engaging_user_id_id: string, engaging_user_is_verified_bool: string, engagee_follows_engager_bool: string, hashtagEncoded_unors: string, hashtagSumCount_ss_num: string, hashtagCount_ss_num: string, domainEncoded_unors: string, domainCount_ss_num: string, tweetEncoded_cat: string, languageEncoded_cat: string, tweet_timestamp_day_of_week_cat: string, tweet_timestamp_hour_cat: string, tweet_timestamp_to_engagee_account_creation_ss_num: string, tweet_timestamp_to_engaging_account_creation_ss_num: string, engaged_with_vs_engaging_follower_diff_log_ss_num: string, engaged_with_vs_engaging_following_diff_log_ss_num: string, engaged_follow_diff_log_ss_num: string, engaging_follow_diff_log_ss_num: string, engaged_follower_diff_engaging_following_log_ss_num: string, engaged_following_diff_engaging_follower_log_ss_num: string, engaged_with_user_follower_count_log_ss_n

# Saving scaling_new_features_dict

In [192]:
# Only the training run persists the statistics; the other splits load this
# same object key when they start.
if training: 
    # Saving scaling dictionary to pickle
    save_pkl_to_s3(scaling_dict, key_scaling_new_features, bucket)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### EngagingFollowsEngaged

In [193]:
# Follow relation per (author, engaging user) pair: built once in training,
# then read back from HDFS for every split.
user_1_follows_user_2_path = "hdfs:///user_1_follows_user_2"
if training and hdfs_exists(user_1_follows_user_2_path):
    print("Already exists")
elif training:
    # user_1 is the tweet author, user_2 the engaging user; keep one row per pair.
    follow_pairs = df.select(
        F.col("engaged_with_user_id_id").alias("user_1"),
        F.col("engaging_user_id_id").alias("user_2"),
        F.col("engagee_follows_engager_bool").alias("user1_follows_user2"),
    )
    user_1_follows_user_2 = follow_pairs.dropDuplicates(['user_1', 'user_2'])
    user_1_follows_user_2.write.csv(user_1_follows_user_2_path)

follow_fields = [
    StructField('user_1', StringType()),
    StructField('user_2', StringType()),
    StructField('user1_follows_user2', BooleanType()),
]
schema = StructType(follow_fields)
user_1_follows_user_2 = spark.read.csv(user_1_follows_user_2_path, schema=schema)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [194]:
# Look the pair up in the reverse direction: the engaging user is matched to
# user_1 and the author to user_2.
reverse_match = ((df.engaging_user_id_id == user_1_follows_user_2.user_1)
                 & (df.engaged_with_user_id_id == user_1_follows_user_2.user_2))
df = df.join(user_1_follows_user_2, reverse_match, how="left")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [195]:
# Remove the join keys brought in by the follow-relation table.
df = df.drop("user_1", "user_2")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [196]:
# After the reversed lookup, the flag describes the engaging user following the author.
df = df.withColumnRenamed("user1_follows_user2", "engaging_follows_engaged")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [197]:
# Cast the flag to an integer column; nulls stay null here and are filled in the next cell.
df = df.withColumn("engaging_follows_engaged", F.col("engaging_follows_engaged").cast("integer"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [198]:
# Rows with no value after the join get the extra category 2.
follows_flag = F.col("engaging_follows_engaged")
df = df.withColumn("engaging_follows_engaged", F.coalesce(follows_flag, F.lit(2)))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Map New user buckets

In [199]:
# Load the user -> bucket mapping with an explicit two-column schema.
user_bucket_schema = StructType([
    StructField('user_id', StringType()),
    StructField('final_bucket', IntegerType()),
])
map_user_bucket = spark.read.csv(map_user_bucket_path, schema=user_bucket_schema)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [200]:
# Attach the bucket of the author first, then of the engaging user. Each pass
# joins on the user id, drops the key and renames the bucket column.
bucket_targets = (
    ("engaged_with_user_id_id", "engaged_with_user_id_bucket_2"),
    ("engaging_user_id_id", "engaging_user_id_bucket_2"),
)
for id_column, bucket_column in bucket_targets:
    df = df.join(map_user_bucket, df[id_column] == map_user_bucket.user_id, how="left")
    df = df.drop("user_id").withColumnRenamed("final_bucket", bucket_column)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Validator

In [201]:
# Null check on a sample of the newly created columns; validator returns {} when none contain nulls.
validator(df.select("jaccard_ss", 'total_count_couple'+'_ss', "n_tweets_ss", "engaging_user_id_bucket_2"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{}

In [202]:
df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[engaged_with_user_id_id: string, embedding_ors: string, tweet_id_id: string, engaged_with_user_is_verified_bool: string, engaging_user_id_id: string, engaging_user_is_verified_bool: string, engagee_follows_engager_bool: string, hashtagEncoded_unors: string, hashtagSumCount_ss_num: string, hashtagCount_ss_num: string, domainEncoded_unors: string, domainCount_ss_num: string, tweetEncoded_cat: string, languageEncoded_cat: string, tweet_timestamp_day_of_week_cat: string, tweet_timestamp_hour_cat: string, tweet_timestamp_to_engagee_account_creation_ss_num: string, tweet_timestamp_to_engaging_account_creation_ss_num: string, engaged_with_vs_engaging_follower_diff_log_ss_num: string, engaged_with_vs_engaging_following_diff_log_ss_num: string, engaged_follow_diff_log_ss_num: string, engaging_follow_diff_log_ss_num: string, engaged_follower_diff_engaging_following_log_ss_num: string, engaged_following_diff_engaging_follower_log_ss_num: string, engaged_with_user_follower_count_log_ss_n

# Saving dataset to hdfs

In [203]:
# Choose the label and HDFS destination for the active split (training, then
# test, then submission, otherwise validation) and write the result as CSV
# with a header row.
if training:
    save_label, hdfs_original_path = "Train", "hdfs:///train-complete-cleaned"
elif test:
    save_label, hdfs_original_path = "Test", "hdfs:///test-complete-cleaned"
elif submission:
    save_label, hdfs_original_path = "Submission", "hdfs:///submission-complete-cleaned"
else:
    save_label, hdfs_original_path = "Valid", "hdfs:///val-complete-cleaned"

print(save_label)
df.write.option("header", "true").csv(hdfs_original_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Test