# Data Pipeline

# Goal
Build the data pipeline for the Twitter RecSys Challenge.

# Methodology
Derive model features from the original challenge data.

## Sections
1. [**Requirements**](#Requirements)
2. [**Functions**](#Functions)
3. [**Inputs**](#Inputs)
4. [**Pipeline**](#Pipeline)
    - [**Indicators**](#Indicators)
    - [**Intention_features**](#Intention_features)
    - [**TopicEncodings**](#TopicEncodings)
    - [**EngagingFollowsEngaged**](#EngagingFollowsEngaged)
    - [**Hashtags**](#Hashtags)
    - [**Domain**](#Domain)
    - [**Language**](#Language)
    - [**Media**](#Media)
    - [**Links**](#Links)
    - [**Tweet_type**](#Tweet_type)
    - [**Timestamp_features**](#Timestamp_features)
    - [**Followers_and_Followings_features**](#Followers_and_Followings_features)
    - [**Quantile_Discretizer**](#Quantile_Discretizer)
    - [**Intentions_join**](#Intentions_join)
5. [**FeatureSelection**](#FeatureSelection)
6. [**Imputation**](#Imputation)
7. [**Validation**](#Validation)
8. [**Saving_df**](#Saving_df)

# Requirements

In [1]:
# Install notebook-scoped packages on the cluster through the SparkContext.
# The versions are pinned to the ones resolved in this notebook's recorded run
# so that a fresh session installs the same dependencies.
sc.install_pypi_package("pandas==1.0.4")
sc.install_pypi_package("boto3==1.13.22")

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1591231044582_0002,pyspark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting pandas
  Using cached pandas-1.0.4-cp36-cp36m-manylinux1_x86_64.whl (10.1 MB)
Collecting python-dateutil>=2.6.1
  Using cached python_dateutil-2.8.1-py2.py3-none-any.whl (227 kB)
Installing collected packages: python-dateutil, pandas
Successfully installed pandas-1.0.4 python-dateutil-2.8.1

Collecting boto3
  Using cached boto3-1.13.22-py2.py3-none-any.whl (128 kB)
Collecting s3transfer<0.4.0,>=0.3.0
  Using cached s3transfer-0.3.3-py2.py3-none-any.whl (69 kB)
Collecting botocore<1.17.0,>=1.16.22
  Using cached botocore-1.16.22-py2.py3-none-any.whl (6.2 MB)
Collecting docutils<0.16,>=0.10
  Using cached docutils-0.15.2-py3-none-any.whl (547 kB)
Collecting urllib3<1.26,>=1.20; python_version != "3.4"
  Using cached urllib3-1.25.9-py2.py3-none-any.whl (126 kB)
Installing collected packages: docutils, urllib3, botocore, s3transfer, boto3
Successfully installed boto3-1.13.22 botocore-1.16.22 docutils-0.15.2 s3transfer-0.3.3 urllib3-1.25.9

In [2]:
# Reconfigure the SparkContext: checkpoints are written under this HDFS directory.
sc.setCheckpointDir('hdfs:///twitter/checkpoints')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
import time
import os
import boto3
import gc
import sys
import numpy as np
import pandas as pd
import pickle
import pyspark
import subprocess
from pyspark.sql import SparkSession
from pyspark.sql.types import (FloatType, DateType, StructType, StructField, StringType, LongType, 
    IntegerType, ArrayType, BooleanType, DoubleType)
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StandardScaler, QuantileDiscretizer
# Make sure the cyclic garbage collector is switched on.
gc.enable()

# Get (or create) the "twitter" session, requesting 1000 shuffle partitions.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", 1000)
    .appName("twitter")
    .getOrCreate()
)
# Echo the effective driver memory and shuffle-partition settings.
for conf_key in ('spark.driver.memory', "spark.sql.shuffle.partitions"):
    print(spark.sparkContext.getConf().get(conf_key))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2048M
1000

# Functions

## Loading data

In [4]:
def parse_data(path='training.tsv', has_labels=True, schema='auto'):
    """
    Read a Twitter RecSys Challenge file into a Spark DataFrame.

    Parameters
    ----------
    path : str
        Location handed to ``spark.read.csv``.
    has_labels : bool
        Forwarded to ``build_schema`` when ``schema`` is ``'auto'``.
    schema : 'auto' or schema object
        Schema to read with; ``'auto'`` builds one through ``build_schema``.

    Returns
    -------
    DataFrame
        The parsed rows, with ``text_tokens``, ``hashtags``,
        ``present_media``, ``present_links`` and ``present_domains`` split on
        tabs into arrays.
    """
    session = SparkSession.builder.appName("twitter").getOrCreate()
    resolved_schema = build_schema(has_labels) if schema == 'auto' else schema
    parsed = session.read.csv(path, schema=resolved_schema, sep='\x01', encoding='utf-8',
                              ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True)
    # Fields are separated by \x01; list-valued fields are tab-separated inside.
    for list_column in ('text_tokens', 'hashtags', 'present_media',
                        'present_links', 'present_domains'):
        parsed = parsed.withColumn(list_column, F.split(list_column, '\t'))
    return parsed

def build_schema(has_labels=True):
    """
    Build the Spark schema of a raw challenge file.

    Parameters
    ----------
    has_labels : bool
        When true, the four engagement timestamp columns (reply, retweet,
        retweet with comment, like) are appended after the 20 feature columns.

    Returns
    -------
    StructType
        One field per column, in file order.
    """
    feature_columns = [
        ('text_tokens', StringType),
        ('hashtags', StringType),
        ('tweet_id', StringType),
        ('present_media', StringType),
        ('present_links', StringType),
        ('present_domains', StringType),
        ('tweet_type', StringType),
        ('language', StringType),
        ('tweet_timestamp', LongType),
        ('engaged_with_user_id', StringType),
        ('engaged_with_user_follower_count', IntegerType),
        ('engaged_with_user_following_count', IntegerType),
        ('engaged_with_user_is_verified', BooleanType),
        ('engaged_with_user_account_creation', LongType),
        ('engaging_user_id', StringType),
        ('engaging_user_follower_count', IntegerType),
        ('engaging_user_following_count', IntegerType),
        ('engaging_user_is_verified', BooleanType),
        ('engaging_user_account_creation', LongType),
        ('engagee_follows_engager', BooleanType),
    ]
    label_columns = [
        ('reply_timestamp', LongType),
        ('retweet_timestamp', LongType),
        ('retweet_with_comment_timestamp', LongType),
        ('like_timestamp', LongType),
    ]
    selected = feature_columns + label_columns if has_labels else feature_columns
    return StructType([StructField(name, dtype()) for name, dtype in selected])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Preprocessing

In [5]:
def save_pkl_to_s3(obj, key_filename, bucket_name):
    """
    Pickle ``obj`` and upload the bytes to S3.

    Parameters
    ----------
    obj : any picklable object
    key_filename : str
        Object key to write.
    bucket_name : str
        Name of the destination bucket.
    """
    payload = pickle.dumps(obj)
    boto3.client('s3').put_object(Bucket=bucket_name, Key=key_filename, Body=payload)
    
def hdfs_exists(path):
    """
    Report whether ``path`` exists according to ``hadoop fs -test -e``.

    Prints a one-line message and returns True when the command exits with
    status 0, False otherwise.
    """
    probe = subprocess.Popen(['hadoop', 'fs', '-test', '-e', path])
    probe.communicate()  # block until the command has finished
    found = probe.returncode == 0
    if found:
        print(f"{path} exist")
    else:
        print(f"{path} does not exist")
    return found
    
def validator(df):
    """
    Count nulls per column and report the columns that contain any.

    All columns are counted in a single aggregation, so the frame is scanned
    once instead of once per column.

    Parameters
    ----------
    df : DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        ``{column_name: null_count}`` for every column whose null count is
        greater than zero, in schema order; empty when no column has nulls.
    """
    names = [field.name for field in df.schema]
    if not names:
        return {}
    # isNull() -> boolean, cast to 0/1, summed: one null counter per column.
    null_counts = df.select(
        [F.sum(F.col(name).isNull().cast("int")).alias(name) for name in names]
    ).first()
    # The sum over an empty frame is null (None), which is skipped like a zero.
    # Results are read by position so the lookup does not depend on the alias.
    return {name: null_counts[position]
            for position, name in enumerate(names)
            if null_counts[position]}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Inputs

In [110]:
# Split sizes per sample suffix; looked up below through suffix_sample.
dictionary_size={"final-complete": {"val_size": 500000, 
                                    "train_size": "all"}}

# Run-mode flags. Later cells test them in the order submission, test,
# training, smaller_train; when none is set the validation split is processed.
training = False
smaller_train = True
submission = False
test = False

# S3 bucket name and resource handle used for the pickled artifacts.
bucket='bucket-name'
s3_resource = boto3.resource('s3')
# NOTE(review): the top_k_* values are not referenced in the cells visible
# here; presumably consumed by encoding steps elsewhere — confirm.
top_k_languages = 30
top_k_domains = 3000
top_k_hashtags = 13000

# Embeddings
# Modulus applied to hash(engaging_user_id) when building the repartition key.
num_partitions=1000

# Buckets
partition_per_cluster = 100

# suffix_sample must be a key of dictionary_size; only "final-complete" is
# defined above, so the other names listed here would raise KeyError below.
suffix_sample = "final-complete" #"full", "small", "medium", "sub_medium"
data_path = "final-data"
object_paths = "final-artifacts"

val_size = dictionary_size[suffix_sample]["val_size"]
train_size = dictionary_size[suffix_sample]["train_size"]

bucket_s3 = s3_resource.Bucket(bucket)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

**Paths**

In [111]:
# S3 root of the project bucket (s3a scheme).
twitter_bucket_s3 = "s3a://bucket-name"
# Raw challenge files.
# NOTE(review): `trainining_path` is misspelled; left unchanged because cells
# outside this view may reference the name.
trainining_path = os.path.join(twitter_bucket_s3, "data", "raw", "final", "training.tsv")
submission_path = os.path.join(twitter_bucket_s3, "data", "raw", "final", "submission.tsv")
test_path = os.path.join(twitter_bucket_s3, "data", "raw", "final", "test.tsv")

# Split paths (train / validation), keyed by the sample suffix.
train_path = os.path.join(twitter_bucket_s3, data_path, "train-"+suffix_sample)
val_path = os.path.join(twitter_bucket_s3, data_path, "val-"+suffix_sample)

# Processed outputs: plain, with embeddings ("processed-embeddings-final"),
# and with topics ("processed-topics").
# NOTE(review): processed_train_path points at the "smaller-train-" prefix —
# confirm this is intended.
processed_train_path = os.path.join(twitter_bucket_s3, data_path, "processed", "smaller-train-"+suffix_sample)
processed_val_path = os.path.join(twitter_bucket_s3, data_path, "processed", "val-"+suffix_sample)
processed_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed", "submission-"+suffix_sample)
processed_test_path = os.path.join(twitter_bucket_s3, data_path, "processed", "test-"+suffix_sample)
processed_emb_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                        "train-"+suffix_sample)
processed_emb_smaller_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                        "smaller-train-"+suffix_sample)
processed_emb_val_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                      "val-"+suffix_sample)
processed_emb_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                         "submission-"+suffix_sample)
processed_emb_test_path = os.path.join(twitter_bucket_s3, data_path, "processed-embeddings-final", 
                                         "test-"+suffix_sample)
processed_top_train_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                        "train-"+suffix_sample)
processed_top_val_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                      "val-"+suffix_sample)
processed_top_submission_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                             "submission-"+suffix_sample)
processed_top_test_path = os.path.join(twitter_bucket_s3, data_path, "processed-topics", 
                                             "test-"+suffix_sample)
# Resources: per-user lookup tables stored under the data prefix.
engaging_users_training_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-training")
engaging_users_submission_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-submission")
engaging_users_test_path = os.path.join(twitter_bucket_s3, data_path, "engaging-users-test")
intentions_path = os.path.join(twitter_bucket_s3, data_path, "intentions-"+suffix_sample)
map_user_bucket_path = os.path.join(twitter_bucket_s3, data_path, "map_user_bucket")

topic_encodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "user_topics")
users_intime_path = os.path.join(twitter_bucket_s3, data_path, "users_intime-"+suffix_sample)

# Object keys of pickled artifacts. These are keys relative to the bucket (no
# s3a:// prefix), as expected by the boto3 calls that take a bucket name.
key_hashtag_mapping = os.path.join(object_paths, f'hashtag_mapping_{suffix_sample}.pkl')
key_domain_mapping = os.path.join(object_paths, f'domain_mapping_{suffix_sample}.pkl')
key_language_mapping = os.path.join(object_paths, f'language_mapping_{suffix_sample}.pkl')
key_hashtag_count = os.path.join(object_paths, f'hashtag_count_{suffix_sample}.pkl')
key_domain_count = os.path.join(object_paths, f'domain_count_{suffix_sample}.pkl')
key_scaling_features = os.path.join(object_paths, f'scaling_dictionary_{suffix_sample}.pkl')
key_diff_min = os.path.join(object_paths, f'diff_min_{suffix_sample}.pkl')
key_impute_perc = os.path.join(object_paths, f'dict_mean_perc_{suffix_sample}.pkl')
key_topiccount = os.path.join(object_paths, f'topiccount_{suffix_sample}.pkl')

# Full s3a paths (bucket + key), one per count/creation-date column.
# NOTE(review): the "qs_" prefix and the imported QuantileDiscretizer suggest
# these hold fitted discretizers — confirm against the cell that uses them.
# The loop leaves `col` and `columns` defined as notebook-level names.
columns = ["engaged_with_user_follower_count", "engaged_with_user_following_count",
           "engaged_with_user_account_creation", "engaging_user_follower_count",
           "engaging_user_following_count", "engaging_user_account_creation"]
qds_paths = {}
for col in columns:
    qds_paths[col] = os.path.join(twitter_bucket_s3, object_paths, f"qs_{suffix_sample}_" + col)
    
# Bucket pipeline
users_buckets = os.path.join(twitter_bucket_s3, data_path, "users_buckets") #
users_buckets_part_2 = os.path.join(twitter_bucket_s3, data_path, "users_buckets_part_2") #

pipeline_kmeans_path = os.path.join(twitter_bucket_s3, object_paths, "pipeline_id_encoding")
cluster_map_path = os.path.join(twitter_bucket_s3, data_path, "cluster_map")

# Embeddings: text-encoding inputs for the train, submission and test sets.
bert_embeddings_train = os.path.join(twitter_bucket_s3, "data", "textEncodings", "tweets_extended")
submission_rawTweetEncodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", 
                                                 "submission-tweets-extended")
test_rawTweetEncodings_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "test-tweets-extended")

# Topics pipeline: parquet table read by the "Load reducedTopics" cell.
reduced_topics_path = os.path.join(twitter_bucket_s3, "data", "textEncodings", "reducedTopics")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Load intentions

In [112]:
if training:
    # Read the per-user intention table; only the user ids are kept here,
    # tagged with the hash bucket that serves as the repartition key.
    schema = StructType(
        [StructField('engaging_user_id', StringType()),
         StructField('total_appearance', LongType())]
        + [StructField(perc_name, DoubleType())
           for perc_name in ('perc_n_interactions', 'perc_n_commented', 'perc_n_liked',
                             'perc_n_replied', 'perc_n_retweeted')]
    )
    intention_df = spark.read.csv(intentions_path, schema=schema)
    user_id_train = (
        intention_df.select("engaging_user_id")
        .withColumn("hash_engaging_user_id",
                    F.abs(F.hash("engaging_user_id")%num_partitions))
        .repartition("hash_engaging_user_id")
    )

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Load reducedTopics

In [113]:
if training:
    # Load the reduced topic table and keep, per user: the five rp* topic ids
    # and counts (renamed to "<slot>_topic" / "<slot>_topiccount") plus the
    # topic proportions of all four prefixes (rp, rt, rtc, lk).
    # The frame is repartitioned on the user hash bucket, which is then renamed
    # with a "_1" suffix.
    reduced_topics = (
        spark.read.parquet(reduced_topics_path)
        .drop("hash")
        .select(
            'engaging_user_id',
            *[F.col(f'rp{slot}_topic').alias(f'{slot}_topic') for slot in range(5)],
            *[F.col(f'rp{slot}_topiccount').alias(f'{slot}_topiccount') for slot in range(5)],
            *[f'{prefix}{slot}_topicprop'
              for prefix in ('rp', 'rt', 'rtc', 'lk') for slot in range(5)]
        )
        .withColumn("hash_engaging_user_id",
                    F.abs(F.hash("engaging_user_id")%num_partitions))
        .repartition("hash_engaging_user_id")
        .withColumnRenamed("hash_engaging_user_id", "hash_engaging_user_id_1")
    )

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Joining limited users

In [114]:
# HDFS location of the per-user topic table restricted to training users:
# written by the training branch and read back before the topic join.
reduced_topics_train_path = "hdfs:///reduced_topics_train"
# Topic-count columns; they are cast to double after the CSV round trip.
topiccount_cols = ["0_topiccount", "1_topiccount", "2_topiccount", "3_topiccount", "4_topiccount"]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [115]:
if training:
    topiccount_cols = ["0_topiccount", "1_topiccount", "2_topiccount", "3_topiccount", "4_topiccount"]
    # Keep one row per training user; users without topic data get nulls.
    reduced_topics_train = user_id_train.join(reduced_topics, 
                                              on="engaging_user_id", how="left")
    # Keep the hash bucket computed on the user side, under the "_1" name.
    reduced_topics_train = reduced_topics_train.drop("hash_engaging_user_id_1")
    reduced_topics_train = reduced_topics_train.withColumnRenamed("hash_engaging_user_id", "hash_engaging_user_id_1")

    # Apply log(x + 1) to EVERY count column. Each iteration must build on the
    # previous result: restarting from reduced_topics_train every time leaves
    # only the last column transformed, so the statistics below would mix raw
    # and log-scaled counts.
    reduced_topics_train_log = reduced_topics_train
    for col_i in topiccount_cols:
        reduced_topics_train_log = reduced_topics_train_log.withColumn(col_i, F.log(F.col(col_i)+1))
        
    # Stack the five log-count columns into one column to get pooled statistics.
    topiccount_0 = reduced_topics_train_log.select(F.col("0_topiccount").alias("topiccount"))
    topiccount_1 = reduced_topics_train_log.select(F.col("1_topiccount").alias("topiccount"))
    topiccount_2 = reduced_topics_train_log.select(F.col("2_topiccount").alias("topiccount"))
    topiccount_3 = reduced_topics_train_log.select(F.col("3_topiccount").alias("topiccount"))
    topiccount_4 = reduced_topics_train_log.select(F.col("4_topiccount").alias("topiccount"))
    
    topiccount = topiccount_0.union(topiccount_1)
    topiccount = topiccount.union(topiccount_2)
    topiccount = topiccount.union(topiccount_3)
    topiccount = topiccount.union(topiccount_4)

    topiccount_mean, topiccount_std = topiccount.select(F.mean("topiccount"), F.stddev("topiccount")).first()
    # Saving scaling features
    scaling_topiccount_dict = {'mean': topiccount_mean, 'std': topiccount_std}
    save_pkl_to_s3(scaling_topiccount_dict, key_topiccount, bucket)
    # The table is stored with the raw (untransformed) counts; the log and the
    # scaling are applied after the join with the main frame.
    reduced_topics_train.write.option("header", "true").csv(reduced_topics_train_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Load training

In [116]:
# Pick the HDFS staging directory for this run. The flags are checked in
# priority order (submission, test, training, smaller_train); with none set
# the validation split is used.
run_mode = ("Submission" if submission
            else "Test" if test
            else "Train" if training
            else "Smaller Train" if smaller_train
            else "Valid")
print(run_mode)
current_hdfs_path = {"Submission": "hdfs:///submission-df",
                     "Test": "hdfs:///test-df",
                     "Train": "hdfs:///train-df",
                     "Smaller Train": "hdfs:///smaller-train-df",
                     "Valid": "hdfs:///valid-df"}[run_mode]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Smaller Train

In [117]:
# Choose which processed-with-embeddings dataset to load, using the same flag
# priority as the staging-path selection, then read it once.
if submission:
    print("Submission")
    processed_input_path = processed_emb_submission_path
elif test:
    print("Test")
    processed_input_path = processed_emb_test_path
elif training:
    print("Train")
    processed_input_path = processed_emb_train_path
elif smaller_train:
    print("Smaller Train")
    processed_input_path = processed_emb_smaller_train_path
else:
    print("Valid")
    processed_input_path = processed_emb_val_path

df = spark.read.option("header","true").csv(processed_input_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Smaller Train

In [118]:
# Tag each row with the hash bucket of its engaging user, repartition on that
# bucket, and stage the result on HDFS as CSV with a header row.
df = (
    df.withColumn("hash_engaging_user_id",
                  F.abs(F.hash("engaging_user_id_id")%num_partitions))
      .repartition("hash_engaging_user_id")
)
df.write.option("header","true").csv(current_hdfs_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [119]:
# Drop the reference to the staged frame; it is read back from HDFS further down.
del df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

------

# DF JOIN TOPICS

In [120]:
# Load the topic-count scaling statistics ({'mean': ..., 'std': ...}) from S3.
# NOTE(review): pickle.loads can execute arbitrary code from the payload; this
# is only safe while the bucket is written by trusted pipeline runs alone.
scaling_topiccount_dict = pickle.loads(s3_resource.Bucket(bucket).Object(key_topiccount).get()['Body'].read())
# isinstance rather than an exact type() comparison, plus a check for the two
# keys the scaling step reads, so a bad artifact fails here and not mid-pipeline.
assert isinstance(scaling_topiccount_dict, dict), "topiccount scaling artifact is not a dict"
assert {"mean", "std"} <= set(scaling_topiccount_dict), "topiccount scaling artifact lacks 'mean'/'std'"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [121]:
# Read the per-user topic table back from HDFS and partition it on the user
# hash bucket. CSV carries no types, so the count columns are cast to double.
reduced_topics_train = (
    spark.read.option("header", "true")
    .csv(reduced_topics_train_path)
    .repartition("hash_engaging_user_id_1")
)
for col_i in topiccount_cols:
    reduced_topics_train = reduced_topics_train.withColumn(col_i, F.col(col_i).cast("double"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [122]:
# Reload the staged frame from HDFS and repartition it on the user hash bucket ahead of the join.
df = spark.read.option("header", "true").csv(current_hdfs_path).repartition("hash_engaging_user_id")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [123]:
# Sanity check: show how many partitions the reloaded frame has.
df.rdd.getNumPartitions()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1000

In [124]:
# Left-join the per-user topic features onto the main frame (rows whose user
# has no topic row get nulls), then drop the duplicated key and the hash
# helper columns.
df = (
    df.join(reduced_topics_train,
            df.engaging_user_id_id==reduced_topics_train.engaging_user_id,
            how="left")
      .drop("engaging_user_id", "hash_engaging_user_id_1", "hash_engaging_user_id")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [125]:
# Group the topic feature columns by name pattern so each group can be
# imputed with its own rule.
topic_columns = [name for name in df.columns if name.endswith("_topic")]
topic_prop_columns = [name for name in df.columns if "topicprop" in name]
topic_count_columns = [name for name in df.columns if "topiccount" in name]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [126]:
# Replace missing topic ids (nulls left by the left join) with the placeholder 150.
# NOTE(review): 150 looks like an "unknown topic" id — confirm it cannot
# collide with a real topic id.
for col_i in topic_columns:
    df = df.withColumn(col_i, F.when(F.col(col_i).isNotNull(), F.col(col_i)).otherwise(150))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [127]:
# Missing topic proportions default to 0.0.
for col_i in topic_prop_columns:
    df = df.withColumn(col_i, F.when(F.col(col_i).isNotNull(), F.col(col_i)).otherwise(0.0))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [128]:
for col_i in topic_count_columns:
    # Impute missing counts with 0.0, apply log(x + 1), then standardise with
    # the stored statistics: centre on the mean, divide by two std deviations.
    imputed_count = F.when(F.col(col_i).isNotNull(), F.col(col_i)).otherwise(0.0)
    df = df.withColumn(
        col_i,
        (F.log(imputed_count+1)-scaling_topiccount_dict["mean"])/(2*scaling_topiccount_dict["std"]))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [129]:
# Spot-check one column from each imputed group (topic id, count, proportion);
# an empty dict means none of them contains nulls.
validator(df.select("3_topic", "2_topiccount", "rp1_topicprop"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{}

### Saving in hdfs

In [130]:
# Choose the HDFS output directory for this run (same flag priority as when
# loading), then write the frame once as CSV with a header row.
if submission:
    print("Submission")
    final_hdfs_path = "hdfs:///submission-final-topics"
elif test:
    print("Test")
    final_hdfs_path = "hdfs:///test-final-topics"
elif training:
    print("Train")
    final_hdfs_path = "hdfs:///train-final-topics"
elif smaller_train:
    print("Smaller Train")
    final_hdfs_path = "hdfs:///smaller-train-final-topics"
else:
    print("Valid")
    final_hdfs_path = "hdfs:///val-final-topics"

df.write.option("header","true").csv(final_hdfs_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Smaller Train

## Copy to hdfs

In [None]:
# Explicit schema of the processed frame, including the joined topic features.
# Two fixes over the pasted Scala-style definition: Python spells the nullable
# flag `True` (a bare `true` raises NameError), and StructType takes a flat
# list of StructField objects rather than a list wrapping a single tuple.
schema = StructType(
    [StructField(field_name, field_type(), True) for field_name, field_type in [
        ("text_tokens_ors", StringType),
        ("tweet_id_id", StringType),
        ("engaged_with_user_id_id", StringType),
        ("engaged_with_user_is_verified_bool", BooleanType),
        ("engaging_user_id_id", StringType),
        ("engaging_user_is_verified_bool", BooleanType),
        ("engagee_follows_engager_bool", BooleanType),
        ("hashtagEncoded_unors", StringType),
        ("hashtagSumCount_ss_num", DoubleType),
        ("hashtagCount_ss_num", DoubleType),
        ("domainEncoded_unors", StringType),
        ("domainCount_ss_num", DoubleType),
        ("tweetEncoded_cat", IntegerType),
        ("languageEncoded_cat", StringType),
        ("tweet_timestamp_day_of_week_cat", StringType),
        ("tweet_timestamp_week_of_month_cat", StringType),
        ("tweet_timestamp_hour_cat", StringType),
        ("tweet_timestamp_to_engagee_account_creation_ss_num", DoubleType),
        ("tweet_timestamp_to_engaging_account_creation_ss_num", DoubleType),
        ("engaged_with_vs_engaging_follower_diff_log_ss_num", DoubleType),
        ("engaged_with_vs_engaging_following_diff_log_ss_num", DoubleType),
        ("engaged_follow_diff_log_ss_num", DoubleType),
        ("engaging_follow_diff_log_ss_num", DoubleType),
        ("engaged_follower_diff_engaging_following_log_ss_num", DoubleType),
        ("engaged_following_diff_engaging_follower_log_ss_num", DoubleType),
        ("engaged_with_user_follower_count_log_ss_num", DoubleType),
        ("engaging_user_follower_count_log_ss_num", DoubleType),
        ("engaged_with_user_following_count_log_ss_num", DoubleType),
        ("engaging_user_following_count_log_ss_num", DoubleType),
        ("PhotoCount_ss_num", DoubleType),
        ("VideoCount_ss_num", DoubleType),
        ("GIFCount_ss_num", DoubleType),
        ("linkCount_ss_num", DoubleType),
        ("engaged_with_user_follower_count_q_cat", DoubleType),
        ("engaged_with_user_following_count_q_cat", DoubleType),
        ("engaged_with_user_account_creation_q_cat", DoubleType),
        ("engaging_user_follower_count_q_cat", DoubleType),
        ("engaging_user_following_count_q_cat", DoubleType),
        ("engaging_user_account_creation_q_cat", DoubleType),
        ("total_appearance_ss_num", DoubleType),
        ("perc_n_interactions_ss_num", DoubleType),
        ("perc_n_commented_ss_num", DoubleType),
        ("perc_n_liked_ss_num", DoubleType),
        ("perc_n_replied_ss_num", DoubleType),
        ("perc_n_retweeted_ss_num", DoubleType),
        ("indicator_reply", IntegerType),
        ("indicator_retweet", IntegerType),
        ("indicator_retweet_with_comment", IntegerType),
        ("indicator_like", IntegerType),
        ("indicator_interaction", IntegerType),
        ("engaged_with_user_id_bucket", IntegerType),
        ("engaging_user_id_bucket", IntegerType),
        ("hash_engaging_user_id", IntegerType),
    ]]
    # Topic features: for each prefix (rp, rt, rtc, lk) five topic ids, then
    # five topic counts, then five topic proportions, e.g. rp0_topic ..
    # rp4_topic, rp0_topiccount .. rp4_topiccount, rp0_topicprop .. rp4_topicprop.
    + [StructField(f"{prefix}{slot}_{feature}", feature_type(), True)
       for prefix in ("rp", "rt", "rtc", "lk")
       for feature, feature_type in (("topic", IntegerType),
                                     ("topiccount", DoubleType),
                                     ("topicprop", DoubleType))
       for slot in range(5)]
)

In [10]:
def build_processed_schema(has_labels=True):
    """Build the Spark ``StructType`` describing the processed feature table.

    The column order is fixed: a shared run of feature columns, then
    (only when ``has_labels`` is truthy) the five ``indicator_*`` integer
    columns, then the two user-id bucket columns and ``cluster_cat``.

    Parameters
    ----------
    has_labels : bool, default True
        When truthy, the ``indicator_*`` columns are included in the schema;
        otherwise they are left out and every other column is unchanged.

    Returns
    -------
    StructType
        One ``StructField`` per column. No ``nullable`` argument is passed,
        so ``StructField``'s own default applies to every field.
    """
    # Columns that come before the (optional) indicator block.
    leading_columns = [
        ('embedding_ors', StringType),
        ('tweet_id_id', StringType),
        ('engaged_with_user_id_id', StringType),
        ('engaged_with_user_is_verified_bool', BooleanType),
        ('engaging_user_id_id', StringType),
        ('engaging_user_is_verified_bool', BooleanType),
        ('engagee_follows_engager_bool', BooleanType),
        ('hashtagEncoded_unors', StringType),
        ('hashtagSumCount_ss_num', DoubleType),
        ('hashtagCount_ss_num', DoubleType),
        ('domainEncoded_unors', StringType),
        ('domainCount_ss_num', DoubleType),
        ('tweetEncoded_cat', IntegerType),
        ('languageEncoded_cat', StringType),
        ('tweet_timestamp_day_of_week_cat', StringType),
        ('tweet_timestamp_week_of_month_cat', StringType),
        ('tweet_timestamp_hour_cat', StringType),
        ('tweet_timestamp_to_engagee_account_creation_ss_num', DoubleType),
        ('tweet_timestamp_to_engaging_account_creation_ss_num', DoubleType),
        ('engaged_with_vs_engaging_follower_diff_log_ss_num', DoubleType),
        ('engaged_with_vs_engaging_following_diff_log_ss_num', DoubleType),
        ('engaged_follow_diff_log_ss_num', DoubleType),
        ('engaging_follow_diff_log_ss_num', DoubleType),
        ('engaged_follower_diff_engaging_following_log_ss_num', DoubleType),
        ('engaged_following_diff_engaging_follower_log_ss_num', DoubleType),
        ('engaged_with_user_follower_count_log_ss_num', DoubleType),
        ('engaging_user_follower_count_log_ss_num', DoubleType),
        ('engaged_with_user_following_count_log_ss_num', DoubleType),
        ('engaging_user_following_count_log_ss_num', DoubleType),
        ('PhotoCount_ss_num', DoubleType),
        ('VideoCount_ss_num', DoubleType),
        ('GIFCount_ss_num', DoubleType),
        ('linkCount_ss_num', DoubleType),
        ('engaged_with_user_follower_count_q_cat', DoubleType),
        ('engaged_with_user_following_count_q_cat', DoubleType),
        ('engaged_with_user_account_creation_q_cat', DoubleType),
        ('engaging_user_follower_count_q_cat', DoubleType),
        ('engaging_user_following_count_q_cat', DoubleType),
        ('engaging_user_account_creation_q_cat', DoubleType),
        ('total_appearance_ss_num', DoubleType),
        ('perc_n_interactions_ss_num', DoubleType),
        ('perc_n_commented_ss_num', DoubleType),
        ('perc_n_liked_ss_num', DoubleType),
        ('perc_n_replied_ss_num', DoubleType),
        ('perc_n_retweeted_ss_num', DoubleType),
    ]

    # Columns present only in the labelled variant of the schema.
    indicator_columns = [
        ('indicator_reply', IntegerType),
        ('indicator_retweet', IntegerType),
        ('indicator_retweet_with_comment', IntegerType),
        ('indicator_like', IntegerType),
        ('indicator_interaction', IntegerType),
    ]

    # Columns that always close the schema.
    trailing_columns = [
        ('engaged_with_user_id_bucket', IntegerType),
        ('engaging_user_id_bucket', IntegerType),
        ('cluster_cat', LongType),
    ]

    # The indicator block sits between the shared head and tail, so the
    # relative position of every other column is the same in both variants.
    optional_columns = indicator_columns if has_labels else []
    layout = leading_columns + optional_columns + trailing_columns

    return StructType([StructField(column_name, column_type())
                       for column_name, column_type in layout])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…