## **Arrays inside columns**

In [1]:
# --- Imports -----------------------------------------------------------
import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession

# --- Spark configuration -----------------------------------------------
APP_NAME = 'pyspark_python'
MASTER = 'local[*]'

# Build the configuration in a single chained expression, then obtain (or
# reuse) the session and expose its SparkContext as `sc`.
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [2]:
from pyspark.sql.functions import split
from pyspark.sql.functions import col
from pyspark.sql.types import *


# Read the CSV using its first row as the header. No schema is supplied, so
# every column is loaded as a string (see the printed schema).
trends = (
    spark.read
    .option("header", "true")
    .csv("../data/df_pd_10000.csv")
)

trends.printSchema()

root
 |-- cod_paisoalf: string (nullable = true)
 |-- cod_entalfa: string (nullable = true)
 |-- cod_persctpn: string (nullable = true)
 |-- cod_idcontra: string (nullable = true)
 |-- serie_saldos: string (nullable = true)
 |-- partition_id: string (nullable = true)
 |-- des_segmento_global: string (nullable = true)
 |-- des_segmento_global_agr: string (nullable = true)
 |-- des_segmento_global_particular_autonomo: string (nullable = true)
 |-- des_segmento_plan_uno: string (nullable = true)
 |-- digital_3_3: string (nullable = true)
 |-- movil_3_3: string (nullable = true)
 |-- edad: string (nullable = true)
 |-- antig_cuentas_mov_anyos: string (nullable = true)
 |-- num_nominas_12m: string (nullable = true)
 |-- num_meses_nomina_12m: string (nullable = true)
 |-- segmento_total_avg_12m: string (nullable = true)
 |-- trend_upwards: string (nullable = true)
 |-- trend_downwards: string (nullable = true)
 |-- mean_diff: string (nullable = true)
 |-- transactionality: string (nullable =

In [21]:
# Display the first row of the raw column without truncating the cell text.
trends.select('serie_saldos').show(n=1, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
# Replace the comma-separated string column with an array<long> column.
# NOTE(review): this rebinds `trends`, so re-running the cell would apply
# `split` to the already-converted array column — presumably an error; confirm
# and restart the kernel before re-running.
trends = trends.withColumn(
    "serie_saldos",
    split(col("serie_saldos"), ",").cast("array<long>"),
)

In [24]:
# Display the first two rows of the converted column without truncation.
trends.select('serie_saldos').show(n=2, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|serie_saldos                                                                                                                                                                                                                                                                                                                                                                                                                              |
+---------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# List the column names of `trends`.
# NOTE(review): the recorded output (['_c0', 'imp_sdopost']) does not match the
# schema printed after loading the CSV, so this output looks stale (the cell
# was apparently run against a different DataFrame). Re-run on a fresh kernel
# to confirm.
trends.columns

['_c0', 'imp_sdopost']

## **Rolling average**

In [6]:
from pyspark.sql import functions as F, Window

In [7]:
# Order = -1  (original author note; its meaning is not evident from this cell)
#
# Number the rows inside each `vars_key` group, ordered by the columns named in
# trans_fields['trans_date'] and trans_fields['cod_order'], and keep only the
# first row of every group. The helper column is dropped afterwards.
# NOTE(review): `vars_key`, `trans_fields` and `input_table_transactions_df`
# are not defined in the visible cells (the recorded run raised NameError).
w = (
    Window()
    .partitionBy(*vars_key)
    .orderBy(F.col(trans_fields['trans_date']), F.col(trans_fields['cod_order']))
)
fictitious_trans_df = (
    input_table_transactions_df
    .withColumn('aux_row_number', F.row_number().over(w))
    .where(F.col('aux_row_number') == 1)
    .drop(F.col('aux_row_number'))
)

NameError: name 'vars_key' is not defined

## **Plots**

In [8]:
import matplotlib.pyplot as plt
import numpy as np
# Fetch the first series of the 'imp_sdopost' column and plot each one.
# NOTE(review): `get_top_series` and `my_plot` are defined in later cells, so
# this cell raises NameError on a fresh top-to-bottom run (as recorded below).
series = get_top_series(trends, 'imp_sdopost', n=1)

for serie in series:
    my_plot(
        y_values=serie,
        xlabel="Time (days)",
        ylabel="Balance",
        format_yaxis=True,
    )

NameError: name 'get_top_series' is not defined

In [None]:
# NOTE(review): never-executed scratch cell; `df`, `c` and `n` are not defined
# in the visible cells. The same expression appears inside get_top_series.
df.select(c).limit(n)

In [38]:
def get_top_series(df, c, n=10):
    """
    Return the first ``n`` values of column ``c`` as a numpy array.

    @Args:
        - df: dataframe to read the time series from (must support
              ``select``, ``limit`` and ``toPandas``)
        - c: name of the column holding the series
        - n: how many rows to take (default: 10)
    @Computes:
        - numpy ndarray with the values of column ``c`` for the first n rows
    """
    first_rows = df.select(c).limit(n)
    # Convert to pandas before extracting the column values.
    as_pandas = first_rows.toPandas()
    return as_pandas[c].values

In [41]:
def my_plot(y_values, x_values=None, xlabel=None, ylabel=None, format_xaxis=False, format_yaxis=False):
    """
    Plot one series in a 14x4 figure and show it.

    @Args:
        - y_values: Serie to plot
        - x_values: x-values of the serie to plot (default: None, in which case
                    only y_values is passed to ``plt.plot``)
        - xlabel: label for the x-axis (default: None)
        - ylabel: label for the y-axis (default: None)
        - format_xaxis: True to print numbers with commas as thousands separators in the x-axis (default: False)
        - format_yaxis: True to print numbers with commas as thousands separators in the y-axis (default: False)
    @Computes:
        - plots the time series
    """

    def _thousands(x, loc):
        # Tick formatter: integer part of the tick value with comma separators.
        return "{:,}".format(int(x))

    plt.figure(figsize=[14, 4])
    # Identity check on purpose: `x_values != None` is evaluated element-wise
    # for numpy arrays, and using that result in `if` raises ValueError.
    if x_values is not None:
        plt.plot(x_values, y_values)
    else:
        plt.plot(y_values)
    if xlabel is not None:
        plt.xlabel(xlabel)
    if ylabel is not None:
        plt.ylabel(ylabel)
    if format_xaxis:
        plt.gca().get_xaxis().set_major_formatter(plt.FuncFormatter(_thousands))
    if format_yaxis:
        plt.gca().get_yaxis().set_major_formatter(plt.FuncFormatter(_thousands))
    plt.tight_layout()
    plt.show()