In [1]:
import pandas as pd
import numpy as np

from scipy.stats import norm, binom, poisson

In [2]:
# Number of synthetic rows to generate.
N = 10 ** 4

# Presumably the seed for the random generators -- TODO confirm that it is
# actually handed to them.
seed = 0

In [3]:
# One seeded generator is shared by every draw below, so re-running the cell
# reproduces the same dataset. Sharing a single generator (instead of
# re-seeding each call with the same value) keeps the columns independent.
rng = np.random.RandomState(seed)

# Generate synthetic MOU; negative draws are replaced with NaN
mou = norm.rvs(loc=40, scale=90, size=N, random_state=rng)
mou[mou < 0] = np.nan

# Generate synthetic MBOU; negative draws are clipped to 0
mbou = norm.rvs(loc=600, scale=300, size=N, random_state=rng)
mbou[mbou < 0] = 0

# Generate synthetic SOU
sou = poisson.rvs(mu=0.99, loc=0, size=N, random_state=rng)

# Generate synthetic fl_aparelho (0/1 flag)
fl_aparelho = binom.rvs(n=1, p=0.4, size=N, random_state=rng)

# Generate synthetic fl_4g_plano (0/1 flag)
fl_4g_plano = binom.rvs(n=1, p=0.7, size=N, random_state=rng)

# Generate synthetic fl_3gplus (0/1 flag)
fl_3gplus = binom.rvs(n=1, p=0.8, size=N, random_state=rng)

# Join dataset
# NOTE(review): the last key is spelled 'fl_3g_plus' while the variable is
# named fl_3gplus -- confirm which spelling downstream code expects.
synthetic_columns = {'mou': mou,
                     'mbou': mbou,
                     'sou': sou,
                     'fl_aparelho': fl_aparelho,
                     'fl_4g_plano': fl_4g_plano,
                     'fl_3g_plus': fl_3gplus}

# Convert the dict of arrays to a pandas DataFrame
synthetic_pdf = pd.DataFrame(synthetic_columns)

# Generate Spark DataFrame; `data` is only ever bound to the Spark DataFrame
data = sqlContext.createDataFrame(synthetic_pdf)

In [4]:
# Print the first rows of the Spark DataFrame as a text table
data.show()

In [5]:
# Show data types: print every column with its Spark type
data.printSchema()

# Make simple describe of data: describe() builds a summary DataFrame,
# which is kept in `desc` and then printed
desc = data.describe()
desc.show()

In [6]:
from pyspark.ml.feature import VectorAssembler, PCA

# First assemble data: pack the three usage columns into one vector column.
# NOTE(review): 'mou' holds NaN for the draws that were negative when the data
# was generated -- confirm PCA copes with NaN features as intended.
assembler = VectorAssembler(inputCols=['mbou','mou','sou'],
                            outputCol='features')
out = assembler.transform(data)

# Do PCA, keeping two components in the 'pca' vector column
pca = PCA(k=2,inputCol='features',outputCol='pca')
model = pca.fit(out)
pca_out = model.transform(out)

# Plot PCA: unpack the 2-element 'pca' vector into two scalar columns.
# The mapping goes through .rdd because DataFrame.map was removed in Spark 2.0
# (.rdd is available on older releases too). It emits plain tuples because
# building rows from dicts is deprecated; the column names come from `schema`.
pca_points = pca_out.select('pca').rdd.map(
    lambda row: (row[0].array.item(0), row[0].array.item(1)))
aux = pca_points.toDF(schema=['pca1','pca2'])
display(aux)

In [7]:
from pyspark.sql.functions import percent_rank, ntile
from pyspark.sql.window import Window

# Window spec with no partition columns, ordered by 'mou': window functions
# applied over it treat the whole DataFrame as a single ordered group.
w = Window.partitionBy().orderBy('mou')

In [8]:
# Parenthesised form runs on both Python 2 and Python 3 (the bare
# `print w` statement is a SyntaxError on Python 3).
print(w)

In [9]:
# Attach to every 'mou' value its percent rank over the window `w`.
ptile_col = percent_rank().over(w).alias('ptile')
ptiles = data.select('mou', ptile_col)

# Example for the 99th percentile: show one row whose rank is above 0.99
above_p99 = ptiles.select('mou', 'ptile').where('round(float(ptile), 10)>0.99')
above_p99.show(1)

In [10]:
# Preview of the percentile grid 0.0, 0.1, ..., 1.0
np.arange(0, 110, 10)/float(100)

In [11]:
# For every target rank in `ran`, keep the 'mou' value of the first row whose
# percent rank reaches that target.
# NOTE(review): this assumes rows come back in ascending 'ptile' order -- confirm.
mou_perc = []  # empty list, filled in the loop below

ran = np.arange(0, 110, 10)/float(100)

for r in ran:
    reached = ptiles.select('mou', 'ptile').where("round(float(ptile), 10)>='"+str(r)+"'").toPandas()
    # .iloc[0, 0] is the first row / first column ('mou') of the result.
    # It replaces .ix, which is deprecated and removed in pandas 1.0.
    mou_perc.append(reached.iloc[0, 0])

perc_10 = pd.DataFrame({'mou':mou_perc,
                        'ptile': ran})


In [12]:
# Display the table of 'mou' values per percentile step
perc_10

In [13]:
# For every target rank in `ran4` (0, 0.25, 0.5, 0.75, 1), keep the 'mou'
# value of the first row whose percent rank reaches that target.
# NOTE(review): this assumes rows come back in ascending 'ptile' order -- confirm.
mou_perc4 = []  # empty list, filled in the loop below

ran4 = np.arange(0, 110, 25)/float(100)

for r in ran4:
    reached = ptiles.select('mou', 'ptile').where("round(float(ptile), 10)>='"+str(r)+"'").toPandas()
    # .iloc[0, 0] is the first row / first column ('mou') of the result.
    # It replaces .ix, which is deprecated and removed in pandas 1.0.
    mou_perc4.append(reached.iloc[0, 0])

quartiles = pd.DataFrame({'mou':mou_perc4,
                          'ptile': ran4})

quartiles

In [14]:
# Daniel's function.

import pandas as pd
import numpy as np

def get_ntiles(df, n=4, cols=None):
    """Return the upper bound of each of the ``n`` n-tiles of the given columns.

    Each column is ordered on its own and split into ``n`` buckets with the
    ``ntile`` window function; the maximum value of every bucket is reported.

    Parameters
    ----------
    df : Spark DataFrame
        Source data.
    n : int, default 4
        Number of buckets (4 -> quartiles, 10 -> deciles, ...).
    cols : list of str, optional
        Columns to summarise. Defaults to every column of ``df``.

    Returns
    -------
    pandas.DataFrame
        One column per entry in ``cols`` and one row per bucket, indexed by the
        cumulative percentage label (e.g. ``'25.0%'``); the index is named
        ``'<n>-tile'``.

    Raises
    ------
    ValueError
        If fewer than ``n`` buckets come back (the labels then cannot be
        aligned with the rows).
    """
    if cols is None:
        cols = df.columns

    # A list (not a lazy `map` object) so it can be assigned as a column on
    # Python 3 as well as Python 2.
    labels = [str(x) + '%' for x in np.arange(1, n + 1) / float(n) * 100]
    index_name = str(n) + '-tile'

    maxima_by_col = {}
    for c in cols:
        w = Window.partitionBy().orderBy(c)
        bucketed = df.select(ntile(n).over(w).alias('ntile'), c)
        # groupby gives no ordering guarantee, so sort by bucket number before
        # collecting; otherwise maxima may be paired with the wrong label.
        maxima = bucketed.groupby('ntile').max(c).orderBy('ntile').toPandas()
        # Second column holds max(c); .iloc replaces the removed .ix indexer.
        maxima_by_col[c] = maxima.iloc[:, 1]

    result = pd.DataFrame(maxima_by_col)
    result[index_name] = labels
    result = result.set_index(index_name)
    #result = result.round(3)
    return result

In [15]:
# 20-tiles (5% steps) of the 'mou' column
get_ntiles(data, 20, cols=['mou'])

In [16]:
import matplotlib.pyplot as plt
import numpy as np

# Bring the Spark DataFrame to the driver as a pandas DataFrame for plotting
data_pd = data.toPandas()

# Draw through the Axes object so `ax` keeps pointing at the Axes instead of
# being overwritten with the dict that boxplot() returns.
fig, ax = plt.subplots()
mbou_boxplot = ax.boxplot(data_pd['mbou'])
ax.set_ylim((-500, 2000))
ax.set_xticklabels(['mbou'])
ax.set_title('Boxplot of mbou')

display(fig)

In [17]:
# 'mou' holds NaN for the draws that were negative when the data was
# generated. boxplot() cannot compute quartiles on NaN and would leave that
# box empty, so the missing values are dropped before plotting.
data_to_plot = [data_pd['mou'].dropna(), data_pd['mbou']]

fig, ax = plt.subplots()

# Draw on the explicit Axes rather than through the pyplot state machine
bp = ax.boxplot(data_to_plot, widths=0.6)
ax.set_xticklabels(['mou', 'mbou'])
ax.set_title('Boxplot of mou and mbou')

# Restyle the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#7570b3', linewidth=2)

# Restyle the outlier markers
for flier in bp['fliers']:
    flier.set(marker='o', color='#e7298a', alpha=0.5)

ax.set_ylim((-500, 2500))


display(fig)