# Scikit-learn pipeline

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import DictVectorizer

ds = pd.read_csv('titanic.csv')
features = ds.drop(['survived', 'alive'], axis=1)

empty_space = FunctionTransformer(
    lambda x: x.replace(r'\s+', np.nan, regex=True), validate=False)
df2dict = FunctionTransformer(
    lambda x: x.to_dict(orient='records'), validate=False)

pl = Pipeline([
    ('empty_space', empty_space),
    ('to_dict', df2dict),
    ('dv', DictVectorizer(sparse=False)),
    ('na', Imputer(strategy='most_frequent')),
    ('gbt', GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.02, random_state=1, max_depth=3))
])

cv = cross_val_score(pl, features, ds.survived, cv=3, scoring='roc_auc')
cv.mean(), cv.std()

  return f(*args, **kwds)


(0.8579350362126407, 0.019895577141853242)

# Spark ML GBT pipeline

In [7]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.sql import SparkSession
ss = SparkSession.builder.getOrCreate()

sdf = ss.read.csv('titanic.csv', header=True)

numCols = [
    'pclass', 'age', 'sibsp',
    'parch', 'fare', 'alone'
]

for col in numCols:
    sdf = sdf.withColumn(
        col, sdf[col].astype('decimal'))
    
sdf = sdf.withColumn(
    'survived', sdf['survived'].astype('int'))

categoricalCols =[
    'sex', 'embarked', 'class',
    'deck', 'who', 'embark_town'
]

indexers = [
    StringIndexer(
        inputCol=col,
        outputCol=col+'Idx',
        handleInvalid='skip')
    for col in categoricalCols
]

idxCols = [col+'Idx' for col in categoricalCols]

assembler = VectorAssembler(
    inputCols=idxCols + numCols,
    outputCol="features")

cl = GBTClassifier(
    labelCol="survived",
    maxIter=100, maxDepth=3, stepSize=0.02)

pl = Pipeline(stages=indexers + [assembler, cl])

sdf_fna = sdf.fillna(0).replace('', 'NA')
train_df, test_df = sdf_fna.randomSplit([0.7, 0.3])

m = pl.fit(train_df)

predictions = m.transform(test_df)

evaluator = BinaryClassificationEvaluator(
    labelCol="survived",
    rawPredictionCol="prediction",
    metricName="areaUnderROC"
)

evaluator.evaluate(predictions)

0.7666914143288964

# Spark ML LR pileline

In [4]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.sql import SparkSession
ss = SparkSession.builder.getOrCreate()

sdf = ss.read.csv('titanic.csv', header=True)

numCols = [
    'pclass', 'age', 'sibsp',
    'parch', 'fare', 'alone'
]

for col in numCols:
    sdf = sdf.withColumn(
        col, sdf[col].astype('decimal'))
    
sdf = sdf.withColumn(
    'survived', sdf['survived'].astype('int'))

categoricalCols =[
    'sex', 'embarked', 'class',
    'deck', 'who', 'embark_town'
]

indexers = [
    StringIndexer(
        inputCol=col,
        outputCol=col+'Idx',
        handleInvalid='skip')
    for col in categoricalCols
]

encoders = [
    OneHotEncoder(
        inputCol=col+'Idx',
        outputCol=col+'Bin')
    for col in categoricalCols
]

encCols = [col+'Bin' for col in categoricalCols]

assembler = VectorAssembler(
    inputCols=encCols + numCols,
    outputCol="features")

cl = LogisticRegression(labelCol="survived", maxIter=10, regParam=0.1)

pl = Pipeline(stages=indexers + encoders + [assembler, cl])

sdf_fna = sdf.fillna(0).replace('', 'NA')
train_df, test_df = sdf_fna.randomSplit([0.7, 0.3])

m = pl.fit(train_df)

predictions = m.transform(test_df)

evaluator = BinaryClassificationEvaluator(
    labelCol="survived",
    rawPredictionCol="prediction",
    metricName="areaUnderROC"
)

evaluator.evaluate(predictions)

0.7967857958437999

# Logistic regression feature importace for polynomial features

In [10]:
import numpy as np
import pandas as pd
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, PolynomialExpansion
from pyspark.ml.pipeline import Pipeline

from pyspark.sql import SparkSession
ss = SparkSession.builder.config('spark.driver.bindAddress', '127.0.0.1').getOrCreate()

sdf = ss.read.csv('titanic.csv', header=True)

# only 2-category features can be used without binarization
categoricalCols =['sex'] #,'embarked', 'class', 'deck', 'who', 'embark_town']

indexers = [
    StringIndexer(inputCol=col, outputCol=col+'Idx', handleInvalid='skip')
    for col in categoricalCols
]

idxCols = [col+'Idx' for col in categoricalCols]

numCols = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'alone']

for col in numCols:
    sdf = sdf.withColumn(col, sdf[col].astype('decimal'))
    
sdf = sdf.withColumn('survived', sdf['survived'].astype('int'))

assembler = VectorAssembler(
    inputCols=idxCols + numCols,
    outputCol="features")

pe = PolynomialExpansion(degree=2, inputCol='features', outputCol='features_p')

cl = LogisticRegression(featuresCol='features_p', labelCol="survived", maxIter=10, regParam=0.1)

pl = Pipeline(stages=indexers + [assembler, pe, cl])

m = pl.fit(sdf.fillna(0).replace('', 'NA'))

import pandas as pd

fnames = idxCols + numCols
pnames = [
    n+'*'+n2
    for i, n in zip(range(len(fnames)), fnames)
    for n2 in (['1']+fnames)[:i+2]]

weights = m.stages[-1].coefficients.array

pd.DataFrame(
    {'weights': weights, 'importance': np.abs(weights), 'names': pnames}
).sort_values('importance', ascending=False)[:10]

Unnamed: 0,importance,names,weights
0,0.746541,sexIdx*1,0.746541
1,0.746541,sexIdx*sexIdx,0.746541
2,0.190673,pclass*1,-0.190673
14,0.164902,parch*1,0.164902
3,0.109082,pclass*sexIdx,0.109082
15,0.080076,parch*sexIdx,-0.080076
10,0.0755,sibsp*sexIdx,-0.0755
4,0.06725,pclass*pclass,-0.06725
18,0.04098,parch*sibsp,-0.04098
11,0.031343,sibsp*pclass,-0.031343


# String Indexer output

In [20]:
from pyspark.ml.feature import StringIndexer
import numpy as np
from pyspark.ml.linalg import Vectors

from pyspark.sql import SparkSession
ss = SparkSession.builder.getOrCreate()

si = StringIndexer(inputCol='in', outputCol='out')

rows = [
    {'in': 'm'},
    {'in': 'm'},
    {'in': 'f'},
    {'in': 'f'},
    {'in': 'm'},
]

df = ss.createDataFrame(rows)

si.fit(df).transform(df).toPandas()

Unnamed: 0,in,out
0,m,0.0
1,m,0.0
2,f,1.0
3,f,1.0
4,m,0.0


# Polynomial features order

In [23]:
from pyspark.ml.feature import PolynomialExpansion
import numpy as np
from pyspark.ml.linalg import Vectors

from pyspark.sql import SparkSession
ss = SparkSession.builder.getOrCreate()

pe = PolynomialExpansion(degree=2, inputCol='in', outputCol='out')

rows = [
    {'in': Vectors.dense([2, 10, 20])},
]

df = ss.createDataFrame(rows)

pe.transform(df).collect()[0].out.toArray()

array([   2.,    4.,   10.,   20.,  100.,   20.,   40.,  200.,  400.])