In [5]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark import SparkContext

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.mllib.stat import Statistics
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import Imputer
from pyspark.sql.types import StringType, DoubleType, IntegerType

from pyspark.sql.functions import when, count, col

# Create the Spark session for this notebook, or reuse one that already exists.
# The "local[*]" master runs Spark on this machine rather than on a cluster.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
# Load the three input tables from CSV. Each file's first row is treated as the
# header and column types are inferred from the data instead of being declared.
df_train = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv("../input/home-credit-default-risk/application_train.csv")
)
df_test = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv("../input/home-credit-default-risk/application_test.csv")
)
df_prevap = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv("../input/home-credit-default-risk/previous_application.csv")
)

In [26]:
def getcolumns(df):
    """Split the column names of ``df`` into string and numeric groups.

    Only ``StringType``, ``DoubleType`` and ``IntegerType`` fields are
    classified; columns of any other type appear in neither list.

    Args:
        df: Object exposing ``schema.fields``, where each field has a
            ``name`` and a ``dataType``.

    Returns:
        A ``(str_cols, num_cols)`` tuple of lists of column names.
        ``num_cols`` lists every double column first, followed by every
        integer column, each group in schema order.
    """
    string_names, double_names, integer_names = [], [], []
    for field in df.schema.fields:
        kind = field.dataType
        # Independent checks (not elif) so each type is tested on its own.
        if isinstance(kind, StringType):
            string_names.append(field.name)
        if isinstance(kind, DoubleType):
            double_names.append(field.name)
        if isinstance(kind, IntegerType):
            integer_names.append(field.name)
    return string_names, double_names + integer_names

In [31]:
def _impute_numeric(df):
    """Return ``df`` with its numeric columns passed through an Imputer fitted on ``df``."""
    numeric_cols = getcolumns(df)[1]
    if not numeric_cols:
        # Nothing to impute; avoid building an Imputer with no input columns.
        return df
    imputer = Imputer(inputCols=numeric_cols, outputCols=numeric_cols)
    # transform() returns a new DataFrame and leaves its input untouched, so
    # the result has to be handed back to the caller rather than discarded.
    return imputer.fit(df).transform(df)


def getfeatures(df_train, df_test, df_prevap):
    """Build the train and test feature frames.

    Steps, applied to both ``df_train`` and ``df_test``:

    1. Drop the columns listed in ``missing_60``.
    2. Impute the numeric columns (as reported by ``getcolumns``) in place,
       using an ``Imputer`` with its default settings fitted on that same
       frame. Train and test are fitted independently.
    3. Join with ``df_prevap`` on ``SK_ID_CURR``. Every ``df_prevap`` column
       whose name also occurs in the train or test frame is first renamed
       with a ``_prev`` suffix so the joined result has no duplicate names.

    Args:
        df_train: Training applications DataFrame.
        df_test: Test applications DataFrame.
        df_prevap: Previous-applications DataFrame; must contain
            ``SK_ID_CURR``.

    Returns:
        ``(df_train, df_test)`` after column pruning, imputation and the join.

    NOTE(review): the join uses ``DataFrame.join``'s default join type, so
    applicants without any row in ``df_prevap`` are presumably dropped and
    applicants with several previous applications appear once per match.
    Confirm this is intended before modelling on the result.
    """
    # The name suggests these are columns with roughly 60%+ missing values;
    # the list is hard-coded here and not derived from the data.
    missing_60 = ['COMMONAREA_AVG', 'COMMONAREA_MODE', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_AVG', 'FONDKAPREMONT_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAPARTMENTS_AVG', 'FLOORSMIN_MODE', 'FLOORSMIN_MEDI', 'FLOORSMIN_AVG', 'YEARS_BUILD_MODE', 'YEARS_BUILD_MEDI', 'YEARS_BUILD_AVG', 'OWN_CAR_AGE']

    # drop() with string names ignores columns that are absent from the frame,
    # so the same list can be applied to train and test.
    df_train = _impute_numeric(df_train.drop(*missing_60))
    df_test = _impute_numeric(df_test.drop(*missing_60))

    # Rename every previous-application column that would clash with a column
    # of either frame, so neither joined result has ambiguous column names.
    taken_names = set(df_train.columns) | set(df_test.columns)
    for name in df_prevap.columns:
        if name in taken_names:
            df_prevap = df_prevap.withColumnRenamed(name, name + '_prev')

    df_train = df_train.join(df_prevap, df_train.SK_ID_CURR == df_prevap.SK_ID_CURR_prev)
    df_test = df_test.join(df_prevap, df_test.SK_ID_CURR == df_prevap.SK_ID_CURR_prev)

    return df_train, df_test
    

In [33]:
df_train, df_test = getfeatures(df_train, df_test, df_prevap)