# Setting Pyspark

In [52]:
import findspark
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark.sql.functions as F
import pyspark.sql.types as T

from configs.config import DATA_SOURCE_DIR, PROJECT_DIR
from pyspark.sql import SparkSession
from train.transforms.utils import *
from train.transforms.splitting import stratified_splitting
from train.transforms.outliers import log_transformation
from train.transforms.categorical_data import * 
from train.transforms.categorical_data import *
from train.transforms.correlation import *

In [53]:
findspark.init()
findspark.find()

spark= SparkSession \
       .builder \
       .appName("Feature engineering 1") \
       .getOrCreate()

In [54]:
df = spark.read.csv(str(DATA_SOURCE_DIR),header=True,escape="\"")

In [55]:
df = cast_incorrect_column_type(df)

# Handle outliers
- Performing log transformation on some feature columns to reduce skewness

In [56]:
df = log_transformation(df, target_cols=["person_age", "person_income", "person_emp_exp", "loan_amnt", "loan_percent_income", "cb_person_cred_hist_length"])

# Transforming categorical data

In [57]:
categorical_cols = find_categorical_cols(df)
pipeline = onehot_encoding_pipeline(df, categorical_cols)
df = pipeline.transform(df)

In [61]:
for c in categorical_cols:
    df_final = df.select(c + "_encoded").rdd.map(lambda row: row[c + "_encoded"].toArray()).collect()

SyntaxError: cannot assign to function call here. Maybe you meant '==' instead of '='? (2456638434.py, line 2)

In [60]:
df_final

[array([0.]),
 array([1.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([1.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([1.]),
 array([0.]),
 array([0.]),
 array([1.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([1.]),
 array([0.]),
 array([1.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([1.]),
 array([1.]),
 array([0.]),
 array([1.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([1.]),
 array([1.]),
 array([0.]),
 array([1.]),
 array([1.]),
 array([1.]),
 array([1.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([0.]),
 array([1.]),
 array([1.]),
 array([1.]),
 array([1.]),
 array([0.]),
 array

In [59]:
columns_to_drop = categorical_cols + [col + "_index" for col in categorical_cols]
df_final = df_final.drop(*columns_to_drop)

AttributeError: 'list' object has no attribute 'drop'

In [50]:
df_pandas = df_final.toPandas()

In [51]:
df_pandas.head(5)

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status,person_gender_encoded,person_education_encoded,person_home_ownership_encoded,loan_intent_encoded,previous_loan_defaults_on_file_encoded
0,3.135494,11.183713,0.0,10.463132,16.02,0.398776,1.386294,561.0,1,(0.0),"(0.0, 0.0, 0.0, 1.0)","(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0)",(0.0)
1,3.091042,9.415971,0.0,6.908755,11.14,0.076961,1.098612,504.0,0,(0.0),"(0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0)",(1.0)
2,3.258097,9.428592,1.386294,8.612685,12.87,0.364643,1.386294,635.0,1,(0.0),"(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)",(0.0)
3,3.178054,11.286702,0.0,10.463132,15.23,0.364643,1.098612,675.0,1,(0.0),"(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)",(0.0)
4,3.218876,11.099469,0.693147,10.463132,14.27,0.425268,1.609438,586.0,1,(1.0),"(0.0, 0.0, 0.0, 1.0)","(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)",(0.0)


# Deal with correlation
- Using unsupervised method like Variance Threshold for Feature Selection

# Stratified Splitting

In [9]:
train_df, test_df, val_df = stratified_splitting(
    df_pandas, train_size=0.8, test_size=0.1, val_size=0.1, random_state=42
)

Confirm the proportion of class

In [10]:
test_stratified_sampling(
    df_pandas, train_df, test_df, val_df
)

The proportion of class in each set are the same


  source_df_class_proportion = source_df['loan_status'].value_counts()[0] / source_df['loan_status'].value_counts()[1]
  source_df_class_proportion = source_df['loan_status'].value_counts()[0] / source_df['loan_status'].value_counts()[1]
  train_df_class_proportion = train_df['loan_status'].value_counts()[0] / train_df['loan_status'].value_counts()[1]
  train_df_class_proportion = train_df['loan_status'].value_counts()[0] / train_df['loan_status'].value_counts()[1]
  val_df_class_proportion = val_df['loan_status'].value_counts()[0] / val_df['loan_status'].value_counts()[1]
  val_df_class_proportion = val_df['loan_status'].value_counts()[0] / val_df['loan_status'].value_counts()[1]
  test_df_class_proportion = test_df['loan_status'].value_counts()[0] / test_df['loan_status'].value_counts()[1]
  test_df_class_proportion = test_df['loan_status'].value_counts()[0] / test_df['loan_status'].value_counts()[1]


# Save as HDFS file

In [12]:
save_to_parquet(train_df, str(PROJECT_DIR / "data" / "features" / "feature_engineering_1.h5"), key="train", mode="w")
save_to_parquet(test_df, str(PROJECT_DIR / "data" / "features" / "feature_engineering_1.h5"), key="test")
save_to_parquet(val_df, str(PROJECT_DIR / "data" / "features" / "feature_engineering_1.h5"), key="val")