In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# Read the raw test split from <current working directory>/data/test.csv.
data_dir = os.path.join(os.getcwd(), 'data')
filr_test = os.path.join(data_dir, 'test.csv')
df_test = pd.read_csv(filr_test)

In [3]:
# Columns that the cells below fill in via imputation
# (df_test.info() lists the non-null count of every column).
problem_cols = [
    'Gender',
    'Dependents',
    'Self_Employed',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

In [4]:
# Encode Gender as 1 (Male) / 0 (Female); any other value (e.g. a missing
# entry) is passed through unchanged so it can be imputed afterwards.
gender_codes = {'Male': 1, 'Female': 0}
gender_binary = [gender_codes.get(raw, raw) for raw in df_test['Gender'].values]

In [5]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; sklearn.impute.SimpleImputer is its replacement. Importing it under
# the old name keeps every `Imputer(strategy=...)` call in this notebook working
# on both old and current scikit-learn releases.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20 has no sklearn.impute module
    from sklearn.preprocessing import Imputer

In [6]:
# One imputer per fill strategy: column mode, mean and median.
impute_mode, impute_mean, impute_median = (
    Imputer(strategy=strategy)
    for strategy in ('most_frequent', 'mean', 'median')
)

In [7]:
# Arrange the encoded values as a single column and fill the gaps using the
# most-frequent-value imputer.
gender_column = np.array(gender_binary).reshape(-1, 1)
gender_imputed = impute_mode.fit_transform(gender_column)

In [8]:
# Append the imputed gender as a new column; the original 'Gender' column is kept.
df_test = df_test.assign(**{'Gender_Imputed': gender_imputed})
df_test.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area',
       'Gender_Imputed'],
      dtype='object')

In [9]:
# Turn the Dependents labels into numbers ('3+' becomes 3); any other value
# (e.g. a missing entry) is passed through unchanged.
dependents_codes = {'0': 0, '1': 1, '2': 2, '3+': 3}
dependents = [dependents_codes.get(label, label) for label in df_test['Dependents'].values]

In [10]:
# Fill missing dependents counts with the most frequent value, then convert the
# numeric codes back into the original string labels.
deps_impute_1 = impute_mode.fit_transform(np.array(dependents).reshape(-1,1))

# fit_transform returns one row per record with a single column, so flatten it
# and look each scalar up; any value other than 0, 1 or 2 is labelled '3+'.
deps_labels = {0.0: '0', 1.0: '1', 2.0: '2'}
deps_impute_2 = [deps_labels.get(float(value), '3+') for value in deps_impute_1.ravel()]

In [11]:
# Append the imputed dependents labels as a new column next to the original one.
df_test = df_test.assign(**{'Dependent_Imputed': deps_impute_2})
df_test.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Imputed',
       'Dependent_Imputed'],
      dtype='object')

In [12]:
# Encode Self_Employed as 1 (Yes) / 0 (No); any other value (e.g. a missing
# entry) is passed through unchanged so the imputer can fill it.
employment_codes = {'Yes': 1, 'No': 0}
employment_binary = [employment_codes.get(x, x) for x in df_test['Self_Employed'].values]

# Replace the remaining gaps with the most frequent encoded value.
employment_imputed = impute_mode.fit_transform(np.array(employment_binary).reshape(-1,1))

In [13]:
# Append the imputed self-employment flag as a new column.
df_test = df_test.assign(**{'Self_Employed_Imputed': employment_imputed})
df_test.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Imputed',
       'Dependent_Imputed', 'Self_Employed_Imputed'],
      dtype='object')

In [14]:
# Fill missing loan amounts using the mean-strategy imputer.
loan_amount_column = np.reshape(df_test['LoanAmount'].values, (-1, 1))
loan_amount_imputed = impute_mean.fit_transform(loan_amount_column)

In [15]:
# Append the imputed loan amount as a new column.
df_test = df_test.assign(**{'LoanAmount_Imputed': loan_amount_imputed})
df_test.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Imputed',
       'Dependent_Imputed', 'Self_Employed_Imputed', 'LoanAmount_Imputed'],
      dtype='object')

In [16]:
# Fill missing loan terms with the most frequent term, then append the result
# as a new column.
loan_term_column = df_test['Loan_Amount_Term'].values.reshape(-1, 1)
loan_amount_term_imputed = impute_mode.fit_transform(loan_term_column)
df_test = df_test.assign(**{'Loan_Amount_Term_Imputed': loan_amount_term_imputed})
df_test.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Imputed',
       'Dependent_Imputed', 'Self_Employed_Imputed', 'LoanAmount_Imputed',
       'Loan_Amount_Term_Imputed'],
      dtype='object')

In [17]:
# Fill missing credit-history flags with the most frequent value, then append
# the result as a new column.
credit_history_column = df_test['Credit_History'].values.reshape(-1, 1)
credit_history_imputed = impute_mode.fit_transform(credit_history_column)
df_test = df_test.assign(**{'Credit_History_Imputed': credit_history_imputed})
df_test.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Imputed',
       'Dependent_Imputed', 'Self_Employed_Imputed', 'LoanAmount_Imputed',
       'Loan_Amount_Term_Imputed', 'Credit_History_Imputed'],
      dtype='object')

In [18]:
# Columns to export, in output order: the imputed version of a column is used
# wherever one was created above.
cols_of_interest = [
    'Loan_ID',
    'Gender_Imputed',
    'Married',
    'Dependent_Imputed',
    'Education',
    'Self_Employed_Imputed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount_Imputed',
    'Loan_Amount_Term_Imputed',
    'Credit_History_Imputed',
    'Property_Area',
]

# Output headers: the same names with the '_Imputed' suffix removed.
# NOTE(review): this yields 'Dependent', whereas the source column is named
# 'Dependents' — confirm which header consumers of the output expect.
cols_final = [name.replace('_Imputed', '') for name in cols_of_interest]

In [19]:
# Pull the selected columns out of the frame as a single NumPy array.
final_np = df_test.loc[:, cols_of_interest].values

In [20]:
# Rebuild a frame from the array and label it with the output headers.
# Every column comes back with object dtype (see the info() summary below).
final_df = pd.DataFrame(final_np)
final_df.columns = cols_final
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
Loan_ID              367 non-null object
Gender               367 non-null object
Married              367 non-null object
Dependent            367 non-null object
Education            367 non-null object
Self_Employed        367 non-null object
ApplicantIncome      367 non-null object
CoapplicantIncome    367 non-null object
LoanAmount           367 non-null object
Loan_Amount_Term     367 non-null object
Credit_History       367 non-null object
Property_Area        367 non-null object
dtypes: object(12)
memory usage: 34.5+ KB


In [21]:
# Write the cleaned frame to <current working directory>/data/test_2.csv,
# leaving the row index out of the file.
output_path = os.path.join(os.getcwd(), 'data', 'test_2.csv')
final_df.to_csv(output_path, index=False)