In [1]:
import numpy as np
import pandas as pd

### Import the data and turn the columns to lowercase to make them easier to work with

In [2]:
train = pd.read_csv('data/application_train.csv')

In [3]:
train.columns = map(str.lower, train.columns)

In [4]:
test = pd.read_csv('data/application_test.csv')

In [5]:
test.columns = map(str.lower, test.columns)

### Create copies of the data to use for the first model

In [6]:
train_1 = train.copy()
test_1 = test.copy()

In [7]:
train_1.shape

(307511, 122)

In [8]:
len(train_1)

307511

### Getting rid of the extra categorical values from the train dataset
                          (see EDA workbook)

#### First, find the indexes

In [10]:
delete_from_columns = ['code_gender', 'name_income_type', 'name_family_status']
values_to_delete = ['XNA', 'Maternity leave', 'Unknown']
train_indexes_to_delete = []
test_indexes_to_delete = []
for i, e in zip(delete_from_columns, values_to_delete):
    bad_train_data = train_1[i] == e
    train_indexes_to_delete.extend(train_1.index[bad_train_data].tolist())
    
    bad_test_data = test_1[i] == e
    test_indexes_to_delete.extend(test_1.index[bad_test_data].tolist())
print(f'Train indexes: {train_indexes_to_delete}')
print(f'Test indexes: {test_indexes_to_delete}')


Train indexes: [35657, 38566, 83382, 189640, 48949, 109612, 218269, 291432, 295458, 41982, 187348]


#### Check that we have the right rows
##### Note 'XNA', 'Maternity leave', and 'Unknown'

In [12]:
train_1.iloc[train_indexes_to_delete].loc[:,['code_gender', 'name_income_type', 'name_family_status']]

Unnamed: 0,code_gender,name_income_type,name_family_status
35657,XNA,Working,Married
38566,XNA,Working,Married
83382,XNA,Working,Married
189640,XNA,Commercial associate,Civil marriage
48949,M,Maternity leave,Married
109612,F,Maternity leave,Married
218269,F,Maternity leave,Married
291432,F,Maternity leave,Married
295458,F,Maternity leave,Married
41982,M,Commercial associate,Unknown


In [13]:
train_1.drop(train_indexes_to_delete, inplace=True)

### Getting rid of all the rows that have more than 50% null values

In [14]:
null_threshold = 0.5

column_list = train_1.isnull().sum(axis = 0) / train_1.count(axis = 0)

train_1 = train_1.loc[:,list(column_list.apply(lambda x: x < null_threshold))]

#column_list.apply(lambda x: x < 0.5).head(50)

### Dropping null values from the columns with categorical data

In [15]:
object_columns = [column for column in train_1.columns if train_1[column].dtype=='object']

# train_copy_1 = train_copy_1.loc[train_copy_1[object_columns].notnull()]
train_1.dropna(subset=object_columns, inplace=True)
# train_copy_1['amt_req_credit_bureau_hour'].value_counts()

In [16]:
train_1.shape

(210207, 73)

In [17]:
test_1.shape

(48744, 121)

In [18]:
# train_copy_1.dropna(inplace=True)

In [19]:
# train_copy_1.shape

In [20]:
# Create a list of the target values to use as labels
labels = train_1['target'].values

In [21]:
len(labels)

210207

In [22]:
# Need to make the columns the same in the training and testing datasets
train_columns = list(train_1.columns)
test_columns = list(test_1.columns)

In [23]:
for column in train_columns:
    if column not in test_columns:
#         if column == 'target':
#             pass
#         else:
        print(f'Dropped from train: {column}')
        train_1.drop(column, axis=1, inplace=True)
        
for column in test_columns:
    if column not in train_columns:
        print(f'Dropped from test: {column}')
        test_1.drop(column, axis=1, inplace=True)

Dropped from train: target
Dropped from test: own_car_age
Dropped from test: ext_source_1
Dropped from test: apartments_avg
Dropped from test: basementarea_avg
Dropped from test: years_beginexpluatation_avg
Dropped from test: years_build_avg
Dropped from test: commonarea_avg
Dropped from test: elevators_avg
Dropped from test: entrances_avg
Dropped from test: floorsmax_avg
Dropped from test: floorsmin_avg
Dropped from test: landarea_avg
Dropped from test: livingapartments_avg
Dropped from test: livingarea_avg
Dropped from test: nonlivingapartments_avg
Dropped from test: nonlivingarea_avg
Dropped from test: apartments_mode
Dropped from test: basementarea_mode
Dropped from test: years_beginexpluatation_mode
Dropped from test: years_build_mode
Dropped from test: commonarea_mode
Dropped from test: elevators_mode
Dropped from test: entrances_mode
Dropped from test: floorsmax_mode
Dropped from test: floorsmin_mode
Dropped from test: landarea_mode
Dropped from test: livingapartments_mode
Dropp

In [24]:
features = list(train_1.columns)
len(features)

72

In [25]:
train_1.shape

(210207, 72)

In [26]:
test_1.shape

(48744, 72)

In [27]:
train_1 = pd.get_dummies(train_1)

In [28]:
test_1 = pd.get_dummies(test_1)

In [29]:
train_1.shape

(210207, 179)

In [30]:
test_1.shape

(48744, 181)

In [31]:
train_columns2 = list(train_1.columns)
test_columns2 = list(test_1.columns)
for column in train_columns2:
    if column not in test_columns2:
#         if column == 'target':
#             pass
#         else:
        print(f'Dropped from train: {column}')
        train_1.drop(column, axis=1, inplace=True)
        
for column in test_columns2:
    if column not in train_columns2:
        print(f'Dropped from test: {column}')
        test_1.drop(column, axis=1, inplace=True)

Dropped from test: name_income_type_Unemployed
Dropped from test: organization_type_XNA


In [None]:
train_1.shape

In [None]:
test_1.shape

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(train_copy_1)
imputer.transform(train_copy_1)

imputer.fit(test_copy_1)
imputer.transform(test_copy_1)

In [None]:
# # For each column in the training dataset
# for column in train.columns:
# # If the column contains the object (categorical) classes
#     if train[column].dtype == 'object':
# # Create a set of numerical labels to represent the categories
#         label_encoder.fit(train[column])
#     # Then replace all of the categories with numerical labels
#         train[column] = label_encoder.transform(str(train[column]))
#     # and then (create separate columns??) I don't know what the binary matrix is for...
#         train[column] = to_categorical(train[column])
# # And do the same thing to the test set so that the columns are the same
#         test[column] = label_encoder.transform(test[column])
#         test[column] = to_categorical(test[column])
        
        
        
# #     print(column.title())
# #     print(train[column].dtype)

In [None]:
train_copy_1.head()

In [None]:
train_copy_1.dtypes.value_counts()

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
scaler.fit(train_copy_1)

In [None]:
scaler.transform(train_copy_1)
scaler.transform(test_copy_1)

In [None]:
train_copy_1.shape

In [None]:
test_copy_1.shape

In [None]:
train_copy_1.head()

In [None]:
test_copy_1.head()

In [None]:
test_copy_1.shape

In [None]:
# test_copy_1.dropna(inplace=True)
test_copy_1.isnull().sum()

In [None]:
print('Column  |   Nulls')
for column in list(test_copy_1.columns):
    if test_copy_1[column].isnull().sum() > 0:
        print(column,  '|',  test_copy_1[column].isnull().sum())

In [None]:
# Need to deal with these nulls in the test data
test_copy_1.fillna(test_copy_1.mean(), inplace=True)

#Rerun above script to check
print('Column  |   Nulls')
for column in list(test_copy_1.columns):
    if test_copy_1[column].isnull().sum() > 0:
        print(column,  '|',  test_copy_1[column].isnull().sum())

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
LRM = LogisticRegression()

In [None]:
print(len(train_copy_1))
print(len(labels))

In [None]:
LRM.fit(train_copy_1, labels)

In [None]:
LRM_prediction = LRM.predict_proba(test_copy_1)[:, 1]

In [None]:
solution_1 = test_copy_1[['sk_id_curr']]
solution_1['target'] = LRM_prediction

solution_1.head()
solution_1['target'].value_counts()