In [1]:
import datetime
import os
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
warnings.filterwarnings('ignore')

# 用convert_dtypes()读取数据

In [2]:
# Read both raw CSVs and let pandas pick nullable extension dtypes for them;
# convert_string=False leaves text columns as plain object dtype.
train, test = (
    pd.read_csv(csv_path).convert_dtypes(convert_string=False)
    for csv_path in ('./input/train.csv', './input/testA.csv')
)

## 定义全局变量

# 先拼接 train 和 test
本来想封装一个函数来做拼接，因为 concat 似乎有个问题：会丢失 dtype（lose dtype）？
复盘了一下 M5，感觉 M5-simple-fe 里的 def merge-by-concat 不太看得懂。

### 防呆：这里新增一列 'flag' 标记数据来源，并把 test 的 y（'isDefault'）设为 -1

In [3]:
# train.shape  # (800000, 48) & len(train)=800000
# test.shape # (200000, 47)

In [4]:
# Show every column of wide frames, but cap printed rows at 20.
pd.options.display.max_columns = None
pd.options.display.max_rows = 20

In [5]:
TARGET = 'isDefault'
# Tag every row with its origin so the combined frame can be split again later.
train['flag'] = 'train'
test['flag'] = 'test'
# The test file has no label column; -1 is used as a placeholder value.
test[TARGET] = -1
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it the row labels of train and test are both kept, so labels would
# repeat and label-based lookups (df.loc[i]) could return two rows.
df = pd.concat([train, test], ignore_index=True)
# df

In [6]:
# df.info()

### drop policyCode，并把 flag 转成 'string' 类型（astype('string')）

In [7]:
# Remove the policyCode column. Reassigning (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run after the column is gone.
df = df.drop(columns='policyCode', errors='ignore')

In [8]:
# Store the train/test marker with pandas' dedicated 'string' dtype.
df['flag'] = df['flag'].astype('string')

### issueDate

#### 这里如果string能读datetime吗? 

#### 这里有个全局变量startdate小心

In [9]:
# Convert issueDate to datetime. The earliest issue date in train was checked to
# be 2007-06-01, so that day is used as the origin for the issueDateDT feature.
# Caveat noted by the author: counting days from a date yields large values.

df['issueDate'] = pd.to_datetime(df['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
# Time feature: whole days elapsed since startdate. The subtraction is done on
# the whole column at once rather than through a per-row .apply(lambda ...).
df['issueDateDT'] = (df['issueDate'] - startdate).dt.days

In [10]:
# Lift the row cap so that printed frames/series are shown in full from here on.
pd.options.display.max_rows = None

In [11]:
# df.dtypes   # issueDate datetime64[ns]  '<M8[ns]'

### employmentLength：去掉 "years" 后缀，"10+ years" 转成 10，"< 1 year" 转成 0

In [12]:
def employmentLength_to_int(s):
    """Turn an employment-length label such as ``'3 years'`` into ``np.int8(3)``.

    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged. Otherwise the first whitespace-separated token of ``s`` is
    parsed as an 8-bit integer.
    """
    if not pd.isnull(s):
        leading_token = s.split(maxsplit=1)[0]
        return np.int8(leading_token)
    return s

# Map the two non-numeric labels onto the regular "<n> years" form, then keep
# only the leading number of years.
# The result is assigned back to the column: calling replace(inplace=True) on
# df['employmentLength'] mutates a temporary selection, which does not update
# df when pandas Copy-on-Write is active.
df['employmentLength'] = (
    df['employmentLength']
    .replace({'10+ years': '10 years', '< 1 year': '0 years'})
    .apply(employmentLength_to_int)
)

In [13]:
# Frequency of each cleaned value, ordered by value; dropna=False keeps the
# missing entries in the count.
df['employmentLength'].value_counts(dropna=False).sort_index()

0.0      80226
1.0      65671
2.0      90565
3.0      80163
4.0      59818
5.0      62645
6.0      46582
7.0      44230
8.0      45168
9.0      37866
10.0    328525
NaN      58541
Name: employmentLength, dtype: int64

#### 这里因为有 NaN，列变成了 float，转成可空整型 Int8

In [14]:
# Cast the cleaned employmentLength values (0-10 plus missing) to the nullable
# Int8 dtype. The source column must be employmentLength itself; reading from
# employmentTitle here would overwrite the feature with unrelated data.
df['employmentLength'] = df['employmentLength'].astype('Int8')

In [15]:
# df['employmentTitle'].dtypes  # Int8

### earliesCreditLine 转datetime

#### 这个现在正常了，就是值有点多，不知道要怎么利用抽取信息做特征

之前用excel打开，会变成乱码

In [16]:
# Peek at ten randomly drawn raw values (no random_state is passed, so the
# selected rows differ from run to run).
df['earliesCreditLine'].sample(10)

607373    Jan-2007
54171     Jun-1991
116320    Sep-1996
24492     Apr-1989
158107    Aug-1991
528601    Aug-2005
339666    Dec-1997
84668     Dec-2002
376532    May-2000
446844    Oct-2005
Name: earliesCreditLine, dtype: object

In [None]:
# dw's version extracts only the year here; I would rather keep the original
# information (which also makes it possible to describe() the column), so that
# approach is not used for now.
# Parse "Mon-YYYY" strings such as "Jan-2007" into datetimes.
df['earliesCreditLine'] = pd.to_datetime(df['earliesCreditLine'],format='%b-%Y')

In [None]:
# Make sure no column is hidden, then summarise every column of train
# (numeric and non-numeric alike).
pd.options.display.max_columns = None
train.describe(include='all')

In [None]:
# df.info()

#### 压缩一下内存

In [None]:
import psutil

## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    """Return the resident memory (RSS) of the current process in GiB.

    The value is rounded to two decimals. Requires ``os`` (process id lookup)
    and ``psutil`` (memory statistics) to be imported.
    """
    # .rss is the first field of psutil's memory_info() result, named here
    # instead of being read by position.
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return np.round(rss_bytes / 2.**30, 2)

def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary prefixes.

    ``num`` is divided by 1024 until its magnitude drops below 1024, stepping
    through the prefixes '', Ki, Mi, ... Zi; anything still too large after Zi
    is reported in Yi. ``suffix`` is appended after the prefix.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    step = 0
    # "not (... < ...)" rather than ">=" so that NaN keeps being scaled,
    # exactly like a failed "<" test would.
    while step < len(prefixes) and not (abs(num) < 1024.0):
        num /= 1024.0
        step += 1
    if step == len(prefixes):
        return "%.1f%s%s" % (num, 'Yi', suffix)
    return "%3.1f%s%s" % (num, prefixes[step], suffix)

In [None]:
# Adapted memory reducer: the integer handling targets pandas' nullable
# "Int*" dtypes.
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds them.

    Only columns whose dtype is one of Int16/Int32/Int64 (nullable integers)
    or float16/float32/float64 are touched:

    * nullable integers become Int8, Int16 or Int32 when the column's min and
      max fit that type's range (bounds inclusive); otherwise they are left
      as they are;
    * floats become float16 or float32 when the value range fits, else float64.
      Only the range is checked, so float16/float32 can lose precision.

    Columns whose min or max is missing (for example all-NA columns) are
    skipped, because there is no range to test.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the resulting memory usage and the percentage saved.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    numerics = ['Int16', 'Int32', 'Int64', 'float16', 'float32', 'float64']
    int_targets = (('Int8', np.int8), ('Int16', np.int16), ('Int32', np.int32))
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        # Compare dtype *names*: this avoids relying on how numpy/pandas dtype
        # objects compare against arbitrary strings.
        col_type = str(df[col].dtype)
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # An all-NA nullable column yields pd.NA here; comparing pd.NA with a
        # number has no truth value and would raise TypeError.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if col_type.startswith('Int'):
            for target_name, np_type in int_targets:
                limits = np.iinfo(np_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target_name)
                    break
            # No break: the values need 64 bits, so the column stays as it is.
        else:
            for np_type in float_targets:
                limits = np.finfo(np_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(np_type)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Size of the combined frame before downcasting. The label says "Original df",
# so measure df (the frame reduced in the next cell) rather than train.
print("{:>20}: {:>8}".format('Original df',sizeof_fmt(df.memory_usage(index=True).sum())))

In [None]:
# Downcast the numeric columns of the combined frame; the function prints the
# resulting memory usage and the percentage saved.
df = reduce_mem_usage(df)

In [None]:
# Check whether the column dtypes actually changed.
df.info()

In [None]:
# Preview the first rows of the processed frame.
df.head()

### categorical:  grade

#### 保存

In [None]:
# Save the processed frame as CSV in the working directory.
# NOTE(review): with pandas' default settings the index is written as the first
# column, and CSV does not store dtypes, so the dtypes set above have to be
# re-applied after reading this file back - confirm that is acceptable.
df.to_csv('cf-df.csv')

In [None]:
import pickle
# Also save the processed frame as a pickle file in the working directory.
df.to_pickle('cf-df.pkl')

In [None]:
# df = pd.read_pickle('cf-df.pkl')

In [None]:
# Split the combined frame back into its two parts using the 'flag' column that
# was added before concatenation, instead of a hard-coded row count that
# silently breaks if the input files change size.
train = df[df['flag'] == 'train']
test = df[df['flag'] == 'test']
# Every row must belong to exactly one of the two parts.
assert len(train) + len(test) == len(df)

In [None]:
# Preview the first rows of the test part after the split.
test.head()

In [None]:
# Persist the processed training part.
train.to_pickle('cf-train.pkl')

In [None]:
# Persist the processed test part (built from testA.csv).
test.to_pickle('cf-testa.pkl')

In [None]:
# cat_fea = ['grade', 'subGrade', 'employmentTitle', 'verificationStatus', \
#           'purpose', 'postCode', 'regionCode','applicationType','initialListStatus', 'title', 'policyCode']

In [None]:
# cols = ['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
#        'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
#        'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
#        'purpose', 'postCode', 'regionCode', 'dti', 'detlinquency_2years',
#        'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
#        'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
#        'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
#        'policyCode', 'n0', 'n1', 'n2', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9',
#        'n10', 'n11', 'n12', 'n13', 'n14']

In [None]:
# num_fea = [a for a in cols if a not in cat_fea]
# num_fea

In [None]:
# ## 定义了一个统计函数，方便后续信息统计
# def Sta_inf(data):
#     print('_min',np.min(data))
#     print('_max:',np.max(data))
#     print('_mean',np.mean(data))
#     print('_ptp',np.ptp(data))
#     print('_std',np.std(data))
#     print('_var',np.var(data))