In [32]:
# -*- coding: utf-8 -*-
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import pandas as pd
import numpy as np

# Visualization helpers
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import missingno as msno

# Configure visualization
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# The matplotlib 'ggplot' style is applied first, then seaborn's 'white' style.
# NOTE(review): the later call presumably wins wherever both touch the same
# rcParams, so keep this order — confirm against the rendered figures.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size for plots that do not pass their own figsize.
pylab.rcParams['figure.figsize'] = 12,8

# --- Locations of the raw CSV tables (all relative to the notebook) ---------
# Numbered source tables.
enterprise_path = 'input/1entbase.csv'
alter_path = 'input/2alter.csv'
branch_path = 'input/3branch.csv'
invest_path = 'input/4invest.csv'
right_path = 'input/5right.csv'
project_path = 'input/6project.csv'
lawsuit_path = 'input/7lawsuit.csv'
breakfaith_path = 'input/8breakfaith.csv'
recruit_path = 'input/9recruit.csv'
# Labelled training EIDs and the unlabelled evaluation EIDs.
train_path = 'input/train.csv'
test_path = 'input/evaluation_public.csv'

# --- Read the tables used below ---------------------------------------------
# Only a subset is loaded here: the alter, branch, right and project paths are
# defined above but not read in this cell.
train , test = pd.read_csv( train_path ) , pd.read_csv( test_path )
enterprise = pd.read_csv( enterprise_path )
invest = pd.read_csv( invest_path )
lawsuit = pd.read_csv( lawsuit_path )
breakfaith = pd.read_csv( breakfaith_path )
recruit = pd.read_csv( recruit_path )

In [33]:
# Sanity check: report when no EID appears in both the train and test sets,
# then show the shape of each frame.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
if len( np.intersect1d( test.EID.values , train.EID.values ) ) == 0:
    print( "train and test is distinct" )
print( "train size is {}, test size is {}".format( train.shape , test.shape ) )

train and test is distinct
train size is (153006, 2), test size is (102124, 1)


### Helper Functions

In [41]:
def plot_histograms( df , variables , n_rows , n_cols ):
    """Draw one 10-bin histogram per variable on an n_rows x n_cols grid.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    variables : iterable of column names in ``df``; one subplot is created
        per name, in order.
    n_rows , n_cols : subplot grid dimensions passed to ``fig.add_subplot``.

    Each subplot title shows the column's skew (rounded to 2 decimals)
    followed by the column name, so the panels can be told apart even though
    the tick labels are hidden. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i , var_name in enumerate( variables ):
        ax = fig.add_subplot( n_rows , n_cols , i + 1 )
        df[ var_name ].hist( bins = 10 , ax = ax )
        # Round to 2 decimals: rounding with no ndigits collapsed every skew
        # to a whole number.
        skew = round( float( df[ var_name ].skew() ) , 2 )
        ax.set_title( 'Skew: ' + str( skew ) + ' ' + str( var_name ) )
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    """Plot shaded KDE curves of ``var``, one hue per value of ``target``.

    Optional keyword arguments ``row`` and ``col`` are forwarded to
    ``sns.FacetGrid`` to split the plot into facets; both default to None.
    The x axis is limited to the range 0 .. ``df[var].max()``.
    """
    grid = sns.FacetGrid(
        df ,
        hue = target ,
        aspect = 4 ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' )
    )
    grid.map( sns.kdeplot , var , shade = True )
    upper_bound = df[ var ].max()
    grid.set( xlim = ( 0 , upper_bound ) )
    grid.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    """Bar-plot ``target`` against the categorical column ``cat``.

    Optional keyword arguments ``row`` and ``col`` are forwarded to
    ``sns.FacetGrid`` to split the plot into facets; both default to None.
    """
    grid = sns.FacetGrid(
        df ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' )
    )
    grid.map( sns.barplot , cat , target )
    grid.add_legend()

def plot_correlation_map( df ):
    """Draw an annotated heatmap of the pairwise column correlations of ``df``.

    The correlation matrix comes from ``df.corr()`` and is rendered with
    ``sns.heatmap`` on a 12x10 figure using a diverging palette. Nothing is
    returned.
    """
    # Use the frame that was passed in; the previous body referenced a global
    # named `titanic`, which is not defined in this notebook.
    corr = df.corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr,
        cmap = cmap,
        square=True,
        cbar_kws={ 'shrink' : .9 },
        ax=ax,
        annot = True,
        annot_kws = { 'fontsize' : 12 }
    )

def describe_more( df ):
    """Summarise each column of ``df`` by its number of distinct values and dtype.

    Returns a DataFrame with one row per column of ``df`` and the columns
    'Variable' (column name), 'Levels' (count of distinct non-null values)
    and 'Datatype' (the column's dtype), sorted by 'Levels' in ascending
    order. The returned index keeps each column's original position.
    """
    names = list( df )
    levels = pd.DataFrame( {
        'Variable' : names ,
        # nunique() ignores missing values by default, matching the length of
        # a value_counts() result without relying on the deprecated top-level
        # pd.value_counts function.
        'Levels' : [ df[ name ].nunique() for name in names ] ,
        'Datatype' : [ df[ name ].dtypes for name in names ]
    } )
    return levels.sort_values( by = 'Levels' )

def plot_variable_importance( X , y ):
    """Fit a decision tree on ``X`` / ``y`` and plot its feature importances.

    The tree is built with ``random_state = 99`` and handed to
    ``plot_model_var_imp`` together with the same ``X`` and ``y``.
    """
    # NOTE(review): DecisionTreeClassifier is not imported in the cells shown
    # here — confirm an import (presumably from sklearn.tree) runs before use.
    fitted_tree = DecisionTreeClassifier( random_state = 99 )
    fitted_tree.fit( X , y )
    plot_model_var_imp( model = fitted_tree , X = X , y = y )
    
def plot_model_var_imp( model , X , y , top_n = 10 ):
    """Plot the ``top_n`` largest feature importances of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_`` and
        ``score( X , y )``.
    X : DataFrame whose columns label the importances.
    y : target values, passed only to ``model.score``.
    top_n : number of features to plot (default 10).

    Draws a horizontal bar chart and prints the value returned by
    ``model.score( X , y )``. Nothing is returned.
    """
    imp = pd.DataFrame(
        model.feature_importances_ ,
        columns = [ 'Importance' ] ,
        index = X.columns
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    # After an ascending sort the most important features are the LAST rows;
    # slicing the first rows plotted the least important ones whenever X had
    # more than 10 columns.
    imp.tail( top_n ).plot( kind = 'barh' )
    print( model.score( X , y ) )

#### 训练集中的0,1分布

In [34]:
# Count how many training rows carry each TARGET label.
train[ 'TARGET' ].value_counts()

0    123914
1     29092
Name: TARGET, dtype: int64

#### 观测项目和被执行数据
##### 观测项目

In [39]:
# Preview only the first rows instead of dumping the whole table.
breakfaith.head( 10 )

Unnamed: 0,EID,TYPECODE,FBDATE,SXENDDATE
0,10751,1270862,2015/3/1,
1,10751,1271420,2015/3/1,
2,10751,66052879,2015/3/1,
3,10751,66062576,2015/3/1,
4,26642,289401,2014/8/1,
5,26642,83521583,2014/8/1,
6,28204,536301,2014/9/1,
7,28204,81307437,2014/9/1,
8,66085,812594,2014/12/1,
9,66085,91060948,2014/12/1,


In [40]:
# Preview only the first rows instead of dumping the whole table.
lawsuit.head( 10 )

Unnamed: 0,EID,TYPECODE,LAWDATE,LAWAMOUNT
0,5986,104115771,2015-07-01,2700
1,5986,83486760,2014-06-01,88500
2,5986,76450675,2014-02-01,1202100
3,5986,97776391,2014-06-01,88500
4,5986,85054730,2014-02-01,1202100
5,10644,61087058,2012-12-01,90000
6,10751,66079746,2012-09-01,0
7,10751,60815833,2013-02-01,258500
8,12168,76452868,2014-03-01,41100
9,12168,81336086,2014-03-01,41100


In [37]:
# Attach the enterprise base attributes to train and test by EID.
# how='left' keeps every train/test row even when no enterprise row matches.
# NOTE: this cell rebinds train/test, so running it again without reloading
# them merges the enterprise columns a second time.
train = pd.merge(train, enterprise, on='EID',how='left')
test = pd.merge(test, enterprise, on='EID',how='left')
# The breakfaith merge is left disabled. breakfaith holds several rows per EID
# (see its preview above), so a plain left merge would duplicate rows.
# train = pd.merge(train, breakfaith, on='EID',how='left')
# test = pd.merge(test, breakfaith, on='EID',how='left')
# Preview only the first rows instead of dumping the whole merged frame.
train.head( 10 )

Unnamed: 0,EID,TARGET,RGYEAR,HY,ZCZB,ETYPE,MPNUM,INUM,FINZB,FSTINUM,TZINUM
0,309,0,2001,87,10.0,17,,2.0,,0.0,
1,356,0,2011,50,100.0,7,,1.0,,0.0,
2,383,0,1999,43,3.0,17,1.0,2.0,,1.0,
3,399,0,2011,75,50.0,7,1.0,1.0,,0.0,
4,619,0,2008,74,200.0,7,,2.0,,0.0,
5,724,0,2007,51,3.0,8,,1.0,,0.0,
6,926,1,2015,75,500.0,7,6.0,2.0,,5.0,
7,977,0,2015,79,50.0,7,4.0,2.0,,2.0,
8,1330,0,2002,80,30.0,17,1.0,2.0,,1.0,
9,1382,1,2014,72,2000.0,7,4.0,3.0,1000.000000,0.0,


In [31]:
# Preview only the first rows instead of dumping the whole table.
test.head( 10 )

Unnamed: 0,EID,RGYEAR,HY,ZCZB,ETYPE,MPNUM,INUM,FINZB,FSTINUM,TZINUM,TYPECODE,FBDATE,SXENDDATE
0,350,2010,75,100.0,6,6.0,2.0,20000.00,26.0,,,,
1,562,2015,74,100.0,7,5.0,2.0,,5.0,,,,
2,632,2014,75,100.0,7,2.0,3.0,,22.0,,,,
3,750,2012,52,200.0,17,1.0,1.0,,0.0,,,,
4,823,2014,72,100.0,7,2.0,2.0,,2.0,,,,
5,876,2013,83,10.0,16,,2.0,,0.0,,,,
6,987,1996,51,30.0,7,,4.0,,0.0,,,,
7,1048,2013,72,1000.0,7,,2.0,,1.0,,,,
8,1098,2014,51,7200.0,7,2.0,3.0,,4.0,,,,
9,1609,1999,51,3.0,17,,3.0,,0.0,,,,
