In [1]:
%matplotlib inline

In [2]:
from __future__ import absolute_import, print_function, division

In [3]:
import os
import time

import numpy as np
import pandas as pd

In [38]:
# Load the data

def transform_dollars(income_str):
    """Convert a dollar-formatted string such as ``'$67,349'`` to a float.

    Parameters
    ----------
    income_str : str or any other object
        A currency string (optional ``$`` sign, ``,`` thousands separators,
        optional leading minus sign, optional surrounding whitespace), or a
        value that is not a string at all (e.g. a float or NaN).

    Returns
    -------
    float
        The numeric value when ``income_str`` is a string; NaN when the
        string contains nothing but whitespace / currency punctuation.
    object
        ``income_str`` itself, unchanged, when it is not a string.

    Raises
    ------
    ValueError
        If the string still is not a valid number after the ``$`` and ``,``
        characters are removed.
    """
    # `basestring` only exists on Python 2; fall back to `str` on Python 3 so
    # the function does not die with a NameError there.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(income_str, string_types):
        # Already numeric (or a missing value): pass it through untouched.
        return income_str

    # Remove the currency sign and thousands separators wherever they are,
    # instead of blindly dropping the first character: that silently ate the
    # leading digit of strings without '$' and broke on '-$1,500'.
    digits = income_str.strip().replace('$', '').replace(',', '')
    if not digits:
        return float('nan')
    return float(digits)

# Both CSV files are indexed by their INDEX column.
train_df = pd.read_csv('data/logit_insurance.csv', index_col='INDEX')
test_df = pd.read_csv('data/logit_insurance_test.csv', index_col='INDEX')

# The target column is removed from the test frame only; the training frame
# keeps it for fitting.
test_df.pop('TARGET_FLAG')

for df in (train_df, test_df):
    # TARGET_AMT is removed from both frames.
    df.pop('TARGET_AMT')
    # Dollar-formatted text columns are converted to plain floats.
    for column in ('INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM'):
        df[column] = df[column].map(transform_dollars)

In [39]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0_level_0,TARGET_FLAG,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,SEX,...,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CAR_AGE,URBANICITY
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,60.0,0,11.0,67349.0,No,0.0,z_No,M,...,14230.0,11,Minivan,yes,4461.0,2,No,3,18.0,Highly Urban/ Urban
2,0,0,43.0,0,11.0,91449.0,No,257252.0,z_No,M,...,14940.0,1,Minivan,yes,0.0,0,No,0,1.0,Highly Urban/ Urban
4,0,0,35.0,1,10.0,16039.0,No,124191.0,Yes,z_F,...,4010.0,4,z_SUV,no,38690.0,2,No,3,10.0,Highly Urban/ Urban
5,0,0,51.0,0,14.0,,No,306251.0,Yes,M,...,15440.0,7,Minivan,yes,0.0,0,No,0,6.0,Highly Urban/ Urban
6,0,0,50.0,0,,114986.0,No,243925.0,Yes,z_F,...,18000.0,1,z_SUV,no,19217.0,2,Yes,3,17.0,Highly Urban/ Urban


In [41]:
# Which columns are available to us, and their dtypes

test_df.dtypes

KIDSDRIV        int64
AGE           float64
HOMEKIDS        int64
YOJ           float64
INCOME        float64
PARENT1        object
HOME_VAL      float64
MSTATUS        object
SEX            object
EDUCATION      object
JOB            object
TRAVTIME        int64
CAR_USE        object
BLUEBOOK      float64
TIF             int64
CAR_TYPE       object
RED_CAR        object
OLDCLAIM      float64
CLM_FREQ        int64
REVOKED        object
MVR_PTS         int64
CAR_AGE       float64
URBANICITY     object
dtype: object

In [42]:
# Turn the categorical columns into binary indicator columns.
# NOTE: the two frames are encoded independently of each other; later cells
# index train_df2 with column names taken from test_df2, so both frames are
# expected to end up with the same indicator columns.
train_df2, test_df2 = (
    pd.get_dummies(frame, dummy_na=False) for frame in (train_df, test_df)
)

In [43]:
# Inspect the column dtypes of the encoded test frame.
test_df2.dtypes

KIDSDRIV                              int64
AGE                                 float64
HOMEKIDS                              int64
YOJ                                 float64
INCOME                              float64
HOME_VAL                            float64
TRAVTIME                              int64
BLUEBOOK                            float64
TIF                                   int64
OLDCLAIM                            float64
CLM_FREQ                              int64
MVR_PTS                               int64
CAR_AGE                             float64
PARENT1_No                          float64
PARENT1_Yes                         float64
MSTATUS_Yes                         float64
MSTATUS_z_No                        float64
SEX_M                               float64
SEX_z_F                             float64
EDUCATION_<High School              float64
EDUCATION_Bachelors                 float64
EDUCATION_Masters                   float64
EDUCATION_PhD                   

In [44]:
# Model inputs: every column of the encoded test frame, in column order.
features = test_df2.columns.tolist()

features

['KIDSDRIV',
 'AGE',
 'HOMEKIDS',
 'YOJ',
 'INCOME',
 'HOME_VAL',
 'TRAVTIME',
 'BLUEBOOK',
 'TIF',
 'OLDCLAIM',
 'CLM_FREQ',
 'MVR_PTS',
 'CAR_AGE',
 'PARENT1_No',
 'PARENT1_Yes',
 'MSTATUS_Yes',
 'MSTATUS_z_No',
 'SEX_M',
 'SEX_z_F',
 'EDUCATION_<High School',
 'EDUCATION_Bachelors',
 'EDUCATION_Masters',
 'EDUCATION_PhD',
 'EDUCATION_z_High School',
 'JOB_Clerical',
 'JOB_Doctor',
 'JOB_Home Maker',
 'JOB_Lawyer',
 'JOB_Manager',
 'JOB_Professional',
 'JOB_Student',
 'JOB_z_Blue Collar',
 'CAR_USE_Commercial',
 'CAR_USE_Private',
 'CAR_TYPE_Minivan',
 'CAR_TYPE_Panel Truck',
 'CAR_TYPE_Pickup',
 'CAR_TYPE_Sports Car',
 'CAR_TYPE_Van',
 'CAR_TYPE_z_SUV',
 'RED_CAR_no',
 'RED_CAR_yes',
 'REVOKED_No',
 'REVOKED_Yes',
 'URBANICITY_Highly Urban/ Urban',
 'URBANICITY_z_Highly Rural/ Rural']

In [None]:
# The most basic way of getting rid of missing values: replace every NaN with
# a sentinel value.
#
# The fill is applied to the encoded frames (train_df2 / test_df2) because
# those are the frames the model cells below actually read. They were derived
# from train_df / test_df in an earlier cell, so filling only the raw frames
# here would leave the NaNs in the model inputs on a fresh Run All.
# Assigning the result (instead of inplace=True) keeps the cell safe to re-run.
train_df2 = train_df2.fillna(-999.)
test_df2 = test_df2.fillna(-999.)

In [20]:
# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Prefer the new
# location and fall back to the old one only on legacy installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [34]:
# Model: standardise every feature, then fit a logistic regression.
est = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('est', LogisticRegression(n_jobs=-1)),
])

# Training inputs and target, taken from the encoded training frame.
X_train = train_df2[features]
y_train = train_df2['TARGET_FLAG']

# ROC AUC of the whole pipeline, cross-validated with cv=10.
crossval_results = cross_val_score(est, X_train, y_train,
                                   scoring='roc_auc', cv=10)

# Refit on the full training set; this fitted pipeline produces the
# submission predictions further down.
est.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [35]:
# Summarise the cross-validation scores as mean +/- standard deviation.
cv_mean = crossval_results.mean()
cv_std = crossval_results.std()
print("Cross validation results: {} +/- {}".format(cv_mean, cv_std))

Cross validation results: 0.808223118804 +/- 0.0165819358473


In [37]:
# Start from the provided submission template, indexed by INDEX.
submission = pd.read_csv('data/Prob_outkey_random.csv', index_col='INDEX')

# Use item assignment rather than attribute assignment: setting
# `submission.P_TARGET_FLAG` only updates the column if it already exists;
# otherwise pandas stores a plain attribute and the CSV would be written with
# the template's values unchanged.
# Column 1 of predict_proba is the probability of the second class
# (presumably TARGET_FLAG == 1 -- confirm against est.classes_).
submission['P_TARGET_FLAG'] = est.predict_proba(test_df2[features])[:, 1]

# NOTE(review): the timestamp contains ':' characters, which are not valid in
# Windows file names; the format is kept as-is to preserve existing naming.
filename = 'predictions/my_submission_{}.csv'.format(time.strftime('%Y-%m-%d_%H:%M:%S'))

# to_csv does not create missing directories, so make sure the target
# directory exists before writing.
output_dir = os.path.dirname(filename)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
submission.to_csv(filename)

print("Submission filename:", filename)

Submission filename: my_submission_2016-09-18_21:05:13.csv
