In [2]:
import pandas as pd
import numpy as np
import numbers, sklearn
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import svm, decomposition, preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc


In [3]:
TRAIN, TEST = 0, 1

print(sklearn.__version__)

0.17


In [4]:
print('Reading CSV Files...')
train = pd.read_csv('train_users_2.csv', header=0)
test = pd.read_csv('test_users.csv')
filled = [train.copy(), test.copy()]

Reading CSV Files...


In [5]:
print('Cleaning Data: Converting all entries to numerical type, filling NaNs, etc...')
le = preprocessing.LabelEncoder()
for i, df in enumerate(filled):
    for feature in [col for col in df.columns if col not in ['id', 'country_destination']]: #Excluding id
        if df[feature].dtype not in [int, float]:
            if 'date' in feature:
                df[feature] = pd.to_datetime(df[feature])
                df[feature] = (df[feature] - df[feature].min()) / np.timedelta64(1,'D')
            else:
                mapping = {value: i for i, value in enumerate([_ for _ in df[feature].unique() if _ is not np.nan])}
                mapping_full = lambda x: mapping[x] if x not in ['-unknown-', np.nan] else np.nan
                df[feature] = df[feature].map(mapping_full)
                #print(df[feature])
                #df[feature] = le.fit_transform(df[feature])
    df = df.fillna(-1)

Cleaning Data: Converting all entries to numerical type, filling NaNs, etc...
0         20090319043255
1         20090523174809
2         20090609231247
3         20091031060129
4         20091208061105
5         20100101215619
6         20100102012558
7         20100103191905
8         20100104004211
9         20100104023758
10        20100104194251
11        20100105051812
12        20100105060859
13        20100105083259
14        20100107055820
15        20100107204555
16        20100107215125
17        20100107224625
18        20100108015641
19        20100110010817
20        20100110152120
21        20100110220941
22        20100111031438
23        20100111224015
24        20100111230808
25        20100112131444
26        20100112155420
27        20100112205949
28        20100113044650
29        20100113064333
               ...      
213421    20140630231137
213422    20140630231246
213423    20140630231548
213424    20140630231859
213425    20140630232119
213426    201406302323

TypeError: unorderable types: str() > float()

In [None]:
print('Dropping columns of data entirely missing on at least one set...')
for col in filled[TEST].columns:
    if any(df[col].isnull().all() for df in filled):
        for i, df in enumerate(filled):
            filled[i] = filled[i].drop(col, axis=1)

In [None]:
print('Isolating train/test sets...')
X_train, y_train = filled[TRAIN].ix[:, 1:-1], filled[TRAIN].ix[:, -1]
X_test = filled[TEST].ix[:, 1:]

In [None]:
#model = LogisticRegression()
#model = Classifier(
#    layers=[
#        Layer("Rectifier", units=100),
#        Layer("Linear")],
#    learning_rate=0.02,
#    n_iter=10)
model = RandomForestClassifier(n_estimators=20)
model.fit(X_train, y_train)

print('Analyzing Logistic Regression...')
Disbursed_lg = model.predict_proba(X_test)
fpr, tpr, _ = roc_curve(model.predict(X_test), Disbursed_lg[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

In [None]:
print('Making prediction...')
y_test = pd.DataFrame(model.predict(X_test))
to_output = pd.concat([filled[TEST]['id'], y_test], axis=1)

In [None]:
print('Writing prediction to submission file...')
to_output.to_csv('submission.csv', index=False, header=['id', 'country'])