# Split train & test sets

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('../data/features.csv')

In [3]:
df.head()

Unnamed: 0,YEAR_2014,YEAR_2015,YEAR_2016,YEAR_2017,YEAR_2018,EARLY_TRANSFER_N,EARLY_TRANSFER_Y,EARLY_CANX_LEFT_N,EARLY_CANX_LEFT_Y,EARLY_CANX_DEFER_N,...,NUM_MAY_TYPE_A,NUM_MAY_TYPE_B,NUM_MAY_TYPE_C,NUM_MAY_TYPE_D,NUM_MAY_TYPE_E,NUM_MAY_TYPE_S,NUM_MAY_TYPE_X,DAYS_TO_FIRST_CRM,PREV_NUM_CRM,RESULT
0,1,0,0,0,0,1,0,1,0,1,...,,,,,,,,,,P
1,1,0,0,0,0,1,0,1,0,1,...,,,,,,,,,,P
2,1,0,0,0,0,1,0,1,0,1,...,,,,,,,,,,P
3,0,0,1,0,0,1,0,1,0,1,...,,,,,,,,,,P
4,1,0,0,0,0,1,0,1,0,1,...,,,,,,,,,,F


In [4]:
'{:,}'.format(len(df))

'16,786'

In [5]:
df['RESULT'].value_counts()

P     13887
F      1924
0       789
F0      186
Name: RESULT, dtype: int64

In [6]:
subset = df[df['RESULT'].isin(['P', 'F'])]

In [7]:
'{:,}'.format(len(subset))

'15,811'

In [8]:
subset['RESULT'].value_counts()

P    13887
F     1924
Name: RESULT, dtype: int64

## Feature names

In [9]:
feature_names = list(subset.drop("RESULT", 1).columns)

In [10]:
len(feature_names)

888

In [11]:
feature_names[0]

'YEAR_2014'

In [12]:
with open('../data/feature_names.txt', 'w') as f:
    f.write('\n'.join(feature_names))

## Fill NAN

In [13]:
subset.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


## X and y

In [14]:
X = np.array(subset.drop("RESULT", 1)) # Feature Matrix

In [15]:
X.shape

(15811, 888)

In [16]:
# X[0]

In [17]:
y = np.array(subset['RESULT'])

In [18]:
y.shape

(15811,)

In [19]:
from collections import Counter

In [20]:
Counter(y)

Counter({'F': 1924, 'P': 13887})

In [36]:
1 - Counter(y)['F'] / float(len(y))

0.8783125672000506

In [22]:
yB = y == 'P'

In [23]:
Counter(yB)

Counter({False: 1924, True: 13887})

In [24]:
def get_X_y(dataframe):

    X = np.array(dataframe.drop("RESULT", 1)) # Feature Matrix
    
    y = np.array(dataframe['RESULT']) == 'P'
    
    return X, y

## Split

In [25]:
X, yB = get_X_y(subset)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, yB, test_size=0.20, random_state=42)

In [27]:
c = Counter(y_train)
print(c)
c[False] / float(len(y_train))

Counter({True: 11101, False: 1547})


0.12231182795698925

In [28]:
c = Counter(y_test)
print(c)
c[False] / float(len(y_test))

Counter({True: 2786, False: 377})


0.11919064179576351

## Save arrays

Save the arrays in binary format

In [29]:
np.save('../data/X_train.npy', X_train)

In [30]:
np.save('../data/X_test.npy', X_test)

In [31]:
np.save('../data/y_train.npy', y_train)

In [32]:
np.save('../data/y_test.npy', y_test)

## CV Modelling

In [33]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

In [34]:
scores = cross_val_score(RandomForestClassifier(), X, yB, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 1.00 (+/- 0.00)


In [35]:
scores = cross_val_score(LogisticRegression(), X, yB, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.88 (+/- 0.01)
