# Split train & test sets

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('../data/features.csv')

In [3]:
df.head()

Unnamed: 0,YEAR_2014,YEAR_2015,YEAR_2016,YEAR_2017,YEAR_2018,EARLY_TRANSFER_N,EARLY_TRANSFER_Y,EARLY_CANX_LEFT_N,EARLY_CANX_LEFT_Y,EARLY_CANX_DEFER_N,...,NUM_MAY_TYPE_C,NUM_MAY_TYPE_D,NUM_MAY_TYPE_E,NUM_MAY_TYPE_S,NUM_MAY_TYPE_X,DAYS_TO_FIRST_CRM,PREV_NUM_CRM,RESULT,Precision`Mark,Overal`Mark
0,1,0,0,0,0,1,0,1,0,1,...,,,,,,,,P,71.4,71.0
1,1,0,0,0,0,1,0,1,0,1,...,,,,,,,,P,68.0,68.0
2,1,0,0,0,0,1,0,1,0,1,...,,,,,,,,P,57.96,58.0
3,1,0,0,0,0,1,0,1,0,1,...,,,,,,,,P,63.58,64.0
4,1,0,0,0,0,1,0,1,0,1,...,,,,,,,,P,62.46,62.0


In [4]:
'{:,}'.format(len(df))

'16,799'

In [5]:
df['RESULT'].value_counts()

P     13898
F      1925
0       790
F0      186
Name: RESULT, dtype: int64

In [6]:
pd.cut(df['Precision`Mark'], bins=range(int(df['Precision`Mark'].min()), int(df['Precision`Mark'].max()), 10), 
             include_lowest=True).value_counts(dropna=False)

(50.0, 60.0]      5741
(60.0, 70.0]      5533
(40.0, 50.0]      1974
NaN               1174
(70.0, 80.0]      1125
(30.0, 40.0]       603
(20.0, 30.0]       277
(80.0, 90.0]       127
(-0.001, 10.0]     126
(10.0, 20.0]       119
Name: Precision`Mark, dtype: int64

## Feature names

In [7]:
feature_names = list(df.drop(
    ["RESULT", "Precision`Mark", "Overal`Mark"], 1).columns)

In [8]:
len(feature_names)

888

In [9]:
feature_names[0]

'YEAR_2014'

In [10]:
with open('../data/feature_names.txt', 'w') as f:
    f.write('\n'.join(feature_names))

## Fill NAN

In [11]:
df.fillna(0, inplace=True)

## X and y

In [12]:
X = np.array(df.drop(["RESULT", "Precision`Mark", "Overal`Mark"], 1)) # Feature Matrix

In [13]:
X.shape

(16799, 888)

In [14]:
# X[0]

In [15]:
y = np.array(df["Precision`Mark"])

In [16]:
y.shape

(16799,)

In [17]:
def get_X_y(dataframe):

    X = np.array(dataframe.drop(["RESULT", "Precision`Mark", "Overal`Mark"], 1)) # Feature Matrix
    
    y = np.array(dataframe["Precision`Mark"])
    
    return X, y

## Split

In [18]:
X, y = get_X_y(df)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Save arrays

Save the arrays in binary format

In [20]:
np.save('../data/X_train.npy', X_train)

In [21]:
np.save('../data/X_test.npy', X_test)

In [22]:
np.save('../data/y_train.npy', y_train)

In [23]:
np.save('../data/y_test.npy', y_test)

## CV Modelling

In [24]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

Passing as in getting a Precision Mark greater or equals to 40%:

In [25]:
yB = y >= 40

In [26]:
scores = cross_val_score(RandomForestClassifier(), X, yB, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.96 (+/- 0.00)


In [27]:
scores = cross_val_score(LogisticRegression(), X, yB, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.86 (+/- 0.01)
