# Capstone Project

In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

In [2]:
# Read the training table from disk into a DataFrame
df = pd.read_csv('../data/train.csv')

# Not used in this cell; presumably a seed for later randomized steps — TODO confirm
random_state = 42

In [3]:
# Preview the data: plain-text dump of the first 5 rows
print(df.head(n=5))

   ACTION  RESOURCE  MGR_ID  ROLE_ROLLUP_1  ROLE_ROLLUP_2  ROLE_DEPTNAME  \
0       1     39353   85475         117961         118300         123472   
1       1     17183    1540         117961         118343         123125   
2       1     36724   14457         118219         118220         117884   
3       1     36135    5396         117961         118343         119993   
4       1     42680    5905         117929         117930         119569   

   ROLE_TITLE  ROLE_FAMILY_DESC  ROLE_FAMILY  ROLE_CODE  
0      117905            117906       290919     117908  
1      118536            118536       308574     118539  
2      117879            267952        19721     117880  
3      118321            240983       290919     118322  
4      119323            123932        19793     119325  


In [4]:
# Print a structural summary of the dataset: index range, per-column
# non-null counts and dtypes, and total memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32769 entries, 0 to 32768
Data columns (total 10 columns):
ACTION              32769 non-null int64
RESOURCE            32769 non-null int64
MGR_ID              32769 non-null int64
ROLE_ROLLUP_1       32769 non-null int64
ROLE_ROLLUP_2       32769 non-null int64
ROLE_DEPTNAME       32769 non-null int64
ROLE_TITLE          32769 non-null int64
ROLE_FAMILY_DESC    32769 non-null int64
ROLE_FAMILY         32769 non-null int64
ROLE_CODE           32769 non-null int64
dtypes: int64(10)
memory usage: 2.5 MB


In [13]:
# For every column (ACTION included), print how often each distinct value occurs.
# For long tables pandas truncates the printout and reports the number of
# distinct values as "Length" in the footer.
for attr in df.columns:
    print('\n')  # blank lines to separate consecutive tables
    attr_value_counts = df[attr].value_counts()
    print(attr_value_counts)



1    30872
0     1897
Name: ACTION, dtype: int64


4675      839
79092     484
25993     409
75078     409
3853      404
6977      299
75834     299
32270     295
42085     247
17308     239
1020      236
13878     220
42093     204
18418     192
7543      186
23921     167
278393    163
34924     161
79121     157
28149     137
18072     136
20364     135
39262     129
14354     127
23096     126
75901     115
15064     113
33054     108
20897     104
33642      99
         ... 
35046       1
92378       1
100413      1
79728       1
79792       1
36894       1
28635       1
59370       1
45057       1
30629       1
18462       1
38809       1
77905       1
42879       1
75631       1
39012       1
30565       1
18275       1
79693       1
97530       1
28793       1
18243       1
20290       1
26407       1
43202       1
35014       1
75535       1
30936       1
89856       1
16376       1
Name: RESOURCE, Length: 7518, dtype: int64


770       152
2270       99
2594       82
1350  

In [19]:
# Separate the ACTION column (target) from all remaining columns (features)
target = df['ACTION']
features = df.drop(labels=['ACTION'], axis='columns')

# For the target first and the features second, display the
# first 5 rows followed by the shape
for part in (target, features):
    print(part.head())
    print(part.shape)

0    1
1    1
2    1
3    1
4    1
Name: ACTION, dtype: int64
(32769,)
   RESOURCE  MGR_ID  ROLE_ROLLUP_1  ROLE_ROLLUP_2  ROLE_DEPTNAME  ROLE_TITLE  \
0     39353   85475         117961         118300         123472      117905   
1     17183    1540         117961         118343         123125      118536   
2     36724   14457         118219         118220         117884      117879   
3     36135    5396         117961         118343         119993      118321   
4     42680    5905         117929         117930         119569      119323   

   ROLE_FAMILY_DESC  ROLE_FAMILY  ROLE_CODE  
0            117906       290919     117908  
1            118536       308574     118539  
2            267952        19721     117880  
3            240983       290919     118322  
4            123932        19793     119325  
(32769, 9)
