# Features

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Directory containing the input CSV read below. This is a relative path, so
# it is resolved against the kernel's current working directory.
data_path = '../../data/'

## Read data

In [3]:
# Load the raw CSV, using its 'id' column as the row index.
raw_csv_path = os.path.join(data_path, 'SF_Datathon_Data.csv')
df = pd.read_csv(raw_csv_path, index_col='id')

In [4]:
# Preview the first rows of the raw data to confirm the load worked.
df.head()

Unnamed: 0_level_0,Name,Age,Gender,Qualification_type,Job_Type,Race,GPA,Interviewed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Jody Tree,29,M,3,Marketing,White,3.7,True
2,Rowan Adan,59,M,2,Marketing,Latino,1.0,False
3,Caryl MacCall,56,M,1,Training,Other,4.0,False
4,Gordy Wornham,51,M,2,Business Development,Black,3.0,False
5,Abraham Feldfisher,46,M,1,Support,Latino,1.0,True


In [5]:
# List the column names available in the raw data.
df.columns

Index(['Name', 'Age', 'Gender', 'Qualification_type', 'Job_Type', 'Race',
       'GPA', 'Interviewed'],
      dtype='object')

In [6]:
# Inspect the dtype pandas inferred for each column (useful for spotting
# categories that were read as numbers).
df.dtypes

Name                   object
Age                     int64
Gender                 object
Qualification_type      int64
Job_Type               object
Race                   object
GPA                   float64
Interviewed              bool
dtype: object

In [7]:
# Columns copied into the feature table unchanged.
numeric_cols = ['Age', 'GPA']

In [8]:
# Columns that are one-hot encoded before being added to the feature table.
categorical_cols = ['Gender', 'Qualification_type', 'Job_Type', 'Race']

In [9]:
# Column(s) holding the label; copied into the feature table last.
target_cols = ['Interviewed']

In [10]:
# Start from an empty frame; the following cells add the feature columns to it.
df_features = pd.DataFrame()

In [11]:
# Numeric
# Each numeric column is carried over as-is; its name is logged as it is added.
for col_name in numeric_cols:
    print('Number: {}'.format(col_name))
    df_features[col_name] = df[col_name]

Number: Age
Number: GPA


In [12]:
# Categories
# One-hot encode every categorical column and append the resulting indicator
# columns (named '<column>_<value>') to the feature table.
dummy_frames = []
for col_name in categorical_cols:

    print('Category: {}'.format(col_name))

    ## One-hot encoding

    # Categories
    # The dtype is pinned so the indicators are 0/1 integers on every pandas
    # version; pandas >= 2.0 otherwise produces booleans, which would change
    # the values displayed below and written to the CSV.
    df_categories = pd.get_dummies(df[col_name], prefix=col_name, dtype='uint8')
    dummy_frames.append(df_categories)

# Append to features
# One concat for all columns instead of growing the frame inside the loop.
# Indicator columns left over from an earlier run of this cell are dropped
# first, so re-running the cell does not duplicate them.
if dummy_frames:
    df_dummies = pd.concat(dummy_frames, axis=1)
    df_features = pd.concat(
        [df_features.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
        axis=1,
    )

Category: Gender
Category: Qualification_type
Category: Job_Type
Category: Race


In [13]:
# Target
# The label column(s) go in last, so they end up as the right-most columns.
for col_name in target_cols:
    print('Target: {}'.format(col_name))
    df_features[col_name] = df[col_name]

Target: Interviewed


In [14]:
# Number of columns in the assembled table (features plus target).
df_features.shape[1]

28

In [15]:
# Full list of column names, in the order they will be written out.
df_features.columns.tolist()

['Age',
 'GPA',
 'Gender_F',
 'Gender_M',
 'Gender_X',
 'Qualification_type_1',
 'Qualification_type_2',
 'Qualification_type_3',
 'Qualification_type_4',
 'Qualification_type_5',
 'Job_Type_Accounting',
 'Job_Type_Business Development',
 'Job_Type_Engineering',
 'Job_Type_Human Resources',
 'Job_Type_Legal',
 'Job_Type_Marketing',
 'Job_Type_Product Management',
 'Job_Type_Research and Development',
 'Job_Type_Sales',
 'Job_Type_Services',
 'Job_Type_Support',
 'Job_Type_Training',
 'Race_Asian',
 'Race_Black',
 'Race_Latino',
 'Race_Other',
 'Race_White',
 'Interviewed']

In [16]:
# Preview the first rows of the assembled feature table.
df_features.head()

Unnamed: 0_level_0,Age,GPA,Gender_F,Gender_M,Gender_X,Qualification_type_1,Qualification_type_2,Qualification_type_3,Qualification_type_4,Qualification_type_5,...,Job_Type_Sales,Job_Type_Services,Job_Type_Support,Job_Type_Training,Race_Asian,Race_Black,Race_Latino,Race_Other,Race_White,Interviewed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,29,3.7,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,True
2,59,1.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,False
3,56,4.0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,False
4,51,3.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,False
5,46,1.0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,True


In [17]:
# Show the first record vertically so every column is visible.
df_features.iloc[0]

Age                                    29
GPA                                   3.7
Gender_F                                0
Gender_M                                1
Gender_X                                0
Qualification_type_1                    0
Qualification_type_2                    0
Qualification_type_3                    1
Qualification_type_4                    0
Qualification_type_5                    0
Job_Type_Accounting                     0
Job_Type_Business Development           0
Job_Type_Engineering                    0
Job_Type_Human Resources                0
Job_Type_Legal                          0
Job_Type_Marketing                      1
Job_Type_Product Management             0
Job_Type_Research and Development       0
Job_Type_Sales                          0
Job_Type_Services                       0
Job_Type_Support                        0
Job_Type_Training                       0
Race_Asian                              0
Race_Black                              0
Race_Latino                             0
Race_Other                              0
Race_White                              1
Interviewed                          True
Name: 1, dtype: object

In [18]:
# Write the feature table to features.csv inside data_path.
# - The path is derived from data_path rather than repeating the directory
#   literal, so changing data_path moves the input and the output together.
# - pandas writes the file itself, which avoids building the whole CSV as one
#   string and avoids doubled carriage returns when a CSV string is written
#   through a text-mode handle on Windows.
# - index=False: the 'id' index is not written to the file.
features_path = os.path.join(data_path, 'features.csv')
df_features.to_csv(features_path, index=False)

In [19]:
# Sanity check: count the lines in the written file (one header line plus one
# line per row of df_features). Requires a POSIX shell with `wc`.
!wc -l '../../data/features.csv'

7001 ../../data/features.csv


In [20]:
# Sanity check: show the header line and the first data row of the written file.
!head -2 '../../data/features.csv'

Age,GPA,Gender_F,Gender_M,Gender_X,Qualification_type_1,Qualification_type_2,Qualification_type_3,Qualification_type_4,Qualification_type_5,Job_Type_Accounting,Job_Type_Business Development,Job_Type_Engineering,Job_Type_Human Resources,Job_Type_Legal,Job_Type_Marketing,Job_Type_Product Management,Job_Type_Research and Development,Job_Type_Sales,Job_Type_Services,Job_Type_Support,Job_Type_Training,Race_Asian,Race_Black,Race_Latino,Race_Other,Race_White,Interviewed
29,3.7,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,True
