# Feature Evaluation

We will add different features to the baseline model and see how they affect performance.

In [2]:
import pandas as pd
import GeneralModel as gm

In [3]:
# Load the pre-built feature table that every model cell below reads from.
merged = pd.read_csv(filepath_or_buffer='../../DataPlus/feature_dataframe.csv')

This function builds the model's dataframe and then trains a model on it.

In [9]:
def df_and_model(df, cont_vars=None, cat_vars=None, algorithm='svm'):
    """Build the modelling dataframe from ``df`` and train one model on it.

    Parameters
    ----------
    df : object
        Source data, forwarded unchanged to ``gm.prepare_df``.
    cont_vars : list of str, optional
        Column names forwarded as the second argument of ``gm.prepare_df``
        (presumably the continuous features -- confirm in GeneralModel).
        ``None`` selects the baseline ``['age']``.
    cat_vars : list of str, optional
        Column names forwarded as the third argument of ``gm.prepare_df``
        (presumably the categorical features -- confirm in GeneralModel).
        ``None`` selects the baseline ``['gleason']``.
    algorithm : str
        Forwarded to ``gm.general_model``; this notebook uses ``'svm'``,
        ``'rf'`` and ``'lr'``.

    Returns
    -------
    None
        The result of ``gm.general_model`` is not returned.
    """
    # Build the baseline lists per call. List literals in the signature are
    # created once and shared by every call, so an in-place change made by a
    # callee would silently alter the baseline for all later cells.
    if cont_vars is None:
        cont_vars = ['age']
    if cat_vars is None:
        cat_vars = ['gleason']

    model_df = gm.prepare_df(df, cont_vars, cat_vars)
    gm.general_model(model_df, algorithm=algorithm, print_results=False)

## Baseline Model

In [10]:
# Baseline run: every argument except the data is left at its default.
df_and_model(df=merged)

# of Data Points: 392



F-score: 0.689
AUC: 0.744


In [11]:
# Baseline features again, changing only the algorithm to 'rf'.
df_and_model(df=merged, algorithm='rf')

# of Data Points: 392


  Feature   Weight
0     age  0.48094
1       7  0.51906


F-score: 0.62
AUC: 0.699


In [12]:
# Baseline features again, changing only the algorithm to 'lr'.
df_and_model(df=merged, algorithm='lr')

# of Data Points: 392


  Feature    Weight
0     age  0.539089
1       7 -2.434898


F-score: 0.68
AUC: 0.739


## Adding Features

### Education

In [13]:
# Add the 'edu_binary' column ahead of 'gleason'; algorithm 'rf'.
df_and_model(
    df=merged,
    cat_vars=['edu_binary', 'gleason'],
    algorithm='rf',
)

# of Data Points: 392


             Feature    Weight
0                age  0.513879
1  No College Degree  0.034012
2                  7  0.452109


F-score: 0.61
AUC: 0.688


### Marital Status

In [14]:
# Add the 'marry_binary' column ahead of 'gleason'; algorithm 'rf'.
df_and_model(
    df=merged,
    cat_vars=['marry_binary', 'gleason'],
    algorithm='rf',
)

# of Data Points: 390


       Feature    Weight
0          age  0.555770
1  Not Married  0.026372
2            7  0.417858


F-score: 0.638
AUC: 0.709


### Race

In [16]:
# Add the 'white_binary' column ahead of 'gleason'; algorithm 'rf'.
df_and_model(
    df=merged,
    cat_vars=['white_binary', 'gleason'],
    algorithm='rf',
)

# of Data Points: 392


  Feature    Weight
0     age  0.554665
1   White  0.021104
2       7  0.424232


F-score: 0.611
AUC: 0.693


### Doctor Respect

In [19]:
# Add the 'mdrespme' column ahead of 'gleason'; algorithm 'rf'.
df_and_model(
    df=merged,
    cat_vars=['mdrespme', 'gleason'],
    algorithm='rf',
)

# of Data Points: 349


  Feature    Weight
0     age  0.564022
1       1  0.004740
2       2  0.010523
3       3  0.014336
4       4  0.024011
5       5  0.027565
6       7  0.354803


F-score: 0.625
AUC: 0.705


### Doctor Advice

In [20]:
# Add the 'Advice1' column ahead of 'gleason'; algorithm 'rf'.
df_and_model(
    df=merged,
    cat_vars=['Advice1', 'gleason'],
    algorithm='rf',
)

# of Data Points: 359


  Feature    Weight
0     age  0.519543
1      AR  0.008320
2      AS  0.010033
3     ASR  0.065730
4       R  0.019539
5       S  0.042543
6      SR  0.022415
7       7  0.311877


F-score: 0.602
AUC: 0.683


In [21]:
# 'Advice1' on its own: cont_vars is emptied and 'gleason' is left out.
df_and_model(
    df=merged,
    cont_vars=[],
    cat_vars=['Advice1'],
    algorithm='rf',
)

# of Data Points: 359


  Feature    Weight
0      AR  0.040915
1      AS  0.147067
2     ASR  0.578227
3       R  0.033355
4       S  0.145579
5      SR  0.054856


F-score: 0.436
AUC: 0.61


### Dataset

In [22]:
# Add the 'dataset' column ahead of 'gleason'; algorithm 'rf'.
df_and_model(
    df=merged,
    cat_vars=['dataset', 'gleason'],
    algorithm='rf',
)

# of Data Points: 392


  Feature    Weight
0     age  0.529629
1      va  0.069167
2       7  0.401204


F-score: 0.642
AUC: 0.708


### Without Baseline Features

In [30]:
# All added columns together, with both baseline features ('age', 'gleason') removed.
df_and_model(
    df=merged,
    cont_vars=[],
    cat_vars=[
        'edu_binary',
        'marry_binary',
        'white_binary',
        'dataset',
        'Advice1',
    ],
    algorithm='rf',
)

# of Data Points: 358


             Feature    Weight
0  No College Degree  0.076940
1        Not Married  0.055142
2              White  0.063698
3                 va  0.176458
4                 AR  0.017206
5                 AS  0.028914
6                ASR  0.109518
7                  R  0.138727
8                  S  0.144127
9                 SR  0.189270


F-score: 0.681
AUC: 0.745


### Without Baseline Features or Advice

In [31]:
# Same as the previous run but with 'Advice1' dropped as well.
df_and_model(
    df=merged,
    cont_vars=[],
    cat_vars=[
        'edu_binary',
        'marry_binary',
        'white_binary',
        'dataset',
    ],
    algorithm='rf',
)

# of Data Points: 390


             Feature    Weight
0  No College Degree  0.225938
1        Not Married  0.193699
2              White  0.158999
3                 va  0.421364


F-score: nan
AUC: 0.495


**TODO:** Find out how to compute feature importance for categorical variables and graph the results.