# Machine Learning
## Packages for the day
pip install numpy
pip install scipy
pip install scikit-learn
pip install category_encoders
pip install matplotlib
pip install pandas


### Where to start?
The data we get is rarely the exact same as the data we will use as input to a model. 

In [1]:
#import our packages
#Our Packages
import numpy as np
import sklearn
import matplotlib
import pandas as pd

#import our data
clinical_df = pd.read_csv('https://raw.githubusercontent.com/fredhutchio/R_intro/master/extra/clinical.csv')

In [2]:
#what data is numeric? describe() gives basic summary stats of numeric info
clinical_df.describe()

Unnamed: 0,age_at_diagnosis,days_to_death,days_to_birth,days_to_last_follow_up,cigarettes_per_day,years_smoked,year_of_birth,year_of_death
count,6718.0,2187.0,6718.0,5714.0,1171.0,448.0,6662.0,1566.0
mean,22319.849658,878.250114,-22319.849658,976.824991,2.599307,39.964286,1947.560943,2006.213921
std,5077.709,1052.479872,5077.709,1045.375189,2.02349,12.233689,14.317107,4.487636
min,3982.0,0.0,-32872.0,-64.0,0.008219,8.0,1902.0,1990.0
25%,19191.25,274.0,-26001.5,345.0,1.369863,30.75,1937.0,2004.0
50%,22841.5,524.0,-22841.5,650.0,2.191781,40.0,1947.0,2007.0
75%,26001.5,1044.5,-19191.25,1259.0,3.287671,50.0,1957.0,2010.0
max,32872.0,10870.0,-3982.0,11252.0,40.0,63.0,1993.0,2014.0


In [3]:
clinical_df.head()

Unnamed: 0,primary_diagnosis,tumor_stage,age_at_diagnosis,vital_status,morphology,days_to_death,state,tissue_or_organ_of_origin,days_to_birth,site_of_resection_or_biopsy,days_to_last_follow_up,cigarettes_per_day,years_smoked,gender,year_of_birth,race,ethnicity,year_of_death,bcr_patient_barcode,disease
0,C34.1,stage ia,24477.0,dead,8070/3,371.0,live,C34.1,-24477.0,C34.1,,10.958904,,male,1936.0,white,not hispanic or latino,2004.0,TCGA-18-3406,LUSC
1,C34.1,stage ib,26615.0,dead,8070/3,136.0,live,C34.1,-26615.0,C34.1,,2.191781,,male,1931.0,asian,not hispanic or latino,2003.0,TCGA-18-3407,LUSC
2,C34.3,stage ib,28171.0,dead,8070/3,2304.0,live,C34.3,-28171.0,C34.3,2099.0,1.643836,,female,1927.0,white,not hispanic or latino,,TCGA-18-3408,LUSC
3,C34.1,stage ia,27154.0,alive,8083/3,,live,C34.1,-27154.0,C34.1,3747.0,1.09589,,male,1930.0,white,not hispanic or latino,,TCGA-18-3409,LUSC
4,C34.3,stage iib,29827.0,dead,8070/3,146.0,live,C34.3,-29827.0,C34.3,,,,male,1923.0,not reported,not reported,2004.0,TCGA-18-3410,LUSC


In [4]:
#another way: select where data types are a numpy number
numeric_data = clinical_df.select_dtypes(include=[np.number])
numeric_data.head()

Unnamed: 0,age_at_diagnosis,days_to_death,days_to_birth,days_to_last_follow_up,cigarettes_per_day,years_smoked,year_of_birth,year_of_death
0,24477.0,371.0,-24477.0,,10.958904,,1936.0,2004.0
1,26615.0,136.0,-26615.0,,2.191781,,1931.0,2003.0
2,28171.0,2304.0,-28171.0,2099.0,1.643836,,1927.0,
3,27154.0,,-27154.0,3747.0,1.09589,,1930.0,
4,29827.0,146.0,-29827.0,,,,1923.0,2004.0


### NaN (Not a Number) will bring many of your efforts to a grinding halt. Let's set these to some value instead


In [5]:
from sklearn.preprocessing import MinMaxScaler
    
#This is going to fail spectacularly
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(numeric_data)

In [6]:
#first we must do something about NaN's
filled_numeric_data = numeric_data.fillna(numeric_data.mean())
filled_numeric_data.head()

Unnamed: 0,age_at_diagnosis,days_to_death,days_to_birth,days_to_last_follow_up,cigarettes_per_day,years_smoked,year_of_birth,year_of_death
0,24477.0,371.0,-24477.0,976.824991,10.958904,39.964286,1936.0,2004.0
1,26615.0,136.0,-26615.0,976.824991,2.191781,39.964286,1931.0,2003.0
2,28171.0,2304.0,-28171.0,2099.0,1.643836,39.964286,1927.0,2006.213921
3,27154.0,878.250114,-27154.0,3747.0,1.09589,39.964286,1930.0,2006.213921
4,29827.0,146.0,-29827.0,976.824991,2.599307,39.964286,1923.0,2004.0


### Scaling values 

In [7]:
#scaling numeric values
#in order to keep our index, column names, etc we use .loc to directly update the cell values
filled_numeric_data.loc[:,:] = x_scaled
filled_numeric_data.head()


Unnamed: 0,age_at_diagnosis,days_to_death,days_to_birth,days_to_last_follow_up,cigarettes_per_day,years_smoked,year_of_birth,year_of_death
0,0.709415,0.034131,0.290585,,0.273823,,0.373626,0.583333
1,0.78342,0.012511,0.21658,,0.0546,,0.318681,0.541667
2,0.837279,0.21196,0.162721,0.191145,0.040899,,0.274725,
3,0.802077,,0.197923,0.33678,0.027197,,0.307692,
4,0.8946,0.013431,0.1054,,,,0.230769,0.583333


### Transforming Categorical Variables to One-Hot representations
This is done because learning and numeric representations of catagories (eg: nationality, favorite movie genre, etc) introduces ordinality to the categories which is often not intended. This has unintended consequences on training a model.
For a quick rundown on what ordinality means, look here: 
https://stats.idre.ucla.edu/other/mult-pkg/whatstat/what-is-the-difference-between-categorical-ordinal-and-interval-variables/



In [15]:
#on more recent versions of sklearn, you can use a onehotencoder directly
#from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

other_data = clinical_df.select_dtypes(exclude=[np.number])
encoder = ce.OneHotEncoder(handle_unknown='ignore')

encoder.fit(other_data['tumor_stage'])
print(encoder.category_mapping) #show the mapping from string to integer representation

one_hot_tumor_stages = encoder.transform(other_data['tumor_stage'])
one_hot_tumor_stages.describe()

[{'col': 'tumor_stage', 'mapping': stage ia         1
stage ib         2
stage iib        3
stage iiia       4
stage iv         5
stage iia        6
not reported     7
stage iiib       8
stage ii         9
stage i         10
stage iii       11
NaN             12
stage x         13
stage iiic      14
stage iva       15
stage iic       16
stage ivb       17
i/ii nos        18
stage 0         19
dtype: int64, 'data_type': dtype('O')}]


Unnamed: 0,tumor_stage_1,tumor_stage_2,tumor_stage_3,tumor_stage_4,tumor_stage_5,tumor_stage_6,tumor_stage_7,tumor_stage_8,tumor_stage_9,tumor_stage_10,tumor_stage_11,tumor_stage_12,tumor_stage_13,tumor_stage_14,tumor_stage_15,tumor_stage_16,tumor_stage_17,tumor_stage_18,tumor_stage_19
count,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0,6832.0
mean,0.04435,0.055913,0.079479,0.057231,0.046399,0.105972,0.402957,0.038056,0.038056,0.035861,0.038495,0.005708,0.001903,0.03308,0.003513,0.00966,0.000293,0.002049,0.001025
std,0.205887,0.229771,0.270505,0.2323,0.210364,0.307824,0.490528,0.191346,0.191346,0.185956,0.192403,0.075344,0.043583,0.178858,0.05917,0.097819,0.017108,0.045225,0.031995
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
df = pd.concat([other_data, one_hot_tumor_stages, filled_numeric_data])