## Let's begin: prepare your data

Start by importing the mammographic_masses.data.txt file into a Pandas dataframe (hint: use read_csv) and take a look at it.

In [1]:
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import logging
from src.logging import logger
logger.setLevel(logging.DEBUG)

In [4]:
dataset_name='mammographic'

In [5]:
from src.paths import raw_data_path, interim_data_path, processed_data_path


In [6]:
from src.data import RawDataset
mammo_data = RawDataset(dataset_name)
mammo_data.add_url(url="https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data")

In [7]:
mammo_data.add_url(url='https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.names',
                file_name=f'{dataset_name}.readme',
                name='DESCR')

Next you'll need to convert the Pandas dataframes into numpy arrays that can be used by scikit_learn. Create an array that extracts only the feature data we want to work with (age, shape, margin, and density) and another array that contains the classes (severity). You'll also need an array of the feature name labels.

Some of our models require the input data to be normalized, so go ahead and normalize the attribute data. Hint: use preprocessing.StandardScaler().

In [8]:
from src import workflow
from src.data.localdata import process_csv
mammo_data.load_function = process_csv

In [9]:
from src.data import Dataset
workflow.add_raw_dataset(mammo_data)
mammo_df = Dataset.from_raw(dataset_name, force=True)
print(str(mammo_df))

2018-11-17 13:23:36,578 - fetch - DEBUG - No file_name specified. Inferring mammographic_masses.data from URL
2018-11-17 13:23:36,580 - fetch - DEBUG - mammographic_masses.data exists, but no hash to check. Setting to sha1:5cfd64b52520391fb1f2d2d5d115d10c8c862046
2018-11-17 13:23:36,582 - fetch - DEBUG - mammographic.readme exists, but no hash to check. Setting to sha1:d8f3a7c205397d619eadfecf990dd84380115325
2018-11-17 13:23:36,583 - fetch - DEBUG - Copying mammographic_masses.data
2018-11-17 13:23:36,585 - fetch - DEBUG - Copying mammographic.readme
2018-11-17 13:23:36,588 - localdata - DEBUG - load_csv()-->loading csv file=/Users/mei/Documents/courses/bbconf/mammogram/data/interim/mammographic/mammographic_masses.data ...
2018-11-17 13:23:36,599 - datasets - DEBUG - Wrote Dataset Metadata: 252508f3d18124f88fc1204c8d0cb9c439dd032f.metadata
2018-11-17 13:23:36,601 - datasets - DEBUG - Wrote Dataset: 252508f3d18124f88fc1204c8d0cb9c439dd032f.dataset


<Dataset: mammographic, data.shape=(830, 4), target.shape=(830,), metadata=['descr', 'dataset_name', 'hash_type', 'data_hash', 'target_hash']>


In [10]:
workflow.available_datasets()
workflow.get_transformer_list()

[{'raw_dataset_name': 'mammographic',
  'transformations': [['train_test_split',
    {'random_state': 1, 'test_size': 0.25}]]}]

In [11]:
workflow.available_transformers()

['index_to_date_time', 'pivot', 'train_test_split']

In [12]:
transform_pipeline = [("train_test_split", {'random_state':1, 'test_size':0.25})]
workflow.add_transformer(from_raw=dataset_name,
                         suppress_output=True,
                         transformations=transform_pipeline)

In [13]:
workflow.make_data()
logger.setLevel(logging.DEBUG)

2018-11-17 13:23:36,843 - transform_data - DEBUG - Creating Dataset from Raw: mammographic with opts {}
2018-11-17 13:23:36,844 - datasets - DEBUG - process() called before unpack()
2018-11-17 13:23:36,845 - datasets - DEBUG - unpack() called before fetch()
2018-11-17 13:23:36,845 - fetch - DEBUG - No file_name specified. Inferring mammographic_masses.data from URL
2018-11-17 13:23:36,848 - fetch - DEBUG - mammographic_masses.data exists, but no hash to check. Setting to sha1:5cfd64b52520391fb1f2d2d5d115d10c8c862046
2018-11-17 13:23:36,850 - fetch - DEBUG - mammographic.readme exists, but no hash to check. Setting to sha1:d8f3a7c205397d619eadfecf990dd84380115325
2018-11-17 13:23:36,852 - fetch - DEBUG - Copying mammographic_masses.data
2018-11-17 13:23:36,854 - fetch - DEBUG - Copying mammographic.readme
2018-11-17 13:23:36,859 - datasets - DEBUG - Found cached Dataset for mammographic: 252508f3d18124f88fc1204c8d0cb9c439dd032f
2018-11-17 13:23:36,859 - transform_data - DEBUG - Applying

In [14]:
mammo_df.data

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

In [15]:
workflow.available_datasets()

['mammographic_test', 'mammographic_train']

In [16]:
ds_train = Dataset.load('mammographic_train')

In [17]:
ds_train.data.shape

(622, 4)

In [18]:
ds_train.target

array([1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,

In [19]:
ds_test = Dataset.load('mammographic_test')

In [20]:
ds_test.data.shape

(208, 4)

add the **Linear Support Vector Classifier** from scikit-learn.

In [21]:
from sklearn.svm import LinearSVC

In [22]:
model = LinearSVC(random_state=42)

In [23]:
model.fit(ds_train.data, ds_train.target)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [24]:
model = LinearSVC(random_state=42, max_iter=200000)
model.fit(ds_train.data, ds_train.target)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=200000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

### Use the model to predict phoneme classes

In [25]:
lsvc_prediction = model.predict(ds_test.data);
lsvc_prediction[:20]

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1])

In [26]:
model.score(ds_test.data, ds_test.target)

0.7740384615384616

In [27]:
from sklearn.metrics import accuracy_score

In [28]:
accuracy_score(ds_test.target, lsvc_prediction)

0.7740384615384616

### Train Models


In [29]:
workflow.available_algorithms()

['linearSVC',
 'gradientBoostingClassifier',
 'randomForestClassifier',
 'logisticRegression']

In [30]:
workflow.add_model(dataset_name='mammographic_train',
                   algorithm_name="linearSVC",
                   algorithm_params={'random_state': 42, 'max_iter': 200000})

In [31]:
workflow.add_model(
    dataset_name = 'mammographic_train',
    algorithm_name = 'gradientBoostingClassifier',
    algorithm_params = {'random_state': 42}    
)

In [32]:
workflow.add_model(
    dataset_name = 'mammographic_train',
    algorithm_name = 'randomForestClassifier',
    algorithm_params = {'random_state': 42, 'n_estimators': 10}  
)

In [33]:
workflow.get_model_list()

[{'algorithm_name': 'linearSVC',
  'algorithm_params': {'max_iter': 200000, 'random_state': 42},
  'dataset_name': 'mammographic_train',
  'run_number': 1},
 {'algorithm_name': 'gradientBoostingClassifier',
  'algorithm_params': {'random_state': 42},
  'dataset_name': 'mammographic_train',
  'run_number': 1},
 {'algorithm_name': 'randomForestClassifier',
  'algorithm_params': {'n_estimators': 10, 'random_state': 42},
  'dataset_name': 'mammographic_train',
  'run_number': 1}]

In [34]:
workflow.make_train()

2018-11-17 13:23:38,096 - model_list - INFO - Creating model: linearSVC_mammographic_train_1
2018-11-17 13:23:38,125 - utils - DEBUG - PROCESS_TIME:                         train_model    25.5 ms
2018-11-17 13:23:38,130 - model_list - INFO - Creating model: linearSVC_mammographic_train_1
2018-11-17 13:23:38,161 - utils - DEBUG - PROCESS_TIME:                         train_model    27.3 ms
2018-11-17 13:23:38,165 - model_list - INFO - Creating model: gradientBoostingClassifier_mammographic_train_1
2018-11-17 13:23:38,258 - utils - DEBUG - PROCESS_TIME:                         train_model    89.6 ms
2018-11-17 13:23:38,375 - model_list - INFO - Creating model: linearSVC_mammographic_train_1
2018-11-17 13:23:38,404 - utils - DEBUG - PROCESS_TIME:                         train_model    25.8 ms
2018-11-17 13:23:38,408 - model_list - INFO - Creating model: gradientBoostingClassifier_mammographic_train_1
2018-11-17 13:23:38,498 - utils - DEBUG - PROCESS_TIME:                         train_mod

{'linearSVC_mammographic_train_1': {'algorithm_name': 'linearSVC',
  'algorithm_params': {'C': 1.0,
   'class_weight': None,
   'dual': True,
   'fit_intercept': True,
   'intercept_scaling': 1,
   'loss': 'squared_hinge',
   'max_iter': 200000,
   'multi_class': 'ovr',
   'penalty': 'l2',
   'random_state': 42,
   'tol': 0.0001,
   'verbose': 0},
  'dataset_name': 'mammographic_train',
  'run_number': 1,
  'data_hash': '30d925e88b3e900c0b618753fb5975917001526e',
  'target_hash': '6302541ca7e5a33e32ae795c703bb59c4d1b4c16',
  'start_time': 1542421418.378386,
  'duration': 0.025784969329833984,
  'model_hash': '3ca53311b1153ad1512b722e16eca1bba6425eb0'},
 'gradientBoostingClassifier_mammographic_train_1': {'algorithm_name': 'gradientBoostingClassifier',
  'algorithm_params': {'criterion': 'friedman_mse',
   'init': None,
   'learning_rate': 0.1,
   'loss': 'deviance',
   'max_depth': 3,
   'max_features': None,
   'max_leaf_nodes': None,
   'min_impurity_decrease': 0.0,
   'min_impurity_

In [35]:
workflow.available_models()

['gradientBoostingClassifier_mammographic_train_1',
 'linearSVC_mammographic_train_1',
 'randomForestClassifier_mammographic_train_1']

In [36]:
# load up the trained model
from src.models.train import load_model

tm, tm_metadata = load_model(model_name='linearSVC_mammographic_train_1')
tm

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=200000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

### Predicting Phonemes

In [37]:
## Set up predictions using all of the available models
for model in workflow.available_models():
    workflow.add_prediction(
        dataset_name = 'mammographic_test',
        model_name = model,
        is_supervised = True,
        output_dataset = 'mammographic_test' + '_' + model
    )

In [38]:
workflow.get_prediction_list()

[{'dataset_name': 'mammographic_test',
  'is_supervised': True,
  'model_name': 'gradientBoostingClassifier_mammographic_train_1',
  'output_dataset': 'mammographic_test_gradientBoostingClassifier_mammographic_train_1'},
 {'dataset_name': 'mammographic_test',
  'is_supervised': True,
  'model_name': 'linearSVC_mammographic_train_1',
  'output_dataset': 'mammographic_test_linearSVC_mammographic_train_1'},
 {'dataset_name': 'mammographic_test',
  'is_supervised': True,
  'model_name': 'randomForestClassifier_mammographic_train_1',
  'output_dataset': 'mammographic_test_randomForestClassifier_mammographic_train_1'}]

In [39]:
workflow.make_predict()

2018-11-17 13:23:39,080 - predict - DEBUG - Predict: Applying gradientBoostingClassifier_mammographic_train_1 to mammographic_test
2018-11-17 13:23:39,082 - predict - INFO - Experiment has already been run. Returning Cached Result
2018-11-17 13:23:39,087 - predict - DEBUG - Predict: Applying linearSVC_mammographic_train_1 to mammographic_test
2018-11-17 13:23:39,089 - predict - INFO - Experiment has already been run. Returning Cached Result
2018-11-17 13:23:39,115 - predict - DEBUG - Predict: Applying randomForestClassifier_mammographic_train_1 to mammographic_test
2018-11-17 13:23:39,117 - predict - INFO - Experiment has already been run. Returning Cached Result


{'mammographic_test_gradientBoostingClassifier_mammographic_train_1': {'dataset_name': 'mammographic_test_gradientBoostingClassifier_mammographic_train_1',
  'hash_type': 'sha1',
  'data_hash': 'fd417f4d3049afba88401907d2dfb489f0114da2',
  'target_hash': '8e373abca080cbfc7e6be95259c97a3dd534fb10',
  'experiment': {'model_name': 'gradientBoostingClassifier_mammographic_train_1',
   'dataset_name': 'mammographic_test',
   'run_number': 1,
   'hash_type': 'sha1',
   'input_data_hash': '42fb6529ee8448155ced11692e185627583afa25',
   'input_target_hash': '8e373abca080cbfc7e6be95259c97a3dd534fb10',
   'model_hash': 'e5bac7a292e23466c3528e2e9b7798877d1b3ba0',
   'start_time': 1542420295.228299,
   'duration': 0.0006542205810546875}},
 'mammographic_test_linearSVC_mammographic_train_1': {'dataset_name': 'mammographic_test_linearSVC_mammographic_train_1',
  'hash_type': 'sha1',
  'data_hash': '4d7e7a3adc9e8924426b9579884abcdd8c202e77',
  'target_hash': '8e373abca080cbfc7e6be95259c97a3dd534fb10',

In [40]:
workflow.available_predictions()

['mammographic_test_gradientBoostingClassifier_mammographic_train_1',
 'mammographic_test_linearSVC_mammographic_train_1',
 'mammographic_test_randomForestClassifier_mammographic_train_1']

### Analyze the prediction and summary statistics

In [41]:
from src.paths import summary_path

In [42]:
workflow.available_scorers()

['accuracy_score']

In [43]:
workflow.available_analyses()

['score_predictions']

In [44]:
workflow.add_analysis(analysis_name='score_predictions')
workflow.get_analysis_list()

[{'analysis_name': 'score_predictions', 'analysis_params': {}}]

In [45]:
workflow.make_analysis()

2018-11-17 13:23:39,447 - analysis - INFO - Performing Analysis: score_predictions
2018-11-17 13:23:39,453 - analysis - INFO - Scoring: Applying accuracy_score to mammographic_test_gradientBoostingClassifier_mammographic_train_1
2018-11-17 13:23:39,460 - analysis - INFO - Scoring: Applying accuracy_score to mammographic_test_linearSVC_mammographic_train_1
2018-11-17 13:23:39,467 - analysis - INFO - Scoring: Applying accuracy_score to mammographic_test_randomForestClassifier_mammographic_train_1
2018-11-17 13:23:39,474 - analysis - INFO - Writing Analysis to score_predictions.csv


{'score_predictions.csv': {'analysis_name': 'score_predictions',
  'analysis_params': {'predictions_list': ['mammographic_test_gradientBoostingClassifier_mammographic_train_1',
    'mammographic_test_linearSVC_mammographic_train_1',
    'mammographic_test_randomForestClassifier_mammographic_train_1'],
   'score_list': ['accuracy_score']}}}

In [46]:
df = pd.read_csv(summary_path / 'score_predictions.csv')
df

Unnamed: 0,algorithm_name,dataset_name,model_name,run_number,score,score_name
0,gradientBoostingClassifier,mammographic_test,gradientBoostingClassifier_mammographic_train_1,1,0.75,accuracy_score
1,linearSVC,mammographic_test,linearSVC_mammographic_train_1,1,0.774038,accuracy_score
2,randomForestClassifier,mammographic_test,randomForestClassifier_mammographic_train_1,1,0.75,accuracy_score
