https://civisanalytics.com/blog/data-science/2015/12/17/workflows-in-python-getting-data-ready-to-build-models/

Data source: https://www.drivendata.org/competitions/7/page/25/

<ol>
<li>steps</li>
<li>steps</li>
<li>steps</li>
</ol>

In [16]:
import pandas as pd
import numpy as np

features_df = pd.DataFrame.from_csv("well_data.csv") # features data
labels_df   = pd.DataFrame.from_csv("well_labels.csv") # labels
print( labels_df.head(4) )

         status_group
id                   
69572      functional
8776       functional
34310      functional
67743  non functional


In [17]:
# list all the unique entries of the labels
labels_df['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'], dtype=object)

In [18]:
#function to map the labels from string to ints
def label_map(y):
   if y=="functional":
       return 2
   elif y=="functional needs repair":
       return 1
   else:
       return 0
labels_df = labels_df.applymap(label_map) # map labels

#test
print( labels_df.head() )

       status_group
id                 
69572             2
8776              2
34310             2
67743             0
19728             2


In [19]:
# Check that the labels have been encoded to ints 0,1,2
labels_df['status_group'].unique()

array([2, 0, 1])

In [60]:
%matplotlib inline
labels_df.groupby('status_group').size()#.plot()

status_group
0    22824
1     4317
2    32259
dtype: int64

In [20]:
features_df.head(2)

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
8776,0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [21]:
#features_df.columns.map(lambda x: print(x)) # one way of printing the list of features line by line
features_df.columns

Index(['amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer',
       'longitude', 'latitude', 'wpt_name', 'num_private', 'basin',
       'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward',
       'population', 'public_meeting', 'recorded_by', 'scheme_management',
       'scheme_name', 'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')

In [22]:
#set(features_df['funder'].tolist()) # to find all the unique entries in a column
## example to show how dictionay is populated
#unique_values = np.array(['zack', 'happy', 'funny', 'panda']).tolist()
#unique_values
#transformer_dict = {}
#for ii, value in enumerate(unique_values):
#    transformer_dict[value] = ii

In [23]:
# This transforms the categorical features into ints
# NOTE: This is NOT one-hot-encoding
def transform_feature( df, column_name ):
    
    unique_values = set( df[column_name].tolist() ) # find unique entries in column
    # init dict & populate dic, assign int to each elem in unique_values
    transformer_dict = {}
    for ii, value in enumerate(unique_values):
        transformer_dict[value] = ii

    def label_map(y):
        return transformer_dict[y]
    df[column_name] = df[column_name].apply( label_map )
    return df

### list of column names indicating which columns to transform; 
### this is just a start!  Use some of the print( labels_df.head() )
### output upstream to help you decide which columns get the
### transformation
names_of_columns_to_transform = ["funder", "installer", "wpt_name", "basin", "subvillage",
                    "region", "lga", "ward", "public_meeting", "recorded_by",
                    "scheme_management", "scheme_name", "permit",
                    "extraction_type", "extraction_type_group",
                    "extraction_type_class",
                    "management", "management_group",
                    "payment", "payment_type",
                    "water_quality", "quality_group", "quantity", "quantity_group",
                    "source", "source_type", "source_class",
                    "waterpoint_type", "waterpoint_type_group"]
for column in names_of_columns_to_transform:
    features_df = transform_feature( features_df, column )
    
print( features_df.head() )

       amount_tsh date_recorded  funder  gps_height  installer  longitude  \
id                                                                          
69572        6000    2011-03-14    1308        1390       1475  34.938093   
8776            0    2013-03-06     908        1399       1735  34.698766   
34310          25    2013-02-25     583         686        103  37.460664   
67743           0    2013-01-28     652         263       1072  38.486161   
19728           0    2011-07-13    1105           0       1232  31.130847   

        latitude  wpt_name  num_private  basin          ...            \
id                                                      ...             
69572  -9.856322     27889            0      8          ...             
8776   -2.147466     12883            0      0          ...             
34310  -3.821329      1444            0      3          ...             
67743 -11.155298     25502            0      2          ...             
19728  -1.825359     2

In [24]:
### remove the "date_recorded" column--we're not going to make use
### of time-series data today
features_df.drop("date_recorded", axis=1, inplace=True)

print(features_df.columns.values)

['amount_tsh' 'funder' 'gps_height' 'installer' 'longitude' 'latitude'
 'wpt_name' 'num_private' 'basin' 'subvillage' 'region' 'region_code'
 'district_code' 'lga' 'ward' 'population' 'public_meeting' 'recorded_by'
 'scheme_management' 'scheme_name' 'permit' 'construction_year'
 'extraction_type' 'extraction_type_group' 'extraction_type_class'
 'management' 'management_group' 'payment' 'payment_type' 'water_quality'
 'quality_group' 'quantity' 'quantity_group' 'source' 'source_type'
 'source_class' 'waterpoint_type' 'waterpoint_type_group']


In [25]:
features_df.head(3)

Unnamed: 0_level_0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000,1308,1390,1475,34.938093,-9.856322,27889,0,8,13621,...,3,7,3,0,0,2,1,1,0,0
8776,0,908,1399,1735,34.698766,-2.147466,12883,0,0,10916,...,2,7,3,4,4,9,5,0,0,0
34310,25,583,686,103,37.460664,-3.821329,1444,0,3,17145,...,6,7,3,0,0,7,3,0,3,0


In [26]:
# The features and labels are taken out of their dataframe
# and put into a numpy.ndarray and list, respectively.
X = features_df.as_matrix()
y = labels_df["status_group"].tolist()

In [27]:
# logistic regression
import sklearn.linear_model
import sklearn.cross_validation

clf = sklearn.linear_model.LogisticRegression()
score = sklearn.cross_validation.cross_val_score( clf, X, y )
print( score )


[ 0.62510101  0.62626263  0.62030303]


In [28]:
# random forest
import sklearn.tree
import sklearn.ensemble

clf = sklearn.tree.DecisionTreeClassifier()
score = sklearn.cross_validation.cross_val_score( clf, X, y )
print( score )

[ 0.73373737  0.73338384  0.73626263]


In [29]:
clf = sklearn.ensemble.RandomForestClassifier()
score = sklearn.cross_validation.cross_val_score( clf, X, y )
print( score )

[ 0.78631313  0.78666667  0.78267677]


https://civisanalytics.com/blog/data-science/2015/12/23/workflows-in-python-curating-features-and-thinking-scientifically-about-algorithms/

In [30]:
features_df.payment_type.unique()

array([3, 2, 6, 1, 0, 5, 4])

In [31]:
col = features_df.payment_type.tolist()
col = np.reshape( col, (len(col), 1) )
type(col)
import sklearn.preprocessing
enc = sklearn.preprocessing.OneHotEncoder()
enc.fit(col)

OneHotEncoder(categorical_features='all', dtype=<class 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [32]:
# NOTE: OHE Only takes integers as input
# 'The input to this transformer should be a matrix of integers' - from:
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
# one-hot-encoder function that takes the data frame and the title of a column
# and returns the same data frame but one-hot encoding performed on the indicated feature.
# using the scikit-learn OneHotEncoder object, but pandas also has a function called get_dummies()
#that does effectively the same thing. In fact, I find get_dummies() easier to use in many cases,
#but I still find it worthwhile to see a more “manual” version of the transformation at least once.
import sklearn.preprocessing

def hot_encoder(df, column_name):
    column = df[column_name].tolist()
    column = np.reshape( column, (len(column), 1) )  ### needs to be an N x 1 numpy array
    enc = sklearn.preprocessing.OneHotEncoder()
    enc.fit( column )
    new_column = enc.transform( column ).toarray()
    column_titles = []
    ### making titles for the new columns, and appending them to dataframe
    for ii in range( len(new_column[0]) ):
        this_column_name = column_name+"_"+str(ii)
        df[this_column_name] = new_column[:,ii]
    return df

In [33]:
print(features_df.columns.values)


features_df.drop( "funder", axis=1, inplace=True )
features_df.drop( "installer", axis=1, inplace=True )
features_df.drop( "wpt_name", axis=1, inplace=True )
features_df.drop( "subvillage", axis=1, inplace=True )
features_df.drop( "ward", axis=1, inplace=True )

names_of_columns_to_transform.remove("funder")
names_of_columns_to_transform.remove("installer")
names_of_columns_to_transform.remove("wpt_name")
names_of_columns_to_transform.remove("subvillage")
names_of_columns_to_transform.remove("ward")

# perform OHE for columns of interest that needs to be transformed
for feature in names_of_columns_to_transform:
    features_df = hot_encoder( features_df, feature )

features_df.head()

['amount_tsh' 'funder' 'gps_height' 'installer' 'longitude' 'latitude'
 'wpt_name' 'num_private' 'basin' 'subvillage' 'region' 'region_code'
 'district_code' 'lga' 'ward' 'population' 'public_meeting' 'recorded_by'
 'scheme_management' 'scheme_name' 'permit' 'construction_year'
 'extraction_type' 'extraction_type_group' 'extraction_type_class'
 'management' 'management_group' 'payment' 'payment_type' 'water_quality'
 'quality_group' 'quantity' 'quantity_group' 'source' 'source_type'
 'source_class' 'waterpoint_type' 'waterpoint_type_group']


Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,num_private,basin,region,region_code,district_code,lga,...,waterpoint_type_3,waterpoint_type_4,waterpoint_type_5,waterpoint_type_6,waterpoint_type_group_0,waterpoint_type_group_1,waterpoint_type_group_2,waterpoint_type_group_3,waterpoint_type_group_4,waterpoint_type_group_5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000,1390,34.938093,-9.856322,0,8,7,11,5,26,...,0,0,0,0,1,0,0,0,0,0
8776,0,1399,34.698766,-2.147466,0,0,13,20,2,34,...,0,0,0,0,1,0,0,0,0,0
34310,25,686,37.460664,-3.821329,0,3,8,21,4,91,...,1,0,0,0,1,0,0,0,0,0
67743,0,263,38.486161,-11.155298,0,2,11,90,63,10,...,1,0,0,0,1,0,0,0,0,0
19728,0,0,31.130847,-1.825359,0,0,2,18,1,31,...,0,0,0,0,1,0,0,0,0,0


In [34]:
features_df.shape

(59400, 3031)

In [35]:
X = features_df # update the training data to that that has been OHE
X.shape # note the number of columns representing the number of features!

(59400, 3031)

In [36]:
# select the best features 'or so'
import sklearn.feature_selection

select = sklearn.feature_selection.SelectKBest(k=800)
X = features_df
selected_X = select.fit_transform(X, y)

print( selected_X.shape )



(59400, 800)


In [48]:
selected_X0 = select.fit(X, y)



In [37]:
# from warning above shows that column 12 and 191 are CONSTANT
# test plot their value 
#X[X.columns[12]]
#X[X.columns[191]]
# NOTE: sklearn.feature_selection

https://civisanalytics.com/blog/data-science/2016/01/06/workflows-python-using-pipeline-gridsearchcv-for-compact-code/

In [38]:
# re-run the random forest with just a X features
clf = sklearn.ensemble.RandomForestClassifier()
score = sklearn.cross_validation.cross_val_score( clf, selected_X, y )
print( score )

[ 0.78409091  0.78409091  0.7770202 ]


### Pipeline Explanation
- Import pipeline with:
- import sklearn.pipeline
<ol>
<li>Import the required modules including the pipeline module</li>
<li>Instantiate the required modules like select best and random forest classifier etc</li>
<li>Build the steps list of tuples ('customNameOfStep', instantiatedStepVariableName) </li>
<li>Instantiate pipeline </li>
</ol>

In [47]:
type(steps)

list

In [39]:
# Data setup
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.33, random_state=42)

import sklearn.pipeline

# initialize the modules that will be used in the pipeline
# select the minimal k best features
select = sklearn.feature_selection.SelectKBest(k=50)
# initialize classifier
clf = sklearn.ensemble.RandomForestClassifier()

# Build the steps list of tuples ('customNameOfStep', instantiatedStepVariableName)
# Note the customNameOfStep - is a user selected name
# instantiatedStepVariableName - this is the variable name of the instantiated 
steps = [('feature_selection', select),
        ('random_forest', clf)]

# Instantiate the pipeline
pipeline = sklearn.pipeline.Pipeline(steps)

### fit your pipeline on X_train and y_train
pipeline.fit( X_train, y_train )
### call pipeline.predict() on your X_test data to make a set of test predictions
y_prediction = pipeline.predict( X_test )
### test your predictions using sklearn.classification_report()
report = sklearn.metrics.classification_report( y_test, y_prediction )
### and print the report
print(report)

  313  316  318  319  323  354  364  365  378  380  391  409  425  427  442
  444  458  463  500  506  507  520  528  533  534  546  548  559  566  570
  572  575  586  589  593  594  599  600  603  613  626  628  637  640  648
  676  682  689  690  696  700  702  709  711  712  719  727  729  751  762
  770  771  772  779  789  794  802  808  809  810  813  819  827  838  844
  846  850  868  874  887  891  893  901  907  914  915  919  936  942  952
  955  960  979 1000 1011 1028 1038 1042 1045 1048 1055 1056 1090 1098 1099
 1102 1105 1116 1117 1126 1128 1136 1156 1159 1165 1176 1195 1242 1246 1249
 1253 1265 1271 1307 1309 1312 1319 1340 1342 1356 1370 1373 1413 1431 1440
 1449 1452 1458 1460 1480 1489 1490 1491 1505 1511 1528 1535 1542 1552 1553
 1622 1628 1644 1652 1670 1672 1678 1691 1700 1704 1711 1722 1747 1760 1779
 1780 1783 1804 1817 1819 1820 1822 1825 1838 1843 1846 1849 1850 1853 1869
 1870 1873 1879 1912 1920 1932 1953 1954 1955 1958 1964 1973 1984 1986 2017
 2019 2031 2

             precision    recall  f1-score   support

          0       0.76      0.77      0.76      7458
          1       0.38      0.35      0.37      1425
          2       0.80      0.80      0.80     10719

avg / total       0.75      0.76      0.75     19602



https://civisanalytics.com/blog/data-science/2016/01/06/workflows-python-using-pipeline-gridsearchcv-for-compact-code/

In [40]:
pipeline

Pipeline(steps=[('feature_selection', SelectKBest(k=50, score_func=<function f_classif at 0x1089da488>)), ('random_forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

  270  274  283  289  290  301  313  316  317  318  319  323  331  336  347
  348  350  351  354  357  364  365  371  376  377  378  380  385  391  395
  396  407  409  425  427  429  442  444  453  458  463  500  506  507  517
  520  521  525  528  533  534  542  546  547  548  555  559  560  566  568
  570  572  573  575  580  583  584  586  589  593  594  596  599  600  603
  607  609  611  613  617  620  626  628  629  630  637  640  647  648  651
  652  656  661  666  668  672  676  682  683  689  690  696  698  700  701
  702  705  707  709  710  711  712  719  723  725  727  728  729  735  751
  762  770  771  772  779  789  794  802  803  808  809  810  813  814  817
  818  819  827  831  833  838  839  844  846  850  857  862  868  871  874
  882  887  891  893  901  903  907  914  915  919  921  932  936  940  942
  951  952  955  958  960  965  977  979  980  986 1000 1006 1007 1011 1020
 1027 1028 1038 1042 1045 1048 1055 1056 1075 1090 1093 1098 1099 1102 1105
 1116 1117 1

In [44]:
print(report)

             precision    recall  f1-score   support

          0       0.83      0.69      0.76      7458
          1       0.63      0.19      0.30      1425
          2       0.76      0.91      0.83     10719

avg / total       0.77      0.78      0.76     19602



### Stratified Shuffle Split Sample

In [67]:
from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(y,2, test_size=0.2, random_state=42)
for train_index, test_index in sss:
    print("TRAIN indices:", train_index, "TEST indices:", test_index)

TRAIN indices: [56033 42149 50644 ..., 23965 52811 37112] TEST indices: [52237  4107 28169 ..., 54782  2110  3833]
TRAIN indices: [31788 54920  3600 ..., 37340 40168 39623] TEST indices: [21495  5759  4607 ..., 28518   447 26835]


In [139]:
X.shape

(59400, 3031)

In [173]:
#X_train = X[train_index]
print(len(train_index))
print(len(test_index))
print(len(train_index) + len(test_index))

47520
11880
59400


In [192]:
X_train = X.iloc[train_index,]
X_test = X.iloc[test_index,]
y_train = labels_df.values[train_index]
y_test = labels_df.values[test_index]
print(X_train.shape)
print(X_test.shape)

(47520, 3031)
(11880, 3031)


In [193]:
type(labels_df)

pandas.core.frame.DataFrame

In [166]:
#y_train = y_train.tolist()
#y_test = y_test.tolist()
#print(len(y_train))
#print(len(y_test))

47520
11880


In [169]:
X_train.dropna().shape

(47520, 3031)

In [157]:
X_test.dropna().shape

(11880, 3031)

In [200]:
len(y_train)

47520

In [233]:
y_test = labels_df.iloc[test_index] # iloc does not give those spurious NaNs
y_test0 = y_test['status_group'].tolist()
print(type(y_test0))
print(len(y_test0))

<class 'list'>
11880


In [236]:
y_train = labels_df.iloc[train_index]
y_train0 = y_train['status_group'].tolist()
print(type(y_train0))
print(len(y_train0))

<class 'list'>
47520


In [205]:
#out = np.concatenate(input_list).ravel().tolist()

In [237]:
# random forest
import sklearn.tree
import sklearn.ensemble

clf = sklearn.tree.DecisionTreeClassifier()
score = sklearn.cross_validation.cross_val_score( clf, X_train, y_train0 )
print( score )

[ 0.74788537  0.75042616  0.74272366]


In [238]:
clf = sklearn.ensemble.RandomForestClassifier()
score = sklearn.cross_validation.cross_val_score( clf, X_train, y_train0 )
print( score )

[ 0.77477591  0.77441758  0.77214471]


In [239]:
import sklearn.grid_search


parameters = dict(feature_selection__k=[100], # could be more [100, 200] 
              #random_forest__n_estimators=[50], # [50, 100, 200] # the number of trees
              random_forest__min_samples_split=[10], # [2, 3, 4, 5, 10]
              random_forest__min_samples_leaf = [5, 10],
              random_forest__n_estimators = [10, 20])

cv = sklearn.grid_search.GridSearchCV(pipeline, param_grid=parameters)

cv.fit(X_train, y_train0)
y_predictions = cv.predict(X_test)
report = sklearn.metrics.classification_report( y_test0, y_predictions )

  287  289  290  292  295  298  309  316  325  327  328  331  341  346  348
  364  369  377  378  380  392  395  398  409  425  427  434  448  452  457
  463  467  474  494  495  501  503  505  507  508  511  515  529  534  552
  556  572  576  594  596  602  603  606  607  608  609  612  620  626  628
  630  636  637  651  662  665  674  682  689  691  700  702  710  711  713
  721  725  727  734  735  747  751  752  753  770  772  802  811  813  828
  829  836  848  862  889  891  900  903  907  931  932  944  954  955  960
  963  979 1000 1001 1011 1016 1021 1033 1035 1037 1048 1053 1082 1090 1093
 1097 1098 1105 1116 1117 1120 1127 1128 1131 1136 1155 1157 1159 1162 1165
 1167 1172 1176 1180 1184 1185 1194 1195 1210 1216 1238 1249 1264 1312 1316
 1322 1328 1334 1336 1340 1356 1357 1363 1367 1373 1385 1393 1404 1408 1416
 1424 1427 1431 1435 1437 1451 1452 1454 1458 1468 1479 1490 1499 1507 1511
 1512 1525 1528 1535 1548 1552 1553 1556 1561 1574 1576 1579 1580 1581 1587
 1613 1615 1

In [240]:
print(report)

             precision    recall  f1-score   support

          0       0.83      0.70      0.76      4565
          1       0.68      0.20      0.30       863
          2       0.76      0.91      0.83      6452

avg / total       0.78      0.78      0.76     11880

