In [61]:
import numpy as np
from sklearn import preprocessing
from sklearn import feature_extraction
from sklearn import neighbors
from sklearn import datasets
from sklearn import ensemble
import pandas as pd

  from numpy.core.umath_tests import inner1d


In [3]:
#Feature with nominal classes
feature_labeler = np.array([["word"],
                           ["second"],
                           ["word"],
                           ["third"],
                           ["word"]])
labeler = preprocessing.LabelBinarizer()
labeler.fit_transform(feature_labeler)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [10]:
#output the classes
labeler.classes_

#output all 
labeler.inverse_transform(labeler.transform(feature_labeler))

#pandas.core.frame.DataFrame
pd.get_dummies(feature_labeler[:,0])

Unnamed: 0,second,third,word
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [11]:
#multi class features

feature_multi = np.array([["Chemistry", "Bio-Chem"],
                         ["Physics", "Astro-Physics"],
                         ["Chemistry", "Bio-Chem"],
                         ["Biology", "Bio-Chem"],
                         ["Chemistry", "Electrochemistry"]])

labeler_multi = preprocessing.MultiLabelBinarizer()

labeler_multi.fit_transform(feature_multi)

array([[0, 1, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 0]])

In [12]:
labeler_multi.classes_

array(['Astro-Physics', 'Bio-Chem', 'Biology', 'Chemistry',
       'Electrochemistry', 'Physics'], dtype=object)

In [16]:
#Turn scaling text based features into numerical data

txt_df = pd.DataFrame({"Score":["Low","Low","Medium","Medium","High"]})

scale_text_to_num = {"Low":1,
                    "Medium":2,
                    "High":3}

num_series = txt_df["Score"].replace(scale_text_to_num)
type(num_series)


pandas.core.series.Series

In [23]:
#Dictionary converted it into feature matrix

data_dict = ({"logic": 5, "gate": 7},
            {"logic": 3, "XOR": 2},
            {"logic": 1, "AND": 2},
            {"logic": 2, "gate": 1})

dictvectorizer = feature_extraction.DictVectorizer(sparse=False)

features_dict = dictvectorizer.fit_transform(data_dict)

features_dict

array([[0., 0., 7., 5.],
       [0., 2., 0., 3.],
       [2., 0., 0., 1.],
       [0., 0., 1., 2.]])

In [24]:
#get feature names. Alpha order
names = dictvectorizer.get_feature_names()
names

['AND', 'XOR', 'gate', 'logic']

In [27]:
dict_df = pd.DataFrame(features_dict, columns=names)
dict_df

Unnamed: 0,AND,XOR,gate,logic
0,0.0,0.0,7.0,5.0
1,0.0,2.0,0.0,3.0
2,2.0,0.0,0.0,1.0
3,0.0,0.0,1.0,2.0


In [45]:
#Categorical feature containing missing values, and replace it with predicted values.

X = np.array([[0, 3.10, 2.45],
              [1, .18, .33],
              [0, 2.22, 3.27],
              [1, -.21, -.19]])

#create a 0 and a 1
X_with_nan = np.array([[np.nan, 2.87, 3.31],
                       [np.nan, 0.67, -0.22]])

#possibe weight uniform, distance, or "callable" user defined function
clf_kn = neighbors.KNeighborsClassifier(3, weights='distance')

trained_kn_model = clf.fit(X[:,1:], X[:,0])

imputed_values = trained_kn_model.predict(X_with_nan[:,1:])

X_with_imputed = np.hstack((imputed_values.reshape(-1,1), X_with_nan[:,1:]))

np.vstack((X_with_imputed, X))

array([[ 0.  ,  2.87,  3.31],
       [ 1.  ,  0.67, -0.22],
       [ 0.  ,  3.1 ,  2.45],
       [ 1.  ,  0.18,  0.33],
       [ 0.  ,  2.22,  3.27],
       [ 1.  , -0.21, -0.19]])

In [47]:
#alt solution is to just fill with most frequest feature value on axis
#Join matricies
X_complete = np.vstack((X_with_nan, X))

imputer = preprocessing.Imputer(strategy='most_frequent', axis=0)

imputer.fit_transform(X_complete)

array([[ 0.  ,  2.87,  3.31],
       [ 0.  ,  0.67, -0.22],
       [ 0.  ,  3.1 ,  2.45],
       [ 1.  ,  0.18,  0.33],
       [ 0.  ,  2.22,  3.27],
       [ 1.  , -0.21, -0.19]])

In [64]:
#imbalanced classes

iris = datasets.load_iris()

features_iris = iris.data

targets_iris = iris.target

feature_set = features_iris[40:,:]
target_set = targets_iris[40:]

target_where = np.where((target_set == 0),0,1)
target_where

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [63]:
#create weights

weights = {0:.9,1:0.1}
ensemble.RandomForestClassifier(class_weight=weights)

RandomForestClassifier(bootstrap=True, class_weight={0: 0.9, 1: 0.1},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [79]:
i_class0 = np.where(target_where == 0)[0]
i_class1 = np.where(target_where == 1)[0]

length_class0 = len(i_class0)
length_class1 = len(i_class1)

#downsampling
i_class1_downsample = np.random.choice(i_class1, size=length_class0, replace=False)

#vector
np.hstack((target_where[i_class0], target_where[i_class1_downsample]))

#matrix
np.vstack((features_iris[i_class0,:], features_iris[i_class1_downsample,:]))[0:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [80]:
i_class0_upsampled = np.random.choice(i_class0, size=length_class1, replace=True)

#vector
np.concatenate((target_where[i_class0_upsampled], target_where[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [81]:
#matrix
np.vstack((features_iris[i_class0_upsampled,:], features_iris[i_class1,:]))[0:5]

array([[4.9, 3. , 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [4.6, 3.4, 1.4, 0.3],
       [5.4, 3.9, 1.7, 0.4]])