In [3]:
import numpy as np
import pandas as pd
from scipy.stats import mode
from DecisionTreeClassifier import DecisionTreeClassifier
from DecisionTreeRegressor import DecisionTreeRegressor

In [4]:
def bootstrap(data,m_datasets):
    '''data: must be a dataframe
       n_samples: number of samples in each dataframe(cant be used if frac is used)
    '''
    datasets = []
    for index in range(m_datasets):
        dataset = data.sample(axis=0,frac=1,replace=True) # sampling by replacement the whole dataset
        datasets.append(dataset)
    return datasets

In [5]:
data = pd.read_csv('../datasets/titanic.csv')

In [6]:
mode_embarked = data.Embarked.mode()[0]
age_median = data.Age.median()
data = data.fillna({'Age':age_median,'Embarked':mode_embarked})

In [7]:
data['Label'] = data.Survived

In [8]:
data = data.drop(['PassengerId','Survived','Name','Ticket','Cabin'],axis=1)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
Label       891 non-null int64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [10]:
class RandomForestClassifier():
    def __init__(self,n_estimators=50,max_depth=None,min_samples=3):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples = min_samples
    
    def train(self,X,y):
        data = pd.concat([X,y],axis=1)
        datasets = bootstrap(data,self.n_estimators)
        self.DecisionTreeClassifier = DecisionTreeClassifier(max_depth=self.max_depth,min_samples=self.min_samples,choose_k_lessthan_d=True)
        self.forest = []
        for index in range(self.n_estimators):
            dataset = datasets[index]
            tree = self.DecisionTreeClassifier.train(dataset.iloc[:,:-1],dataset.iloc[:,-1])
            self.forest.append(tree)
        
        assert self.n_estimators == len(self.forest)
        return self.forest
    
    def predict(self,X):
        #forest_multi_example predict
        predictions = []
        for index in range(len(X)):
            prediction = forest_one_example_predict(X.iloc[index,:],self.forest)
            predictions.append(prediction)
        return np.array(predictions)

In [11]:
def tree_one_example_predict(example,tree):
    question = list(tree.keys())[0]
    col_name,operator,value = question.split()
    if operator == '<=':
        if str(example[col_name]) <= value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    else:
         if str(example[col_name]) == value:
            answer = tree[question][0]
         else:
            answer = tree[question][1]
    if not isinstance(answer, dict):
        return answer
    else:
        residual_tree = answer
        return tree_one_example_predict(example,residual_tree)

In [12]:
def forest_one_example_predict(example,forest):
    predictions = []
    for index in range(len(forest)):
        prediction = tree_one_example_predict(example,forest[index])
        predictions.append(prediction)
    mode_info = mode(predictions)
    mode_prediction = int(mode_info[0])
    #mode_prediction = np.mean(predictions)
    return mode_prediction

In [13]:
clf = RandomForestClassifier(n_estimators=100,min_samples=1)

In [15]:
forest = clf.train(pd.get_dummies(train.iloc[:,:-1],drop_first=True),train.iloc[:,-1])

In [16]:
pred = clf.predict(pd.get_dummies(test.iloc[:,:-1],drop_first=True))

In [17]:
truth = test.values[:,-1]
a = pred == truth
a.sum()/a.size

0.6536312849162011

In [20]:
pred

array([0.56, 0.34, 0.34, 0.88, 0.38, 0.48, 0.26, 0.58, 0.38, 0.7 , 0.24,
       0.04, 0.22, 0.78, 0.32, 0.02, 0.46, 0.34, 0.14, 0.36, 0.66, 0.56,
       0.6 , 0.24, 0.74, 0.22, 0.92, 0.74, 0.36, 0.32, 0.84, 0.62, 0.82,
       0.22, 0.72, 0.4 , 0.22, 0.5 , 0.92, 0.4 , 1.  , 0.54, 0.2 , 0.36,
       0.46, 0.62, 0.2 , 0.72, 0.38, 0.36, 0.24, 0.26, 0.28, 0.38, 0.96,
       1.  , 0.46, 0.3 , 0.76, 0.52, 0.06, 0.46, 0.52, 0.46, 0.28, 0.44,
       0.44, 0.58, 0.54, 0.84, 0.44, 0.58, 0.32, 0.2 , 0.62, 0.3 , 0.54,
       0.02, 0.48, 0.12, 0.22, 0.36, 0.92, 0.5 , 0.74, 0.4 , 0.28, 0.7 ,
       0.32, 0.32, 0.36, 1.  , 0.96, 0.08, 0.62, 0.22, 0.18, 0.46, 0.22,
       0.42, 0.44, 0.48, 0.36, 0.38, 0.34, 0.46, 0.  , 0.36, 0.2 , 0.32,
       0.36, 0.66, 0.36, 0.58, 0.38, 0.22, 0.24, 0.4 , 0.74, 0.22, 0.4 ,
       0.32, 0.9 , 0.92, 0.4 , 0.52, 0.14, 1.  , 0.36, 0.84, 0.3 , 0.66,
       0.66, 0.44, 0.54, 0.24, 0.36, 0.46, 0.34, 0.38, 0.66, 1.  , 0.46,
       0.58, 1.  , 0.64, 0.38, 0.24, 1.  , 0.22, 0.

In [31]:
test['Pred'] = np.round(pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
a = np.array([0.98,0.67,0.45,0.23])

In [24]:
np.round(a)

array([1., 1., 0., 0.])

In [27]:
P.shape

(181,)

In [37]:
test[test.Pred != test.Label].Probab.values

array([0.56, 0.34, 0.58, 0.24, 0.34, 0.66, 0.6 , 0.62, 0.82, 0.22, 0.72,
       0.54, 0.2 , 0.2 , 0.72, 0.38, 0.36, 0.24, 0.38, 0.96, 0.46, 0.76,
       0.52, 0.52, 0.44, 0.44, 0.58, 0.54, 0.58, 0.62, 0.54, 0.22, 0.74,
       0.08, 0.62, 0.46, 0.22, 0.  , 0.2 , 0.66, 0.58, 0.22, 0.4 , 0.22,
       0.32, 0.9 , 0.52, 0.46, 0.58, 0.64, 0.38, 0.16, 0.4 , 0.6 , 0.3 ])

In [39]:
np.random.randint(low=0, high=len(train), size=4)

array([540, 147, 425, 154])

In [14]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(data,test_size=0.2)

In [40]:
from sklearn.ensemble import RandomForestClassifier

In [41]:
clf0 = RandomForestClassifier(n_estimators=100)

In [42]:
clf0.fit(pd.get_dummies(train.iloc[:,:-1],drop_first=True),train.iloc[:,-1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [43]:
pred = clf0.predict(pd.get_dummies(test.iloc[:,:-1],drop_first=True))

In [44]:
truth = test.values[:,-1]
a = pred == truth
a.sum()/a.size

0.8435754189944135

In [45]:
a = np.array([1,2,3,4,5,6,7,8])
np.random.choice(a,size=4,replace=False)

array([2, 1, 7, 4])

In [93]:
8 ** (1/2)

2.8284271247461903

In [27]:
isinstance(np.round(8 ** (1/2)),float)

True