In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn import metrics 

In [3]:
df = pd.read_csv('kp_classify.csv')
df.head()

Unnamed: 0,match,image_tag,pos,point_x,point_y,size,angle,response,octave
0,0,S6626S03,right-side,48,147,31.0,306.542664,0.00044,0
1,0,S6278S04,right-side,58,69,31.0,311.590851,7.8e-05,0
2,0,S6626S03,right-side,43,146,37.200001,334.363556,0.000881,1
3,0,S6278S04,right-side,47,59,44.640003,19.241217,0.000496,2
4,0,S6626S03,right-side,45,139,37.200001,355.814758,0.001043,1


In [4]:
# dividing the datasets into two parts i.e. training datasets and test datasets
mapping = {'right-side': 0, 'left-side': 1, 'bottom': 2, 'complete': 3}
df['pos'] = df['pos'].map(mapping)

y = df['match']
X = df.drop(labels=['match', 'image_tag'], axis=1)


print(y[:2])
print(X[:2])

0    0
1    0
Name: match, dtype: int64
   pos  point_x  point_y  size       angle  response  octave
0    0       48      147  31.0  306.542664  0.000440       0
1    0       58       69  31.0  311.590851  0.000078       0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)

In [6]:
def calculate_accuracy(X_train, X_test, y_train, y_test, parameters = {'n_estimators': 100,
                                     'max_depth': None,
                                     'max_features': 'sqrt',
                                     'criterion': 'gini',
                                     'bootstrap': True,
                                     'class_weight' : "balanced",
                                     'max_samples': None
                                     }):
    clf = RandomForestClassifier(n_estimators = 100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test) 

    # using metrics module for accuracy calculation
    accuracy = metrics.accuracy_score(y_test, y_pred)
    feature_imp = pd.Series(clf.feature_importances_, index = X.keys()).sort_values(ascending = False)
    return accuracy, feature_imp, clf

In [7]:
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 15, None],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': ['balanced', 'balanced_subsample'],
    'max_samples': [None]
}

In [11]:
for n_estimators in [200]:
    for max_depth in [None]:
        for class_weight in ['balanced']:
            parameters = {'n_estimators': n_estimators,'max_depth': max_depth,'class_weight' : class_weight}
            print(f"Test {n_estimators} + {max_depth} + {class_weight}", calculate_accuracy(X_train, X_test, y_train, y_test, parameters))

Test 100 + None + balanced (0.8931414979021441, angle       0.307566
response    0.303744
point_y     0.182852
point_x     0.175783
pos         0.012423
octave      0.008936
size        0.008695
dtype: float64)
Test 200 + None + balanced (0.8932901648551324, angle       0.306026
response    0.303534
point_y     0.182346
point_x     0.176554
pos         0.012745
size        0.009415
octave      0.009380
dtype: float64)
Test 300 + None + balanced (0.8932571277544683, angle       0.307778
response    0.304208
point_y     0.183348
point_x     0.176720
pos         0.012288
size        0.007853
octave      0.007805
dtype: float64)
