In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas
import random
import matplotlib.pyplot as plt
import matplotlib
import copy
from catboost import CatBoostClassifier

### Information Gain

In [2]:
def gini(x):
    """Return the Gini impurity of the label collection ``x``.

    The impurity is ``sum(p * (1 - p))`` over the distinct values of ``x``,
    where ``p`` is the number of occurrences of a value divided by ``len(x)``.
    """
    occurrences = np.unique(x, return_counts=True)[1]
    shares = occurrences / len(x)
    return (shares * (1 - shares)).sum()
    
def entropy(x):
    """Return the Shannon entropy (base 2) of the label collection ``x``.

    Computed as ``-sum(p * log2(p))`` over the distinct values of ``x``, where
    ``p`` is the number of occurrences of a value divided by ``len(x)``.
    """
    occurrences = np.unique(x, return_counts=True)[1]
    shares = occurrences / len(x)
    weighted_logs = shares * np.log2(shares)
    return -weighted_logs.sum()

def gain(left_y, right_y, criterion):
    """Return the impurity reduction achieved by splitting into two parts.

    ``criterion`` is a callable mapping a label array to an impurity value.
    The result is the impurity of the concatenated labels minus the
    size-weighted average impurity of ``left_y`` and ``right_y``.
    """
    parent_y = np.concatenate((left_y, right_y))
    parent_impurity = criterion(parent_y)
    left_term = criterion(left_y) * len(left_y)
    right_term = criterion(right_y) * len(right_y)
    children_impurity = (left_term + right_term) / len(parent_y)
    return parent_impurity - children_impurity

### Decision Tree

In [3]:
class DecisionTreeLeaf():
    """Terminal node of a decision tree.

    Attributes
    ----------
    cl : dict
        Maps every distinct label seen in ``labels`` to its relative
        frequency (count divided by the total number of labels).
    y : label
        The most frequent label in ``labels``; on ties the smallest label
        wins because ``np.unique`` returns labels in sorted order.
    """
    def __init__(self, labels):
        unique, counts = np.unique(labels, return_counts=True)
        p = counts/np.sum(counts)
        dct = dict(zip(unique, p))
        # max(dct) alone would pick the largest *label*; the prediction must be
        # the label with the largest *probability*.
        self.y = max(dct, key=dct.get)
        self.cl = dct

In [4]:
class DecisionTreeNode():
    """Internal (splitting) node of a decision tree.

    Stores the index of the feature used for the split, a threshold that is
    always initialised to 0, and the two child nodes.
    """
    def __init__(self, split_dim, left, right):
        self.split_dim, self.split_value = split_dim, 0
        self.left, self.right = left, right

In [5]:
class DecisionTree:
    """Classification tree trained on a bootstrap sample of the data.

    Candidate splits separate the rows where a feature equals 0 from the rows
    where it equals 1, so the tree is intended for binary (0/1) features; while
    building, rows holding any other value in the chosen column are passed to
    neither child.  Rows that were not drawn into the bootstrap sample are kept
    in ``oob_X`` / ``oob_y`` (the out-of-bag set).
    """

    def __init__(self, X, y,
                 criterion="gini",
                 max_depth=None,
                 min_samples_leaf=1,
                 max_features="auto"):
        """Draw a bootstrap sample from ``(X, y)`` and build the tree on it.

        Parameters
        ----------
        X : np.ndarray, shape (n_samples, n_features)
        y : np.ndarray, shape (n_samples,)
        criterion : {"gini", "entropy"}
            Impurity measure used to score splits.
        max_depth : int or None
            Depth at which a leaf is forced; ``None`` means no limit.
        min_samples_leaf : int
            A split is rejected if either side has fewer rows than this.
        max_features : int or "auto"
            Number of features examined per node; "auto" means
            ``round(sqrt(n_features))``.

        Raises
        ------
        ValueError
            If ``criterion`` is neither "gini" nor "entropy".
        """
        if criterion != 'gini' and criterion != 'entropy':
            raise ValueError('Unknown criterion')

        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf

        if max_features == "auto":
            # int(...) guarantees an integer sample size for np.random.choice
            # regardless of what round() returns for a NumPy scalar.
            self.max_features = int(round(np.sqrt(X.shape[1])))
        else:
            self.max_features = max_features

        # Bootstrap: sample n rows with replacement; the never-drawn rows form
        # the out-of-bag set.
        n_samples = X.shape[0]
        indices = np.arange(n_samples)
        new_indices = np.random.choice(n_samples, n_samples)
        oob_indices = np.setdiff1d(indices, new_indices)

        self.oob_X = X[oob_indices]
        self.oob_y = y[oob_indices]

        self.root = self.build(X[new_indices], y[new_indices])

    def build(self, X, y, curr_depth=0):
        """Recursively build the subtree for the rows ``(X, y)``.

        Returns a ``DecisionTreeLeaf`` when ``curr_depth`` equals ``max_depth``
        or when no examined feature yields a positive gain; otherwise returns a
        ``DecisionTreeNode`` whose children are built one level deeper.
        """
        if curr_depth == self.max_depth:
            return DecisionTreeLeaf(y)

        criterion_fn = gini if self.criterion == 'gini' else entropy

        best_gain = 0
        best_dim = None
        best_left = None
        best_right = None

        # Examine a random subset of features (without repetition).
        for dim in np.random.choice(X.shape[1], self.max_features, replace=False):
            left = np.where(X[:, dim] == 0)
            right = np.where(X[:, dim] == 1)

            if min(y[left].shape[0], y[right].shape[0]) < self.min_samples_leaf:
                continue

            candidate_gain = gain(y[left], y[right], criterion_fn)

            if candidate_gain > best_gain:
                best_gain = candidate_gain
                best_dim = dim
                best_left = left
                best_right = right

        # best_dim stays None exactly when no split had a strictly positive gain.
        if best_dim is None:
            return DecisionTreeLeaf(y)

        next_depth = curr_depth + 1
        left_node = self.build(X[best_left], y[best_left], next_depth)
        right_node = self.build(X[best_right], y[best_right], next_depth)
        return DecisionTreeNode(best_dim, left_node, right_node)

    def predict_proba(self, X):
        """Return, for every row of ``X``, the probability dict of its leaf.

        The result is a 1-D object array aligned with the rows of ``X``; each
        element is the ``cl`` dict (label -> relative frequency) of the leaf
        the row ends up in.
        """
        n = X.shape[0]
        # Append the row number as the last column so the original order can be
        # restored after the rows have been routed through the tree.
        row_ids = np.arange(n).reshape(n, 1)
        data = np.concatenate((np.asarray(X), row_ids), axis=1)

        routed = self.func(data, self.root)
        in_input_order = routed[routed[:, 0].argsort()]

        return in_input_order[:, 1]

    def func(self, data, curr_node):
        """Route ``data`` through the subtree rooted at ``curr_node``.

        ``data`` is the feature matrix with the row number appended as its last
        column.  Returns a 2-column object array: column 0 is the row number,
        column 1 is the probability dict of the leaf that row reached.
        """
        if isinstance(curr_node, DecisionTreeLeaf):
            dicts = np.array([curr_node.cl for i in range(data.shape[0])])
            indices = data[:, -1]
            return np.stack((indices, dicts)).T

        column = data[:, curr_node.split_dim]
        left_part = data[column <= curr_node.split_value]
        right_part = data[column > curr_node.split_value]

        return np.concatenate((self.func(left_part, curr_node.left),
                               self.func(right_part, curr_node.right)))

    def predict(self, X):
        """Return a list with the most probable label for every row of ``X``."""
        proba = self.predict_proba(X)
        return [max(p.keys(), key=lambda k: p[k]) for p in proba]

### Random Forest

In [6]:
class RandomForestClassifier:
    """Majority-vote ensemble of ``DecisionTree`` objects.

    Each tree is constructed from the full training data passed to ``fit``
    and receives this forest's hyper-parameters.
    """

    def __init__(self, criterion="gini", max_depth=None, min_samples_leaf=1, max_features="auto", n_estimators=10):
        """Store the hyper-parameters; no training happens here.

        Raises
        ------
        ValueError
            If ``criterion`` is neither "gini" nor "entropy".
        """
        if criterion != 'gini' and criterion != 'entropy':
            raise ValueError('Unknown criterion')
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf

        # The number of features is unknown until fit(); "auto" is therefore
        # stored as-is and resolved by each DecisionTree from the data it gets,
        # instead of reading a global variable named X.
        self.max_features = max_features
        self.n_estimators = n_estimators

        self.features = None
        self.forest = None

    def fit(self, X, y):
        """Train ``n_estimators`` trees on ``(X, y)``.

        Sets ``features`` to the number of columns of ``X`` and ``forest`` to
        the list of trained trees.
        """
        self.features = X.shape[1]
        self.forest = [DecisionTree(X, y,
                                    criterion=self.criterion,
                                    max_depth=self.max_depth,
                                    min_samples_leaf=self.min_samples_leaf,
                                    max_features=self.max_features)
                       for _ in range(self.n_estimators)]

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties are resolved in favour of the smallest label, because the vote
        counts are indexed by the sorted unique labels and ``argmax`` returns
        the first maximum.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.forest is None:
            raise RuntimeError('fit must be called before predict')

        # One row of votes per tree, one column per sample.
        votes = np.zeros((self.n_estimators, X.shape[0]), dtype=object)
        for i, tree in enumerate(self.forest):
            votes[i] = tree.predict(X)

        # Encode labels as integer codes so that bincount can tally the votes.
        labels, codes = np.unique(votes, return_inverse=True)
        codes = codes.reshape(votes.shape)
        tallies = np.apply_along_axis(np.bincount, 0, codes, None, np.max(codes) + 1)
        return labels[np.argmax(tallies, axis=0)]

### Feature importance

In [7]:
def feature_importance(rfc):
    """Permutation feature importance measured on each tree's out-of-bag rows.

    For every feature and every tree, the feature's column in the tree's
    out-of-bag data is shuffled and the resulting increase in the tree's
    out-of-bag error is accumulated; the sums are divided by
    ``rfc.n_estimators``.

    Parameters
    ----------
    rfc : fitted forest exposing ``features``, ``forest`` and ``n_estimators``;
        every tree must expose ``oob_X``, ``oob_y`` and ``predict``.

    Returns
    -------
    np.ndarray of shape ``(rfc.features,)``
    """
    importance = np.zeros(rfc.features)

    # The unpermuted OOB error of a tree does not depend on the feature being
    # examined, so compute it once per tree rather than once per (feature, tree).
    baseline_errors = [1 - np.mean(tree.oob_y == tree.predict(tree.oob_X))
                       for tree in rfc.forest]

    for feature in range(rfc.features):
        for tree, err_oob in zip(rfc.forest, baseline_errors):
            # Shuffle only this feature's column in a private copy of the OOB rows.
            X_ = np.copy(tree.oob_X)
            np.random.shuffle(X_[:, feature])
            err_oob_j = 1 - np.mean(tree.oob_y == tree.predict(X_))

            importance[feature] += err_oob_j - err_oob

    importance /= rfc.n_estimators
    return importance

        
def most_important_features(importance, names, k=20):
    """Return the names of the ``k`` most important features.

    ``importance`` and ``names`` are parallel sequences; the result is an array
    of names ordered from the highest importance to the lowest.
    """
    descending = np.argsort(importance)[::-1]
    top_k = descending[:k]
    name_array = np.array(names)
    return name_array[top_k]

### Проверка на синтетическом датасете

In [16]:
def synthetic_dataset(size):
    """Generate a toy 3-class dataset with 6 features.

    For row ``i`` the label is ``i % 3``.  Features 2, 3 and 4 are the flags
    ``i % 6 == 3``, ``i % 6 == 0`` and ``i % 3 == 2``; features 0, 1 and 5 are
    random draws from ``np.random.randint(0, 2)``.

    Returns the pair ``(X, y)`` as NumPy arrays.
    """
    rows = []
    labels = []
    for i in range(size):
        # Draw order (feature 0, feature 1, then feature 5) matters for
        # reproducibility under a fixed seed.
        noise_a = np.random.randint(0, 2)
        noise_b = np.random.randint(0, 2)
        rows.append((noise_a, noise_b, i % 6 == 3,
                     i % 6 == 0, i % 3 == 2, np.random.randint(0, 2)))
        labels.append(i % 3)
    return np.array(rows), np.array(labels)

# Sanity check on synthetic data: fit a 100-tree forest and report its accuracy
# on the same rows it was trained on, followed by the out-of-bag permutation
# importance of each of the 6 features.
X, y = synthetic_dataset(1000)
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X, y)
print("Accuracy:", np.mean(rfc.predict(X) == y))
print("Importance:", feature_importance(rfc))

Accuracy: 1.0
Importance: [ 0.00135668  0.00174307  0.16618341  0.16306743  0.31957842 -0.00244449]


### Проверка на датасете VK

In [20]:
def read_dataset(path):
    """Load a CSV file and split it into features and two label vectors.

    The first row of the file is the header.  Rows are shuffled with
    ``random.shuffle``; column 0 becomes ``y_age``, column 1 becomes ``y_sex``
    and the remaining columns become the feature matrix.

    Returns ``(X, y_age, y_sex, feature_names)`` where the first three are
    NumPy arrays and ``feature_names`` is the list of header names from the
    third column onward.
    """
    frame = pandas.read_csv(path, header=0)
    rows = frame.values.tolist()
    random.shuffle(rows)

    ages, sexes, feature_rows = [], [], []
    for row in rows:
        ages.append(row[0])
        sexes.append(row[1])
        feature_rows.append(row[2:])

    feature_names = list(frame.columns)[2:]
    return np.array(feature_rows), np.array(ages), np.array(sexes), feature_names

In [21]:
# Load the VK data and hold out 10% of the rows as a test set.  Both label
# vectors are passed to the same train_test_split call so they stay aligned
# with the rows of X.
X, y_age, y_sex, features = read_dataset("vk.csv")
X_train, X_test, y_age_train, y_age_test, y_sex_train, y_sex_test = train_test_split(X, y_age, y_sex, train_size=0.9)

#### Возраст

In [22]:
%%time
# Age target: fit a 10-tree forest, report accuracy on the held-out test rows,
# then list the 20 features with the largest out-of-bag permutation importance.
rfc = RandomForestClassifier(n_estimators=10)
rfc.fit(X_train, y_age_train)
print("Accuracy:", np.mean(rfc.predict(X_test) == y_age_test))
print("Most important features:")
for i, name in enumerate(most_important_features(feature_importance(rfc), features, 20)):
    print(str(i+1) + ".", name)

Accuracy: 0.7200504413619168
Most important features:
1. ovsyanochan
2. styd.pozor
3. 4ch
4. rhymes
5. mudakoff
6. dayvinchik
7. pravdashowtop
8. rapnewrap
9. iwantyou
10. xfilm
11. reflexia_our_feelings
12. bot_maxim
13. ne1party
14. i_des
15. ne.poverish
16. pixel_stickers
17. leprum
18. memeboizz
19. tumblr_vacuum
20. soverwenstvo.decora
Wall time: 4min 27s


#### Пол

In [23]:
# Sex target: same procedure as for age -- fit a 10-tree forest, report accuracy
# on the held-out test rows, then list the 20 most important features.
rfc = RandomForestClassifier(n_estimators=10)
rfc.fit(X_train, y_sex_train)
print("Accuracy:", np.mean(rfc.predict(X_test) == y_sex_test))
print("Most important features:")
for i, name in enumerate(most_important_features(feature_importance(rfc), features, 20)):
    print(str(i+1) + ".", name)

Accuracy: 0.8575031525851198
Most important features:
1. 40kg
2. girlmeme
3. zerofat
4. mudakoff
5. be.women
6. modnailru
7. igm
8. bon
9. femalemem
10. sh.cook
11. rapnewrap
12. soverwenstvo.decora
13. thesmolny
14. reflexia_our_feelings
15. 9o_6o_9o
16. combovine
17. i_d_t
18. bot_maxim
19. cook_good
20. be.beauty


### CatBoost

In [24]:
# CatBoost baseline on the synthetic dataset.
# NOTE(review): this import sits mid-notebook; the other imports are in the
# first cell.
from catboost import Pool

X, y = synthetic_dataset(1000)
# Only feature 0 is declared categorical.
# NOTE(review): feature 0 is one of the random columns of synthetic_dataset --
# confirm that marking it (and only it) as categorical is intended.
cat_features = [0]

model = CatBoostClassifier(iterations=10,
                           learning_rate=1,
                           depth=2,
                           loss_function='MultiClass')

train_dataset = Pool(data=X,
                     label=y,
                     cat_features=cat_features)

# The evaluation pool is built from the same X and y as the training pool, so
# the accuracy printed below is a training accuracy.
eval_dataset = Pool(data=X,
                    label=y,
                    cat_features=cat_features)

model.fit(train_dataset, silent=True)
y_pred = model.predict(eval_dataset).squeeze()

print("Accuracy:", np.mean(y_pred == y))
print(model.get_feature_importance(data=None,
                       prettified=True,
                       thread_count=-1,
                       verbose=False))

Accuracy: 1.0
  Feature Id  Importances
0          4    48.790074
1          2    29.366018
2          3    21.843610
3          1     0.000298
4          0     0.000000
5          5     0.000000


### VK dataset

In [25]:
# Reload the VK data (read_dataset reshuffles the rows), hold out 10% as a test
# set, then split the remaining rows 80/20 into train and eval parts.
# NOTE(review): the *_eval arrays are not used by any cell visible in this
# notebook -- confirm whether they were meant to be passed to CatBoost as an
# eval set.
X, y_age, y_sex, features = read_dataset("vk.csv")
X_train, X_test, y_age_train, y_age_test, y_sex_train, y_sex_test = train_test_split(X, y_age, y_sex, train_size=0.9)
X_train, X_eval, y_age_train, y_age_eval, y_sex_train, y_sex_eval = train_test_split(X_train, y_age_train, y_sex_train, train_size=0.8)

#### Возраст

In [26]:
# Age target with CatBoost: train on the train part, report accuracy on the
# held-out test rows and print the model's feature importances.
train_dataset = Pool(data=X_train, label=y_age_train)
test_dataset = Pool(data=X_test, label=y_age_test)

model = CatBoostClassifier(loss_function='MultiClass', silent=True)

model.fit(train_dataset)
# squeeze() drops length-1 axes so the predictions compare element-wise with
# the 1-D label vector.
y_pred = model.predict(test_dataset).squeeze()
print("Accuracy:", np.mean(y_pred == y_age_test))
print(model.get_feature_importance(data=None,
                       prettified=True,
                       thread_count=-1,
                       verbose=False))

Accuracy: 0.725094577553594
    Feature Id  Importances
0          135     3.062867
1           38     2.850990
2           75     2.351215
3           23     2.245430
4          101     2.200626
..         ...          ...
144        141     0.141622
145         15     0.137504
146         97     0.124924
147        140     0.113374
148         35     0.000000

[149 rows x 2 columns]


#### Пол

In [27]:
# Reload and re-split the VK data before the sex model.  read_dataset shuffles
# the rows and train_test_split is called without a fixed random_state, so this
# split differs from the one used for the age model.
# NOTE(review): the *_eval arrays are not used by any cell visible in this
# notebook.
X, y_age, y_sex, features = read_dataset("vk.csv")
X_train, X_test, y_age_train, y_age_test, y_sex_train, y_sex_test = train_test_split(X, y_age, y_sex, train_size=0.9)
X_train, X_eval, y_age_train, y_age_eval, y_sex_train, y_sex_eval = train_test_split(X_train, y_age_train, y_sex_train, train_size=0.8)

In [28]:
# Sex target with CatBoost: train on the train part, report accuracy on the
# held-out test rows and print the model's feature importances.
train_dataset = Pool(data=X_train, label=y_sex_train)
test_dataset = Pool(data=X_test, label=y_sex_test)

model = CatBoostClassifier(loss_function='MultiClass', silent=True)

model.fit(train_dataset)
# squeeze() drops length-1 axes so the predictions compare element-wise with
# the 1-D label vector.
y_pred = model.predict(test_dataset).squeeze()
print("Accuracy:", np.mean(y_pred == y_sex_test))
print(model.get_feature_importance(data=None,
                       prettified=True,
                       thread_count=-1,
                       verbose=False))

Accuracy: 0.8852459016393442
    Feature Id  Importances
0           22     3.791130
1           38     3.148981
2           10     2.549417
3           20     2.028881
4          133     1.868518
..         ...          ...
144         49     0.149827
145         56     0.139478
146        107     0.123397
147         70     0.117040
148         85     0.110582

[149 rows x 2 columns]
