In [1]:
import pandas as pd
import numpy as np

from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
from keras.models import load_model
import keras.backend as K

from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold

import seaborn as sns
%pylab inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Populating the interactive namespace from numpy and matplotlib


In [10]:
put_data = pd.read_csv('Putin1.csv', delimiter=';', names=range(4000))
nav_data = pd.read_csv('Navalny1.csv', delimiter=';', names=range(4000))
put_test = pd.read_csv('Putin_test.csv', delimiter=';', names=range(4000))
nav_test = pd.read_csv('Navalny_test.csv', delimiter=';', names=range(4000))

In [64]:
put_num = put_data.shape[0]
put_num

6760

In [65]:
nav_num = nav_data.shape[0]
nav_num

1957

In [17]:
put_test_num = put_test.shape[0]
put_test_num

969

In [18]:
nav_test_num = nav_test.shape[0]
nav_test_num

538

# Training data

In [19]:
data = pd.concat((put_data.iloc[:1957,:], nav_data))

In [20]:
data.shape

(3914, 4000)

In [21]:
data.dropna(axis=1, how='all', inplace=True)
data.fillna(0, inplace=True) # replacing NaNs with zeros
data = data.applymap(int)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1722,1723,1724,1725,1726,1727,1728,1729,1730,1731
0,473365518,20,2,-1,-50498710,-49606842,-48928605,-16389388,0,0,...,0,0,0,0,0,0,0,0,0,0
1,800930,36,2,-1,-106564427,-9758,-160937227,-26419239,-70365191,-25653235,...,0,0,0,0,0,0,0,0,0,0
2,346505303,-1,2,-1,-131188697,-64299218,-123775571,-106823150,-61484302,-151336239,...,0,0,0,0,0,0,0,0,0,0
3,307080832,21,2,-1,-43776317,-57846937,-62965884,-70870022,-23951627,-124305154,...,0,0,0,0,0,0,0,0,0,0
4,404314502,-1,2,5,-113801785,-6726778,-17568841,-94686473,-159187081,-161812969,...,0,0,0,0,0,0,0,0,0,0


In [25]:
publics = np.unique(data.iloc[:, 4:].values) # list of publics
publics.shape

(127235,)

In [26]:
publics = np.delete(publics, np.where(publics == 0)[0]) # deleting id=0
publics.shape

(127234,)

In [27]:
counters = np.zeros(publics.shape[0])
counters.shape

(127234,)

In [28]:
genders = np.unique(data.iloc[:, 2]) # list of genders
genders

array([-1,  0,  1,  2], dtype=int64)

In [29]:
preferences = np.unique(data.iloc[:, 3]) # list of political preferences
preferences

array([-1,  1,  2,  3,  4,  5,  6,  7,  8,  9], dtype=int64)

In [32]:
users_num = data.shape[0] # number of users in dataset
users_num

3914

In [33]:
Ages=[0, 18, 26, 36, 51, 66, 86]

In [34]:
X_ages = np.zeros([users_num, len(Ages)+1], dtype=np.float16)
X_sex = np.zeros([users_num, genders.shape[0]], dtype=np.float16)
X_pref = np.zeros([users_num, preferences.shape[0]], dtype=np.float16)

In [35]:
for i in range(users_num):
    print(str(i)+'/'+str(users_num), end='\r')
    age = data.iloc[i, 1]
    sex = data.iloc[i, 2]
    pref = data.iloc[i, 3]
    publ = data.iloc[i, 4:]
    
    for a in range(len(Ages)): # filling X_ages
        if age < Ages[a]:
            X_ages[i, a] = 1
            break
        elif age >= Ages[len(Ages)-1]:
            X_ages[i, len(Ages)] = 1
            break
    
    for j in range(genders.shape[0]): # filling genders
        if sex == genders[j]:
            X_sex[i, j] = 1
    
    for j in range(preferences.shape[0]): # filling preferences
        if pref == preferences[j]:
            X_pref[i, j] = 1
    
    for j in publ: # counting the number of occurence of each public
        if j == 0:
            break
        publ_index = np.where(publics == j)[0][0]
        counters[publ_index] = counters[publ_index] + 1

3913/3914

In [37]:
def activaton(ind): # for taking into account the order of publics
    return 1/(1+np.exp(0.2*ind)) + 0.5

In [38]:
X_publ = []
X_publ = np.zeros([users_num, publics.shape[0]], dtype=np.float16)
for i in range(users_num):
    print(str(i)+'/'+str(users_num), end='\r')
    publ = data.iloc[i, 4:].values
    for j in range(publ.shape[0]):
        if publ[j] == 0:
            break
        publ_index = np.where(publics == publ[j])[0]
        X_publ[i, publ_index] = activaton(j)

3913/3914

In [39]:
X = np.hstack((X_ages, X_sex, X_pref, X_publ)) # merging all arrays into one
X.shape

(3914, 127256)

In [76]:
# PCA components
n_components = 14

In [41]:
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42) # cross-validation

In [42]:
pca = PCA(n_components=n_components)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=14, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [43]:
from sklearn.externals import joblib # saving pca
_ = joblib.dump(pca, 'pcaPutNav.save')

In [44]:
X_pca = pca.transform(X)

  return umr_sum(a, axis, dtype, out, keepdims)


# Test data

In [51]:
data_test = pd.concat((put_test, nav_test))
num = data_test.shape[0]
num

1507

In [52]:
data_test.dropna(axis=1, how='all', inplace=True)
data_test.fillna(0, inplace=True) # replacing NaNs with zeros
data_test = data_test.applymap(int)
data_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,194,195,196,197,198,199,200,201,202,203
0,330864174,-1,2,-1,-58798248,-35145657,-57846937,-83876626,-84429649,-60397113,...,0,0,0,0,0,0,0,0,0,0
1,217809018,28,2,-1,-142754704,-66621324,-70204174,-54154078,-77963997,-23064236,...,0,0,0,0,0,0,0,0,0,0
2,20671231,30,2,-1,-133742557,-40836944,-22798006,-60443553,-88759692,-125303834,...,0,0,0,0,0,0,0,0,0,0
3,366089932,15,1,-1,-116871226,-133791433,-83985614,-126506885,-142918020,-122181832,...,0,0,0,0,0,0,0,0,0,0
4,399763476,-1,2,-1,-39399586,-135646310,-58170807,-100954660,-15326149,-111205860,...,0,0,0,0,0,0,0,0,0,0


In [53]:
X_ages = np.zeros([num, len(Ages)+1], dtype=np.float16)
X_sex = np.zeros([num, genders.shape[0]], dtype=np.float16)
X_pref = np.zeros([num, preferences.shape[0]], dtype=np.float16)
X_publ = np.zeros([num, publics.shape[0]], dtype=np.float16)

In [54]:
for i in range(num):
    print(str(i)+'/'+str(num), end='\r')
    age = data_test.iloc[i, 1]
    sex = data_test.iloc[i, 2]
    pref = data_test.iloc[i, 3]
    publ = data_test.iloc[i, 4:]
    
    for a in range(len(Ages)): # filling X_ages
        if age < Ages[a]:
            X_ages[i, a] = 1
            break
        elif age >= Ages[len(Ages)-1]:
            X_ages[i, len(Ages)] = 1
            break
    
    for j in range(genders.shape[0]): # filling genders
        if sex == genders[j]:
            X_sex[i, j] = 1
    
    for j in range(preferences.shape[0]): # filling preferences
        if pref == preferences[j]:
            X_pref[i, j] = 1
    
    for j in range(publ.shape[0]):
        if publ.values[j] == 0:
            break
        publ_index = np.where(publics == publ.values[j])[0]
        X_publ[i, publ_index] = activaton(j)

1506/1507

In [55]:
X_test = np.hstack((X_ages, X_sex, X_pref, X_publ)) # merging all arrays into one
X_test.shape

(1507, 127256)

In [56]:
X_test_pca = pca.transform(X_test)

In [62]:
X_test_pca.shape

(1507, 14)

In [57]:
np.save('trainPutNavPCA.npy', X_pca)
np.save('testPutNavPCA.npy', X_test_pca)

In [67]:
y = np.append(np.zeros(nav_num), np.ones(nav_num))
y_test = np.append(np.zeros(put_test_num), np.ones(nav_test_num))

In [68]:
y.shape

(3914,)

In [127]:
# NN hyperparameters
batch_size = 600
epochs = 200
dropout_size = 0.5

# hidden layers dimensions
hidden_first = 50
hidden_second = 25
hidden_third = 5

In [128]:
model = Sequential()
model.add(Dense(hidden_first, input_dim=n_components, activation='relu'))
Dropout(dropout_size)
model.add(Dense(hidden_second, activation='relu'))
Dropout(dropout_size/2)
model.add(Dense(hidden_third, activation='relu'))
Dropout(dropout_size/3)
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_pca, y, batch_size=batch_size, epochs=epochs, shuffle=True, 
          validation_data=(X_test_pca[430:, :], y_test[430:]))

Acc = []
Tpr = []
Tnr = []
Ppv = []
Npv = []
F1 = []

y_pred = model.predict(X_test_pca[430:, :]).round().reshape(-1)
tp = (y_pred * y_test[430:]).sum() # predicted - 1, actual - 1
tn = ((1-y_pred) * (1-y_test[430:])).sum() # predicted - 0, actual - 0
fp = (y_pred * (1-y_test[430:])).sum() # predicted - 1, actual - 0
fn = ((1-y_pred) * y_test[430:]).sum() # predicted - 0, actual - 1

acc = (tp + tn) / (tp + tn + fp + fn)
tpr = tp / (tp + fn)
tnr = tn / (tn + fp)
ppv = tp / (tp + fp)
npv = tn / (tn + fn)
f1 = (2 * tpr * ppv) / (tpr + ppv)

Acc.append(int(acc * 1000) / 10.0)
Tpr.append(int(tpr * 1000) / 10.0)
Tnr.append(int(tnr * 1000) / 10.0)
Ppv.append(int(ppv * 1000) / 10.0)
Npv.append(int(npv * 1000) / 10.0)
F1.append(int(f1 * 1000) / 1000.0)

Acc = np.array(Acc).astype(float)
Tpr = np.array(Tpr).astype(float)
Tnr = np.array(Tnr).astype(float)
Ppv = np.array(Ppv).astype(float)
Npv = np.array(Npv).astype(float)
F1 = np.array(F1).astype(float)

Train on 3914 samples, validate on 1077 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200


Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200


Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200


Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


array([0.785])

In [109]:
Acc

array([81.6])

In [111]:
F

[0.715,
 0.79,
 0.738,
 0.74,
 0.755,
 0.741,
 0.793,
 0.753,
 0.752,
 0.762,
 0.753,
 0.745,
 0.742,
 0.758,
 0.754,
 0.752,
 0.755,
 0.753,
 0.742,
 0.759,
 0.744,
 0.762,
 0.753,
 0.767,
 0.753,
 0.758,
 0.762,
 0.755,
 0.764,
 0.768,
 0.763,
 0.766,
 0.762,
 0.764,
 0.779,
 0.771,
 0.786,
 0.752,
 0.763,
 0.769,
 0.767,
 0.758,
 0.776,
 0.751,
 0.776,
 0.791,
 0.776,
 0.769]

In [112]:
accuracy

[77.2,
 81.8,
 78.5,
 78.8,
 79.6,
 78.7,
 82.1,
 79.4,
 79.4,
 80.1,
 79.5,
 79.1,
 78.8,
 79.8,
 79.6,
 79.5,
 79.6,
 79.5,
 78.8,
 80.0,
 78.9,
 80.1,
 79.5,
 80.5,
 79.4,
 79.9,
 80.1,
 79.6,
 80.2,
 80.5,
 80.1,
 80.4,
 80.1,
 80.2,
 81.2,
 80.6,
 81.8,
 79.4,
 80.1,
 80.5,
 80.5,
 79.8,
 81.0,
 79.4,
 81.0,
 81.9,
 80.9,
 80.5]