In [1]:
import App
import os
import pickle



In [2]:
UserMaker = App.UserMaker
try:
    with open('G:\\Programming\\major\\data\\users.dat', mode='rb') as file:
        UM = pickle.load(file)
except EOFError as E:
    print(E)
DM = App.DatasetMaker(UM)


In [3]:
from models.User import User, UserData, UserType
from models.Tweet import Tweet
from random import shuffle
id = DM.train_users[0].id

all_train_users = DM.train_users
all_test_users = DM.test_users

shuffle(all_train_users)
shuffle(all_test_users)


In [4]:
import numpy as np


In [5]:

def get_user_from_pickle(id):
    with open('G:\\Programming\\major\\data\\processed_data\\' + id + '.dat', 'rb') as file:
        U = pickle.load(file)
    return U

U = get_user_from_pickle(DM.train_users[0].id)

In [6]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.utils import np_utils
from keras.layers.recurrent import LSTM

Using TensorFlow backend.


In [9]:
batch_size=32
num_epochs=20
num_pool=2
num_filters = 128
conv_kernel_width = 3
conv_kernel_height = 200
lstm_output_size= 70
dropout_rate = 0.1


In [10]:
model = Sequential()
model.add(Conv1D(num_filters,
                 conv_kernel_width,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 input_shape=(20,200)))
model.add(MaxPooling1D(pool_size=num_pool))
model.add(LSTM(lstm_output_size))
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(Dense(2))
model.add(Activation('softmax'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [11]:

class DataGenerator:
    
    def __init__(self, dim_x = 20, dim_y = 200, batch_size=32, shuffle=True):
        self.dim_x = dim_x
        self.dim_y = dim_y
        self.batch_size = batch_size
        self.user_batch_size = 100
        self.shuffle = shuffle
        
    def __get_exploration_order(self, user_list):
        indexes = np.arange(len(user_list))
        
        if self.shuffle == True:
            shuffle(indexes)
        
        return indexes
    
    def get_tweets_from_users(self, user_list):
        vector_forms = []
        labels = []
        
        for user in user_list:
            U = get_user_from_pickle(user.id)
            for tweet in U.tweets:
                vector_forms.append(tweet.vector_form)
                labels.append(U.user_type.value)
        
        imax = int(len(labels) / self.batch_size)
        for i in range(imax):
            temp_tweets = vector_forms[i*self.batch_size:(i+1) * self.batch_size]
            temp_labels = labels[i*self.batch_size : (i+1) * self.batch_size]
            
            x, y = self.__data_generation(temp_tweets, temp_labels)
            yield x,y
    
    def generate(self, user_list):
        while 1:
            exploration_order = self.__get_exploration_order(user_list)
            for i in range(len(user_list)):
                temp_users = [user_list[k] for k in exploration_order[i*self.user_batch_size:
                                                                      (i+1)*self.user_batch_size]]
                for item in self.get_tweets_from_users(temp_users):
                    yield item
                
    def __data_generation(self,  tweets, labels):
        
        X = np.empty((self.batch_size, self.dim_x, self.dim_y))
        Y =  np.empty((self.batch_size), dtype=int)
        
        for i, tweet in enumerate(tweets):
            
            X[i, :, :] = np.array(tweet)
            Y[i] = labels[i]
        
        return X, np_utils.to_categorical(Y, 2)

        
    

In [12]:
params = {
    'dim_x' : 20,
    'dim_y' : 200,
    'batch_size' : 32,
    'shuffle' : True
}
batch_size = 32
DM.train_tweets = sum([user.num_tweets for user in DM.train_users])
DM.test_tweets = sum([user.num_tweets for user in DM.test_users])
validation_generator = DataGenerator(**params).generate(DM.test_users)
training_generator = DataGenerator(**params).generate(DM.train_users)

In [49]:
for item in training_generator:
    x,y = item
    print(x.shape, y.shape)

(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 200) (32, 2)
(32, 20, 2

KeyboardInterrupt: 

In [13]:
model.fit_generator(generator = training_generator,
                    steps_per_epoch = DM.train_tweets//batch_size,
                    validation_data = validation_generator,
                    validation_steps = DM.test_tweets//batch_size,
                    epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x155891b6c88>

In [14]:
model.save('G:\\Programming\\major\\trained_model')

<UserType.HUMAN: 0>