# Text classification of tweets:

In [1]:
from __future__ import print_function

# The star imports stay ahead of everything else so that the explicit imports
# below keep taking precedence over any same-named symbol they bring in.
from implementations import *
from cnn import *
import csv
import glob
import os
import random

import numpy as np

import gensim
from gensim.models import Word2Vec, Doc2Vec, FastText, KeyedVectors
import gensim.parsing.preprocessing as prep

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_validate
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn import datasets, svm
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, concatenate, Activation, Dropout, Flatten
from keras.models import Model, load_model, Sequential
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


## Create word embedding based on full set:

In [2]:
# Word-embedding hyper-parameters. `features` is read again by the CNN cells
# below, where it sizes the embedding matrix and the Embedding layer.
features = 200 # number of features in word embedding
epoch = 10 # number of epochs for word embedding

# One-off training of the embedding, left disabled. Later cells read the saved
# file 'model_tot_200.word2vec' from disk, so that file must already exist.
#all_tweets_full, y_full = process_set('train_neg_full.txt', 'train_pos_full.txt') # corpus for word embedding
#model_tot_200 = createWordEmbedding(all_tweets_full, features, epoch) # creates word embedding 
#model_tot_200.save('model_tot_200.word2vec') # saves word embedding
#model_tot = Word2Vec.load('model_tot_200.word2vec') # loads word embedding

# Cross-validation for word2vec + LR and TF-IDF + LR, plus a one-epoch CNN run:

In [5]:
# Disabled baselines, kept for reference. The first one needs `model_tot`,
# whose loading line is commented out in the embedding cell above.
# LR_with_w2v(model_tot, 'test_data.txt', 'train_neg.txt', 'train_pos.txt') # LR and word2vec
# tfidf('test_data.txt', 'train_neg_full.txt', 'train_pos_full.txt') # TF-IDF with full set

# NOTE(review): CNN is not defined in this notebook; presumably it comes from
# `from cnn import *`. Judging by the log it prints (1 epoch, 20000 validation
# samples), the positional arguments 512, 1, 20000 look like batch size, epoch
# count and validation-set size. Confirm against the helper's signature.
CNN('model_tot_200.word2vec', 'train_neg.txt', 'train_pos.txt', 512, 'test_data.txt', 1, 20000, features)

start
Train on 180000 samples, validate on 20000 samples
Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.80280, saving model to cnn_result01_0.80.hdf5


# CNN

In [None]:
# Load the full labelled corpus and hold out 10% of it for validation.
list_all_tweets, y_full = easyProcess_set('train_neg_full.txt', 'train_pos_full.txt')
SEED = 42
x_train, x_validation, y_train, y_validation = train_test_split(
    list_all_tweets, y_full, test_size=0.1, random_state=SEED)

# Pre-trained word vectors, flattened into a plain {token: vector} lookup.
model_tot_k = KeyedVectors.load('model_tot_200.word2vec')
embed_index = {token: model_tot_k.wv[token] for token in model_tot_k.wv.vocab}

# Integer-encode the tweets. The tokenizer is fitted on the training split
# only and keeps at most `word_count` distinct word indices.
word_count = 100000
tokenizer = Tokenizer(num_words=word_count)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

# Padded sequence length: whitespace-token count of the longest training
# tweet, plus a margin of 2.
# NOTE(review): this counts `str.split()` tokens, not tokenizer tokens; if the
# tokenizer ever yields more tokens than that for a tweet, pad_sequences will
# truncate it. Confirm the margin is sufficient.
length = max(len(tweet.split()) for tweet in x_train) + 2

x_train_seq = pad_sequences(sequences, maxlen=length)

# The validation split goes through the same tokenizer and padding.
sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=length)

# Row i of the matrix holds the pre-trained vector of the word with tokenizer
# index i; words missing from the embedding keep an all-zero row.
embed_matrix = np.zeros((word_count, features))
for token, row in tokenizer.word_index.items():
    if row < word_count and token in embed_index:
        embed_matrix[row] = embed_index[token]

# Half the embedding width.
hFeatures = int(features * 0.5)

In [None]:
def _conv_branch(embedded, kernel_size):
    """Convolve `embedded` with a `kernel_size`-wide window, then max-pool over the sequence axis."""
    convolved = Conv1D(filters=hFeatures, kernel_size=kernel_size, padding='valid',
                       activation='relu', strides=1)(embedded)
    return GlobalMaxPooling1D()(convolved)


# Token ids -> embedding vectors, initialised from the pre-trained matrix and
# fine-tuned during training (trainable=True).
tweet_input = Input(shape=(length,), dtype='int32')
tweet_encoder = Embedding(
    word_count, features, weights=[embed_matrix], input_length=length,
    trainable=True)(tweet_input)

# Three parallel convolution branches with window sizes 2, 4 and 6, all fed
# by the same embedding output.
k2, k4, k6 = [_conv_branch(tweet_encoder, size) for size in (2, 4, 6)]

# Classifier head: concatenated branch features -> dense -> dropout -> one
# sigmoid unit.
merged = concatenate([k2, k4, k6], axis=1)
merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)

model = Model(inputs=[tweet_input], outputs=[output])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Write a checkpoint whenever validation accuracy reaches a new maximum; the
# epoch number and the score are interpolated into the file name.
filepath = "CNN128_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1)

# Train for four epochs, scoring on the held-out validation split after each.
model.fit(
    x=x_train_seq,
    y=y_train,
    validation_data=(x_val_seq, y_validation),
    epochs=4,
    batch_size=512,
    callbacks=[checkpoint])

In [None]:
# Score the in-memory model (its weights are those left by the last training
# epoch) on the validation split; returns the loss followed by the accuracy
# metric it was compiled with.
model.evaluate(x=x_val_seq, y=y_validation)

In [None]:
# Reload the best checkpoint written during training. Without this assignment
# `loaded_CNN_model` is never defined and this cell (and every later cell that
# uses it) fails with NameError on a fresh kernel.
# ModelCheckpoint(save_best_only=True) only writes a file when val_acc
# improves, so the most recently modified matching file is the best model of
# the latest training run.
checkpoint_paths = glob.glob('CNN128_best_weights.*.hdf5')
if not checkpoint_paths:
    raise IOError(
        "No 'CNN128_best_weights.*.hdf5' checkpoint found; run the training cell first.")
best_checkpoint_path = max(checkpoint_paths, key=os.path.getmtime)
print('Loading checkpoint:', best_checkpoint_path)

loaded_CNN_model = load_model(best_checkpoint_path)
loaded_CNN_model.summary()

In [None]:
# Score `loaded_CNN_model` on the same validation split that the in-memory
# `model` is evaluated on, so the two results are directly comparable.
# NOTE(review): requires `loaded_CNN_model` to exist in the kernel.
loaded_CNN_model.evaluate(x=x_val_seq, y=y_validation)

### Predict test set

In [None]:
# Read the unlabelled test tweets and preprocess them with `easyProcess`.
testd_tweets = easyProcess(open_file('test_data.txt'))

# Encode with the tokenizer fitted on the training split and pad to the
# length the network was built for.
sequences_test = tokenizer.texts_to_sequences(testd_tweets)
x_test_seq = pad_sequences(sequences_test, maxlen=length)
ycnn02 = loaded_CNN_model.predict(x_test_seq)

# Round the model outputs to 0/1, then relabel every 0 as -1 before the
# results are written out.
ycnn02_rounded = np.around(ycnn02).flatten()
ycnn02_rounded = np.where(ycnn02_rounded == 0, -1, ycnn02_rounded)
save_csv('test_resultCNN.csv', ycnn02_rounded)

print(ycnn02, ycnn02_rounded, sep='\n')