In [95]:
%matplotlib inline
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy as np
import pickle

In [96]:
from keras.models import Model
from keras import layers
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.optimizers import Adam

In [98]:
train = pd.read_csv('train.csv', low_memory=False)
train_id = pd.read_csv('train.csv', low_memory=False, usecols = ['id'])
test_id  = pd.read_csv('test.csv', low_memory=False, usecols = ['id'])
embedding_matrix = pd.read_csv('data_process/embedding_matrix').values

In [50]:
totaldata = pd.read_csv('totaldata.csv',low_memory=False)

In [51]:
totaldata.shape

(10766, 51)

In [52]:
train = totaldata[totaldata['target'] != 2]
test = totaldata[totaldata['target'] == 2]

In [53]:
train_target = train.loc[train['text'].drop_duplicates().index,'target']

In [54]:
del totaldata['id']
del train['id']
del test['id']
del train['target']
del test['target']
del train['text']
del test['text']
del totaldata['text']

In [55]:
scaled = MinMaxScaler()

In [56]:
scaled_train = scaled.fit_transform(train)
scaled_test = scaled.fit_transform(test)

In [57]:
train_features = pd.DataFrame(scaled_train, columns= train.columns)
test_features = pd.DataFrame(scaled_test, columns= test.columns)

In [58]:
num_words = embedding_matrix.shape[0]

In [73]:
features_input = layers.Input(shape=(48,), name="features")
x = layers.Dense(128, activation='relu')(features_input)
x = layers.Dropout(0.2)(x)
x = layers.Dense(128, activation='relu')(features_input)
features_output = layers.Dropout(0.2)(x)

emb_input = layers.Input(shape=(None,), name="embedd")
x= layers.Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=100,trainable=False)(emb_input)
x = layers.SpatialDropout1D(0.08)(x)
x = layers.Conv1D(128, 7, padding="valid", activation='relu', strides=4)(x)
x = layers.Dropout(0.08)(x)
x = layers.Conv1D(128, 7, padding="valid", activation='relu', strides=4)(x)
emb_output = layers.Dropout(0.1)(x)
emb_output = layers.GlobalMaxPooling1D()(x)

x = layers.concatenate([features_output, emb_output])
x = layers.Dense(64,activation='relu')(x)
x = layers.Dropout(0.12)(x)
x = layers.Dense(16,activation='relu')(x)
Tx = layers.Dropout(0.2)(x)
conv1d = layers.Dense(1, activation='sigmoid')(x)

model4 = Model(
    inputs=[features_input, emb_input],
    outputs=[conv1d],
)
optimzer=Adam(learning_rate=1e-5)
model4.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])

In [99]:
pickle_in = open("data_process/tweet_pad.pickle","rb")
tweet_pad = pickle.load(pickle_in)

In [75]:
embedd_train =tweet_pad[:train.shape[0]]
embedd_test=tweet_pad[train.shape[0]:]

In [76]:
embedd_train.shape, embedd_test.shape, train_features.shape, test_features.shape

((7503, 100), (3263, 100), (7503, 48), (3263, 48))

In [77]:
embedd_test = pd.DataFrame(embedd_test)
embedd_train = pd.DataFrame(embedd_train)

In [78]:
test = pd.concat([embedd_test,test_features],axis=1)
train = pd.concat([embedd_train,train_features],axis=1)

In [79]:
train.isna().any().any() , test.isna().any().any()

(False, False)

In [80]:
train.shape

(7503, 148)

In [81]:
X_train,X_test,y_train,y_test=train_test_split(train.values,train_target.values,test_size=0.30)

In [82]:
X_train.shape

(5252, 148)

In [83]:
X_train[:,100:].shape

(5252, 48)

In [84]:
history=model4.fit(
    {"features": X_train[:,100:], "embedd": X_train[:,:100]},
    y_train,
    batch_size=10,
    epochs=30,
    validation_data=(
        {"features": X_test[:,100:], "embedd": X_test[:,:100]},
        y_test
    ),
    verbose=1
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [85]:
model4.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedd (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 100)    1991900     embedd[0][0]                     
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, None, 100)    0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, None, 128)    89728       spatial_dropout1d_2[0][0]        
_______________________________________________________________________________________

In [100]:
predict = model4.predict([test_features,embedd_test])
predict=np.round(predict).astype(int).reshape(3263)
submit = pd.read_csv('test.csv', usecols= ['id'])
submit['target'] = predict
submit.to_csv('submits/submit7.csv',index=False)

In [94]:
submit['target'].value_counts()

0    2007
1    1256
Name: target, dtype: int64