In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding , Bidirectional , GlobalMaxPool1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import text_to_word_sequence
import copy
import re
from nltk import WordNetLemmatizer
%matplotlib inline

Using TensorFlow backend.


In [2]:
import preprocessing_ap

In [3]:
df_train=pd.read_csv('train_data.csv')
df_test=pd.read_csv('test_data.csv')

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6251 entries, 0 to 6250
Data columns (total 4 columns):
url         6251 non-null object
label       6251 non-null int64
corpus      6251 non-null object
dateTime    6251 non-null object
dtypes: int64(1), object(3)
memory usage: 195.4+ KB


In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1563 entries, 0 to 1562
Data columns (total 4 columns):
url         1563 non-null object
label       1563 non-null int64
corpus      1563 non-null object
dateTime    1563 non-null object
dtypes: int64(1), object(3)
memory usage: 48.9+ KB


In [39]:
df_train['corpus']=df_train['corpus'].apply(eval).apply(preprocessing_ap.listToString)
df_test['corpus']=df_test['corpus'].apply(eval).apply(preprocessing_ap.listToString)

In [40]:
X_train=df_train['corpus'].values
X_test=df_test['corpus'].values
Y_train = np.asarray(df_train['label'].values).astype('float32')
Y_test = np.asarray(df_test['label'].values).astype('float32')

In [41]:
max_words = 20000
max_len = 150
#It is the process of separating each word in a text as a unit and you can later
#you use the tokenize data for things like term frequency and word clouds
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
#******************************
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [42]:
sequences_matrix

array([[   2,  201, 2228, ..., 1609,  757,  590],
       [   0,    0,    0, ...,    0,    0, 1404],
       [   0,    0,    0, ..., 2544, 1993, 3281],
       ...,
       [2250, 5202, 3746, ..., 1571,  198,   87],
       [   8,   85, 1311, ...,  117,   30,    3],
       [1292, 3706, 1966, ...,  697,  346,  378]])

In [43]:
def MyBaseline_Model(maxlen,max_features):
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(100, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(100, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

In [44]:
model = MyBaseline_Model(max_len,max_words)
model.summary() 

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 128)          2560000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 150, 200)          183200    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 200)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0   

In [45]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2
          #,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)]
         )

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 5000 samples, validate on 1251 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a78d486f98>

In [46]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [47]:
test_sequences_matrix

array([[   74,  2488,   417, ...,   597,   935,    70],
       [    0,     0,     0, ...,   278,   258,   613],
       [    0,     0,     0, ...,  3663,   974,  1137],
       ...,
       [    0,     0,     0, ...,     0,     0,     0],
       [    9,  2839,   205, ...,   303,   224,   116],
       [ 1051, 12894,  3408, ...,  8110,  4044,  5072]])

In [48]:
accr = model.evaluate(test_sequences_matrix,Y_test)



In [49]:
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.461
  Accuracy: 0.836


In [50]:
predictions=model.predict(test_sequences_matrix)

In [51]:
predictions.shape

(1563, 1)

In [52]:
predictions=predictions.reshape(1563,)
predictions=list(map(lambda x: 1 if x > 0.5 else 0 , predictions))

In [53]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.90      0.77      0.83       800
         1.0       0.79      0.91      0.84       763

    accuracy                           0.84      1563
   macro avg       0.84      0.84      0.84      1563
weighted avg       0.84      0.84      0.83      1563



In [54]:
import scrapping_ap as sap
import preprocessing_ap as pap

In [57]:
def predict(url):
    corpus=sap.scrap_raw_text(url)
    corpus=pap.text_preprocessing(corpus)
    tokenizer=pap.PatternTokenizer()
    corpus=' '.join(corpus)
    corpus=tokenizer.process_text(corpus)
    corpus=[corpus]
    c = tok.texts_to_sequences(corpus)
    c = sequence.pad_sequences(c,maxlen=max_len)
    prediction= model.predict(c)

In [58]:
predict('https://www.xvideos.com')

scraping from https://www.xvideos.com




  soup = BeautifulSoup(page)


In [59]:
url="https://www.xvideos.com"
corpus=sap.scrap_raw_text(url)
corpus=pap.text_preprocessing(corpus)
tokenizer=pap.PatternTokenizer()
corpus=' '.join(corpus)
corpus=tokenizer.process_text(corpus)

scraping from https://www.xvideos.com




  soup = BeautifulSoup(page)


In [60]:
corpus

['html',
 'free',
 'porn',
 'video',
 'xvideos',
 'com',
 'ie',
 'endif',
 'xvideos',
 'com',
 'search',
 'best',
 'free',
 'porn',
 'site',
 'best',
 'free',
 'porn',
 'site',
 'search',
 'min',
 'view',
 'min',
 'k',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'k',
 'view',
 'min',
 'view',
 'min',
 'k',
 'view',
 'min',
 'm',
 'view',
 'min',
 'k',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'k',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'k',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'k',
 'view',
 'min',
 'k',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'min',
 'view',
 'sec',
 'k',
 'view',
 'min',
 'k',
 'view',
 'min',
 'view',
 'min',
 'k',
 'view',
 'min',
 'k',
 'view',
 'min',
 'k',
 'view',
 'min',
 'view',
 'min',
 'k',


In [43]:
corpus=[corpus]

In [61]:
c = tok.texts_to_sequences(corpus)
c = sequence.pad_sequences(c,maxlen=max_len)

In [62]:
model.predict(c)

array([[0.6457365 ],
       [0.86379725],
       [0.9799668 ],
       [0.8671093 ],
       [0.7482274 ],
       [0.72045314],
       [0.5577022 ],
       [0.7022901 ],
       [0.7482274 ],
       [0.72045314],
       [0.57713723],
       [0.211813  ],
       [0.86379725],
       [0.9799668 ],
       [0.64754987],
       [0.211813  ],
       [0.86379725],
       [0.9799668 ],
       [0.64754987],
       [0.57713723],
       [0.7253252 ],
       [0.609673  ],
       [0.7253252 ],
       [0.8814692 ],
       [0.609673  ],
       [0.7253252 ],
       [0.609673  ],
       [0.7253252 ],
       [0.609673  ],
       [0.7253252 ],
       [0.609673  ],
       [0.7253252 ],
       [0.609673  ],
       [0.7253252 ],
       [0.8814692 ],
       [0.609673  ],
       [0.7253252 ],
       [0.609673  ],
       [0.7253252 ],
       [0.8814692 ],
       [0.609673  ],
       [0.7253252 ],
       [0.34388524],
       [0.609673  ],
       [0.7253252 ],
       [0.8814692 ],
       [0.609673  ],
       [0.725