## Import libraries

In [1]:
import pandas as pd
from nltk.corpus import stopwords as s 
import re
from nltk.tokenize import word_tokenize  
from scipy.stats import chi2_contingency 
from nltk.stem import WordNetLemmatizer
import keras
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding
import numpy as np
from keras.preprocessing.text import one_hot

## Data exploration and cleaning

In [2]:
# Load the raw train/test splits from the working directory.
traindata, testdata = pd.read_csv("train.csv"), pd.read_csv("test.csv")
# Keep an untouched copy of the full training frame for later exploration.
data_copy = traindata.copy()
print(traindata.shape)
traindata.head()

In [3]:
# Number of rows per target class, most frequent first.
traindata.label.value_counts()

1    3796
0    3599
Name: label, dtype: int64

#### Conclusion: the two classes are roughly balanced (3796 vs. 3599 rows), so there is no class imbalance problem

In [4]:
# Count NaN entries per column, largest first.
# NOTE(review): a later cell counts 2342 '?' entries in alchemy_category, so
# missing values appear to be encoded as '?' rather than NaN and would not be
# counted here - confirm before relying on the all-zero result.
missing_per_column = traindata.isna().sum()
missing_per_column.sort_values(ascending=False)

label                             0
frameTagRatio                     0
alchemy_category                  0
alchemy_category_score            0
avglinksize                       0
commonlinkratio_1                 0
commonlinkratio_2                 0
commonlinkratio_3                 0
commonlinkratio_4                 0
compression_ratio                 0
embed_ratio                       0
framebased                        0
hasDomainLink                     0
spelling_errors_ratio             0
html_ratio                        0
image_ratio                       0
is_news                           0
lengthyLinkDomain                 0
linkwordscore                     0
news_front_page                   0
non_markup_alphanum_characters    0
numberOfLinks                     0
numwords_in_url                   0
parametrizedLinkRatio             0
boilerplate                       0
dtype: int64

In [10]:
#choose features: boilerplate and label
# Select by column name instead of position so the cell is self-describing and
# can be re-run safely. testdata stays a one-column DataFrame (not a Series) so
# that testdata['boilerplate'] keeps working downstream. .copy() detaches the
# selections from the raw frames, avoiding SettingWithCopyWarning on later edits.
traindata = traindata[['boilerplate', 'label']].copy()
testdata = testdata[['boilerplate']].copy()

In [5]:
#remove stopwords, punctuations and numbers

def clean_data(data):
    """Return a copy of ``data`` with its 'boilerplate' column cleaned.

    Each text has the literal JSON keys ``"title"``, ``"body"`` and ``"url"``
    removed, then punctuation stripped, then English stopwords dropped after
    tokenisation, and finally all digits deleted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a string column named 'boilerplate'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned 'boilerplate' column; the input frame is
        left untouched, so the function never writes through a view (the cause
        of the SettingWithCopyWarning raised by chained ``.loc`` assignment).
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    # NOTE(review): the comparison is case-sensitive, so capitalised stopwords
    # such as 'The' are kept - confirm whether lower-casing is wanted.
    stopwords = set(s.words('english'))
    words_to_remove = ['''"title"''', '''"body"''', '''"url"''']

    def _clean_text(text):
        # Drop the JSON key names that wrap every boilerplate record.
        for w in words_to_remove:
            text = text.replace(w, "")
        # Strip punctuation (anything that is neither a word char nor whitespace).
        text = re.sub(r'[^\w\s]', "", text)
        word_tokens = word_tokenize(text)
        text = " ".join(tok for tok in word_tokens if tok not in stopwords)
        # Digits are removed last, after tokens have been re-joined.
        return re.sub(r'\d', "", text)

    cleaned = data.copy()
    # fillna("") lets missing texts pass through as empty strings instead of
    # raising AttributeError on a float NaN.
    cleaned['boilerplate'] = cleaned['boilerplate'].fillna("").map(_clean_text)
    return cleaned

In [33]:
# Run the text-cleaning step on the training frame.
traindata = traindata.pipe(clean_data)

In [6]:
# Run the same text-cleaning step on the test data.
testdata = testdata.pipe(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [18]:
traindata.to_csv('C:/Users/Gunisha Chaturvedi/Dropbox/My PC (LAPTOP-1CT80JS5)/Documents/internships/traindata.csv')

In [19]:
traindata = pd.read_csv('C:/Users/Gunisha Chaturvedi/Dropbox/My PC (LAPTOP-1CT80JS5)/Documents/internships/traindata.csv')

In [20]:
# NOTE(review): `r` is not defined in any cell visible in this notebook, so
# this raises NameError on a fresh kernel - confirm which frame was meant
# (the recorded output was (3171, 1)) or remove the cell.
r.shape

(3171, 1)

### Check the association between alchemy_category and label using a chi-squared test of independence

In [9]:
# Count rows whose alchemy_category is the '?' placeholder.
# data_copy (the untouched copy of the raw training frame) is used because
# traindata has been reduced to the boilerplate/label columns by this point.
(data_copy['alchemy_category'] == '?').sum()

2342

In [64]:
# Label-by-category contingency table, built from the untouched raw copy of
# the training data (`data` is not defined anywhere in the notebook).
pd.crosstab(data_copy['label'], data_copy['alchemy_category'])

alchemy_category,?,arts_entertainment,business,computer_internet,culture_politics,gaming,health,law_crime,recreation,religion,science_technology,sports,unknown,weather
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1166,591,254,223,186,48,216,18,388,42,157,302,4,4
1,1176,350,626,73,157,28,290,13,841,30,132,78,2,0


In [74]:
# h0 = no relationship bw variables

# Build the contingency table from the raw data instead of re-typing the
# printed counts, so it cannot drift from the data. The '?' (unknown category)
# column is excluded from the test.
cont_table = pd.crosstab(
    data_copy['label'], data_copy['alchemy_category']
).drop(columns='?', errors='ignore')
stat, p, dof, expected = chi2_contingency(cont_table)

alpha = 0.05
print("p value is " + str(p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (H0 holds true)')

p value is 3.857352828735912e-124
Dependent (reject H0)


#### Observation: the length of the boilerplate text varies a lot

In [27]:
#average length of boilerplate text from training data
# Lengths are measured in characters (len of each string). Iterating over the
# column directly avoids depending on the index being 0..n-1.
length = [len(text) for text in traindata['boilerplate']]

# Average the `length` list built above (previously this averaged `l`, the
# unrelated alchemy_category list).
sum(length) / len(length)

1956.3871534820826

## RNN Model

In [8]:
# Inputs are the cleaned boilerplate strings; the target is the binary label.
xtrain, ytrain = traindata['boilerplate'], traindata['label']
xtest = testdata['boilerplate']

# vocab_size: size of the hashing vocabulary passed to one_hot
# embed_size: width of the embedding vectors
# max_length: number of tokens every document is padded/truncated to
# NOTE(review): 1956 equals the mean *character* length computed earlier in the
# notebook, while pad_sequences counts tokens - confirm this is intended.
vocab_size, embed_size, max_length = 20000, 1000, 1956

# Encode every document as a list of integer word indices.
encoded_docs_train = [one_hot(doc, vocab_size) for doc in xtrain]
encoded_docs_test = [one_hot(doc, vocab_size) for doc in xtest]

# Bring all sequences to the same length so they can be batched.
xtrain = tf.keras.preprocessing.sequence.pad_sequences(encoded_docs_train, maxlen=max_length)
xtest = tf.keras.preprocessing.sequence.pad_sequences(encoded_docs_test, maxlen=max_length)

# Embedding -> LSTM -> single sigmoid unit for binary classification.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embed_size, input_length=max_length),
    tf.keras.layers.LSTM(units=80, activation='tanh'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

model.summary()

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(xtrain, ytrain, epochs=5, batch_size=128)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1956, 1000)        20000000  
_________________________________________________________________
lstm (LSTM)                  (None, 80)                345920    
_________________________________________________________________
dense (Dense)                (None, 1)                 81        
Total params: 20,346,001
Trainable params: 20,346,001
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x266655ab320>

In [None]:
# Sequential.predict_classes was removed from TensorFlow (2.6+). For a single
# sigmoid output the documented replacement is to threshold the predicted
# probabilities at 0.5, which yields the same int32 class array of shape (n, 1).
yresult = (model.predict(xtest) > 0.5).astype("int32")