In [69]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import string
from keras.preprocessing.text import Tokenizer 
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Flatten, Conv1D, MaxPooling1D, Dropout

In [41]:
data = pd.read_csv('./updated_train.csv')
data.drop(columns=['ID'],axis=1, inplace=True)
data.columns

Index(['text', 'target'], dtype='object')

In [42]:
data.head()

Unnamed: 0,text,target
0,The bitcoin halving is cancelled due to,1
1,MercyOfAllah In good times wrapped in its gran...,0
2,266 Days No Digital India No Murder of e learn...,1
3,India is likely to run out of the remaining RN...,1
4,In these tough times the best way to grow is t...,0


In [43]:
def process_test(doc):
    tokens = doc.split()
    re_punc = re.compile(' [%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub(' ' , w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english')) 
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)

In [44]:
data['text'] = data['text'].apply(lambda x: process_test(x))

In [45]:
data.head()

Unnamed: 0,text,target
0,The bitcoin halving cancelled due,1
1,MercyOfAllah In good times wrapped granular de...,0
2,Days No Digital India No Murder learning No on...,1
3,India likely run remaining RNA kits essential ...,1
4,In tough times best way grow learn case teach ...,0


In [32]:
def create_tokens(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [46]:
docs = []
labels = []
for idx,row in data.iterrows():
    docs.append(row['text'])
    labels.append(row['target'])

In [49]:
X_train, X_test, y_train, y_test = train_test_split(docs, labels, test_size=0.3, random_state=42)

# DeepLearning

In [52]:
tokenizer = create_tokens(X_train)

In [53]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

11055

In [54]:
max_length = max([len(s.split()) for s in X_train])

In [56]:
def encode_docs(tokenizer, max_length, docs):
    encoded = tokenizer.texts_to_sequences(docs)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post' )
    return padded

In [63]:
train_set = encode_docs(tokenizer, max_length, X_train)

In [94]:
def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 200, input_length=max_length))
    model.add(Dropout(0.5))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.8))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

In [95]:
model = define_model(vocab_size, max_length)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 48, 200)           2211000   
_________________________________________________________________
dropout_9 (Dropout)          (None, 48, 200)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 41, 32)            51232     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 20, 32)            0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 20, 32)            0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 640)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)               

In [96]:
model.fit(train_set, y_train, epochs=10, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x21f59f88888>

In [97]:
test_set = encode_docs(tokenizer, max_length, X_test)

In [98]:
_, acc = model.evaluate(test_set, y_test, verbose=1)



In [91]:
acc

0.8733459115028381

# MachineLearning

### MultinomialNB

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [105]:
tfidf = TfidfVectorizer()
vectorizer = tfidf.fit(X_train)
train_set = vectorizer.transform(X_train)
test_set = vectorizer.transform(X_test)

In [106]:
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB()
nb_classifier.fit(train_set, y_train)

MultinomialNB()

In [108]:
preds = nb_classifier.predict(test_set)

In [110]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

In [111]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       814
           1       0.90      0.83      0.87       773

    accuracy                           0.87      1587
   macro avg       0.88      0.87      0.87      1587
weighted avg       0.88      0.87      0.87      1587



In [112]:
print(metrics.accuracy_score(y_test, preds))

0.8746061751732829


In [113]:
from sklearn.model_selection import GridSearchCV

In [114]:
param_grid = {'alpha': [0.001, 0.01, 0.1,1]}

In [115]:
nb_gsc = GridSearchCV(nb_classifier, param_grid, verbose=1, cv=10, n_jobs=-1)

In [117]:
fit= nb_gsc.fit(train_set, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.2s finished


In [119]:
best_params = fit.best_params_
best_params

{'alpha': 1}

### LogisticRegression

In [120]:
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(random_state=0)
lr_classifier.fit(train_set, y_train)

LogisticRegression(random_state=0)

In [121]:
preds = lr_classifier.predict(test_set)

In [122]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.95      0.82      0.88       814
           1       0.83      0.95      0.89       773

    accuracy                           0.88      1587
   macro avg       0.89      0.89      0.88      1587
weighted avg       0.89      0.88      0.88      1587



In [123]:
print(metrics.accuracy_score(y_test, preds))

0.8834278512917454


In [132]:
param_grid = {'penalty':['l1','l2'], 'C': [0.001,0.01,0.3,1]}

In [133]:
lr_gsc = GridSearchCV(lr_classifier, param_grid, verbose=1, cv=10, n_jobs=-1)

In [134]:
fit= lr_gsc.fit(train_set, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    1.0s finished


In [135]:
best_params = fit.best_params_
best_params

{'C': 1, 'penalty': 'l2'}