In [144]:
import tensorflow as tf
import pandas as pd

# Tried using Textblob for semantic analysis : Too Intensive
from textblob import TextBlob

# For encoding and vectorization we use scikit
from sklearn import preprocessing, model_selection , metrics
from sklearn.feature_extraction.text import TfidfVectorizer


#For shallow neural network
from keras import layers,models,optimizers


In [216]:
train_data = pd.read_csv('/Users/ayush/train.csv')

# Semanitc Analysis : Takes too long
# 
# copy_article = train_data["text"]
# 
# for i in range(len(copy_article)):
#     semantic = TextBlob(copy_article[i])
#     semantic = semantic.sentiment
#     train_data["text"][i] = semantic

train_data.fillna(' ')

train_DF = pd.DataFrame()

train_DF["text"] = train_data["text"]
train_DF["label"] = train_data["label"]


# Splitting into training and testing
train_x,test_x,train_y,test_y = model_selection.train_test_split(train_DF['text'],train_DF['label'],random_state=0)

In [221]:
# word level tf-idf ( For word to vector)
# code snippet used
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3),max_features=5000)
tfidf_vect.fit(train_DF["text"].values.astype('str'))
xtrain_tfidf =  tfidf_vect.transform(train_x.astype('str'))
xtest_tfidf =  tfidf_vect.transform(test_x.astype('str'))

In [222]:
# Basic model for testing if word to vector works
def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)

    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label,epochs = 10, batch_size = 1024)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    
    # 
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    # 
    clarity = list(test_y)
    sum = 0
    for i in range(len(test_y)):
        if(predictions[i] == clarity[i]):
            sum += 1
    print(sum)
    return float(sum/len(test_y))

In [223]:
classifier = create_model_architecture(xtrain_tfidf.shape[1])
accuracy = train_model(classifier, xtrain_tfidf, train_y, xtest_tfidf, is_neural_net=True)

Epoch 1/10


 1024/15600 [>.............................] - ETA: 36s - loss: 0.6933

 2048/15600 [==>...........................] - ETA: 17s - loss: 0.6905

















Epoch 2/10


 1024/15600 [>.............................] - ETA: 1s - loss: 0.5736

 2048/15600 [==>...........................] - ETA: 0s - loss: 0.5658















Epoch 3/10
 1024/15600 [>.............................] - ETA: 0s - loss: 0.4466

 2048/15600 [==>...........................] - ETA: 0s - loss: 0.4386















Epoch 4/10
 1024/15600 [>.............................] - ETA: 0s - loss: 0.3427

 2048/15600 [==>...........................] - ETA: 0s - loss: 0.3457















Epoch 5/10
 1024/15600 [>.............................] - ETA: 0s - loss: 0.3040

 2048/15600 [==>...........................] - ETA: 0s - loss: 0.2945















Epoch 6/10


 1024/15600 [>.............................] - ETA: 0s - loss: 0.2420

 2048/15600 [==>...........................] - ETA: 0s - loss: 0.2367















Epoch 7/10
 1024/15600 [>.............................] - ETA: 0s - loss: 0.2025

 2048/15600 [==>...........................] - ETA: 0s - loss: 0.2115















Epoch 8/10
 1024/15600 [>.............................] - ETA: 0s - loss: 0.2014

 2048/15600 [==>...........................] - ETA: 0s - loss: 0.1902

















Epoch 9/10


 1024/15600 [>.............................] - ETA: 0s - loss: 0.1752

 3072/15600 [====>.........................] - ETA: 0s - loss: 0.1681















Epoch 10/10


 1024/15600 [>.............................] - ETA: 0s - loss: 0.1606

 3072/15600 [====>.........................] - ETA: 0s - loss: 0.1496















2564


In [226]:
# Some Testing
from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()
NB.fit(xtrain_tfidf, train_y)
print('Accuracy of NB  classifier on training set: {:.2f}'
     .format(NB.score(xtrain_tfidf, train_y)))
print('Accuracy of NB classifier on test set: {:.2f}'
     .format(NB.score(xtest_tfidf, test_y)))

Accuracy of NB  classifier on training set: 0.88
Accuracy of NB classifier on test set: 0.86


In [None]:
clarity = list(test_y)
print(accuracy)