In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import string
from sklearn.preprocessing import OneHotEncoder
import textblob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.models import Model
from keras.callbacks import EarlyStopping
import os
# Show what is mounted under the input directory so the dataset folders used below are visible.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
# Competition data: tab-separated phrase tables.
train_df = pd.read_csv("../input/movie-review-sentiment-analysis-kernels-only/train.tsv", sep="\t")
test_df = pd.read_csv("../input/movie-review-sentiment-analysis-kernels-only/test.tsv", sep="\t")


def _read_word_list(path, encoding="utf-8-sig"):
    """Read a one-entry-per-line lexicon file into a single-column DataFrame.

    The file is read line by line instead of via ``pd.read_csv(sep="\\n")``
    because newer pandas releases refuse the line terminator as a separator.

    Parameters
    ----------
    path : str
        Location of the text file.
    encoding : str
        Text encoding of the file; the default also tolerates a UTF-8 BOM.

    Returns
    -------
    pandas.DataFrame
        One column named ``words``; surrounding whitespace is stripped and
        blank lines are skipped.
    """
    with open(path, encoding=encoding) as handle:
        stripped = [line.strip() for line in handle]
    return pd.DataFrame({"words": [entry for entry in stripped if entry]})


# Sentiment lexicons used for the pos/neg/neutral word-count features.
pos_df = _read_word_list("../input/pos-neg-files/positive words.txt")
neg_df = _read_word_list("../input/pos-neg-files/Negative words.txt", encoding="ISO-8859-1")

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

In [None]:
# Class balance of the target: raw counts first, then the same information as a bar chart.
print(train_df['Sentiment'].value_counts())
# Pass the column by keyword: newer seaborn releases may interpret the first
# positional argument as `data` rather than as the variable to count.
sns.countplot(x=train_df['Sentiment'])

In [None]:
# Look at the phrase stored under index label 0.
train_df.loc[0, 'Phrase']

In [None]:
stop_words = set(stopwords.words('english'))

# Set copies of the lexicons: membership tests on the raw NumPy arrays scan the
# whole array for every token, which dominates the runtime of this cell.
_positive_lexicon = set(pos_df.words.values)
_negative_lexicon = set(neg_df.words.values)

# Column order matches the order in which the features are added to the frames.
_PHRASE_FEATURE_COLUMNS = ['words_length', 'sent_length', 'no_stops', 'no_non_stops',
                           'no_punctuations', 'pos_words', 'neg_words', 'neutral_words']


def _phrase_features(phrase):
    """Compute the hand-crafted count features for one phrase.

    Parameters
    ----------
    phrase : str
        Phrase text in its original casing.

    Returns
    -------
    tuple of int
        Values in the order of ``_PHRASE_FEATURE_COLUMNS``: character length,
        token count of the original-case phrase, and - over the tokens of the
        lower-cased phrase - stop words, non-stop words, punctuation tokens
        other than "." and ",", positive-lexicon tokens, negative-lexicon
        tokens, and tokens found in neither lexicon.
    """
    # The token count uses the original casing; every other feature uses the
    # lower-cased tokens, so exactly two tokenizations are needed per phrase.
    cased_tokens = word_tokenize(phrase)
    tokens = word_tokenize(phrase.lower())

    n_stops = sum(1 for w in tokens if w in stop_words)
    # `in` on strings is a substring test; kept as-is so the counts are unchanged.
    n_punct = sum(1 for w in tokens
                  if w in string.punctuation and w not in "." and w not in ",")
    n_pos = sum(1 for w in tokens if w in _positive_lexicon)
    n_neg = sum(1 for w in tokens if w in _negative_lexicon)
    n_neutral = sum(1 for w in tokens
                    if w not in _negative_lexicon and w not in _positive_lexicon)

    return (len(phrase), len(cased_tokens), n_stops, len(tokens) - n_stops,
            n_punct, n_pos, n_neg, n_neutral)


for df in [train_df, test_df]:
    feature_rows = df['Phrase'].apply(_phrase_features).tolist()
    feature_frame = pd.DataFrame(feature_rows, columns=_PHRASE_FEATURE_COLUMNS, index=df.index)
    for column in _PHRASE_FEATURE_COLUMNS:
        df[column] = feature_frame[column]
    # Lower-case only after the features are computed: they need the original text.
    df["Phrase"] = df["Phrase"].str.lower()

In [None]:
# Two rows are enough to check that the new feature columns were added.
train_df.head(n=2)

In [None]:
# Names of the hand-crafted numeric columns; reused later as a model input.
dense_features = [
    "words_length",
    "sent_length",
    "no_stops",
    "no_non_stops",
    "no_punctuations",
    "pos_words",
    "neg_words",
    "neutral_words",
]
# Per-class average of every dense feature.
sentiment_groups = train_df.groupby("Sentiment")
sentiment_groups[dense_features].mean()

In [None]:
# TF-IDF over uni-, bi- and tri-grams; the vocabulary is fitted on the train
# and test phrases together, then each split is transformed separately.
train_phrases = train_df["Phrase"].values
test_phrases = test_df["Phrase"].values

tvec = TfidfVectorizer(max_features=100000, ngram_range=(1, 3))
tvec.fit(np.append(train_phrases, test_phrases))

x_train_tfidf, x_test_tfidf = (tvec.transform(phrases) for phrases in (train_phrases, test_phrases))

In [None]:
# Sanity check: both matrices should have the same number of columns.
for tfidf_matrix in (x_train_tfidf, x_test_tfidf):
    print(tfidf_matrix.shape)

In [None]:
# Compress the sparse TF-IDF matrix to 20 dense components.
svd = TruncatedSVD(n_components = 20)
x_train_svd = svd.fit_transform(x_train_tfidf)
# Project the test set with the decomposition fitted on train. Refitting on the
# test matrix would produce components in a different basis, so the model would
# see test features that do not correspond to the ones it was trained on.
x_test_svd = svd.transform(x_test_tfidf)

In [None]:
# Sanity check on the reduced matrices.
for svd_matrix in (x_train_svd, x_test_svd):
    print(svd_matrix.shape)

In [None]:
# One-hot encoder for the sentiment labels, fitted on the training targets
# reshaped to a single column.
sentiment_column = train_df["Sentiment"].values.reshape(-1, 1)
enc = OneHotEncoder(sparse=False)
enc.fit(sentiment_column)

In [None]:
# Share of distinct phrases common to both splits, relative to the test set size.
shared_phrases = set(train_df["Phrase"]).intersection(set(test_df["Phrase"]))
print("train set: {0:.2f}".format(len(shared_phrases)/test_df.shape[0]))

# Attach the known training label to every test phrase that also occurs in train.
# Train phrases are de-duplicated first (keeping the first label): a phrase that
# occurs more than once in train would otherwise duplicate the matching test row,
# and test_new would no longer line up row-for-row with test_df and the feature
# matrices derived from it.
train_labels = train_df[["Phrase", "Sentiment"]].drop_duplicates(subset="Phrase")
test_new = pd.merge(test_df, train_labels, on="Phrase", how="left")
assert len(test_new) == len(test_df), "merge must keep exactly one row per test phrase"

# Word-level vocabularies of each split.
cv1 = CountVectorizer()
cv1.fit(train_df["Phrase"])
cv2 = CountVectorizer()
cv2.fit(test_df["Phrase"])
print("Train Set Vocabulary Size:", len(cv1.vocabulary_))
print("Test Set Vocabulary Size:", len(cv2.vocabulary_))
print("Number of Words that occur in both:", 
      len(set(cv1.vocabulary_.keys()).intersection(set(cv2.vocabulary_.keys()))))

In [None]:
# Test rows without a label after the merge are the phrases absent from train.
n_unlabelled = test_new['Sentiment'].isna().sum()
n_labelled = len(test_new) - n_unlabelled

print("total length of test data: %d"%(len(test_df)))
print("No of Phrases in test which are in train: %d"%(n_labelled))
print("No of Phrases in test which are not in train: %d"%(n_unlabelled))

In [None]:
# Cross-validation setup: the fold is derived from SentenceId, so all phrases
# of one sentence land in the same fold.
NUM_FOLDS = 5
train_df["fold_id"] = train_df["SentenceId"] % NUM_FOLDS

# Pre-trained GloVe vectors and their dimensionality.
EMBEDDING_FILE = "../input/glove6b100dtxt//glove.6B.100d.txt"
EMBEDDING_DIM = 100

# Every word seen by either CountVectorizer (train or test).
all_words = set(cv1.vocabulary_) | set(cv2.vocabulary_)

In [None]:
def get_embedding():
    """Load the pre-trained vectors for the words that occur in the data.

    Reads ``EMBEDDING_FILE`` line by line; each line is split on whitespace
    into a word followed by its coefficients. A line is kept only when it has
    exactly ``EMBEDDING_DIM`` coefficients and its word is in ``all_words``.

    Returns
    -------
    dict
        Maps each retained word to a float32 NumPy array of its coefficients.
    """
    embeddings_index = {}
    # Context manager closes the file even if parsing fails; the explicit
    # encoding avoids depending on the platform's default locale.
    with open(EMBEDDING_FILE, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if len(values) != EMBEDDING_DIM + 1:
                # Blank or malformed line.
                continue
            word = values[0]
            if word in all_words:
                embeddings_index[word] = np.asarray(values[1:], dtype="float32")
    return embeddings_index

# Load the vectors and report how much of the vocabulary has no pre-trained entry.
embeddings_index = get_embedding()
words_without_vector = all_words.difference(embeddings_index)
print("Number of words that don't exist in GLOVE:", len(words_without_vector))

In [None]:
MAX_SEQUENCE_LENGTH = 60

# Word -> integer index mapping fitted on the train and test phrases together.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate([train_df["Phrase"].values, test_new["Phrase"].values]))
word_index = tokenizer.word_index

# One row per index plus one extra row; each row holds the pre-trained vector
# followed by two TextBlob columns (polarity, subjectivity). Rows start out
# uniformly random, so words without a pre-trained vector keep random values
# in the first EMBEDDING_DIM columns.
# NOTE(review): any row that no word index points at (presumably row 0, the
# padding index) also stays random, which may prevent the Masking layer from
# recognising padded steps - confirm.
nb_words = len(word_index) + 1
embedding_matrix = np.random.rand(nb_words, EMBEDDING_DIM + 2)

for word, row in word_index.items():
    word_sentiment = textblob.TextBlob(word).sentiment
    sentiment_pair = [word_sentiment.polarity, word_sentiment.subjectivity]
    pretrained = embeddings_index.get(word)
    if pretrained is None:
        # No pre-trained vector: only the two sentiment columns are overwritten.
        embedding_matrix[row, -2:] = sentiment_pair
    else:
        embedding_matrix[row] = np.append(pretrained, sentiment_pair)

# Integer sequences padded/truncated to MAX_SEQUENCE_LENGTH.
train_sequences = tokenizer.texts_to_sequences(train_df["Phrase"])
old_seq = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_sequences = tokenizer.texts_to_sequences(test_new["Phrase"])
test_seq = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
def build_model():
    """Assemble the three-input sentiment classifier (not compiled).

    Inputs, in order:
      1. integer token sequence of length ``MAX_SEQUENCE_LENGTH``,
      2. the ``dense_features`` columns,
      3. a 20-dimensional SVD vector.

    The sequence goes through a trainable embedding initialised from
    ``embedding_matrix``, spatial dropout, masking and an LSTM. The other two
    inputs are batch-normalised. The three vectors are concatenated and passed
    through three ReLU dense layers into a 5-way softmax.

    Returns
    -------
    keras.models.Model
    """
    # Sequence-branch layers.
    token_embedding = Embedding(
        nb_words,
        EMBEDDING_DIM + 2,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=True,
    )
    embedding_dropout = SpatialDropout1D(0.2)
    padding_mask = Masking()
    sequence_encoder = LSTM(100)

    # Model inputs.
    seq_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    dense_input = Input(shape=(len(dense_features),))
    svd_input = Input(shape=(20,))

    # Auxiliary branches: normalise the numeric features.
    dense_vector = BatchNormalization()(dense_input)
    svd_vector = BatchNormalization()(svd_input)

    # Sequence branch, applied step by step.
    embedded = token_embedding(seq_input)
    embedded = embedding_dropout(embedded)
    embedded = padding_mask(embedded)
    phrase_vector = sequence_encoder(embedded)

    # Classifier head on the concatenated representation.
    hidden = concatenate([phrase_vector, dense_vector, svd_vector ])
    for units in (128, 128, 64):
        hidden = Dense(units, activation="relu")(hidden)
    output = Dense(5, activation="softmax")(hidden)

    return Model(inputs=[seq_input, dense_input, svd_input], outputs=output)

In [None]:
# Accumulator for the class probabilities predicted on the test set by each fold's model.
test_preds = np.zeros((test_new.shape[0], 5))

for fold in range(NUM_FOLDS):
    print("FOLD", fold+1)

    print("Splitting the data into train and validation...")
    # Rows of the current fold are held out for validation; the rest are used for training.
    in_train = train_df["fold_id"] != fold
    in_val = train_df["fold_id"] == fold

    train_seq, val_seq = old_seq[in_train], old_seq[in_val]
    train_dense = train_df.loc[in_train, dense_features]
    val_dense = train_df.loc[in_val, dense_features]
    train_svd, val_svd = x_train_svd[in_train], x_train_svd[in_val]

    y_train = enc.transform(train_df.loc[in_train, "Sentiment"].values.reshape(-1, 1))
    y_val = enc.transform(train_df.loc[in_val, "Sentiment"].values.reshape(-1, 1))

    print("Building the model")
    # A fresh model per fold so that no weights carry over between folds.
    model = build_model()
    model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["acc"])
    early_stopping = EarlyStopping(monitor="val_acc", patience=2, verbose=1)

    print("Training the model")
    model.fit(
        [train_seq, train_dense, train_svd],
        y_train,
        validation_data=([val_seq, val_dense, val_svd], y_val),
        epochs=15,
        batch_size=1024,
        shuffle=True,
        callbacks=[early_stopping],
        verbose=1,
    )

    print("Predicting...")
    fold_preds = model.predict([test_seq, test_new[dense_features], x_test_svd], batch_size=1024, verbose=5)
    test_preds += fold_preds
    print()

# Average the accumulated probabilities over the folds.
test_preds /= NUM_FOLDS

In [None]:
# Most probable class per test row.
test_new["pred"] = test_preds.argmax(axis=1)

# Keep the label copied from train where one exists; use the model's
# prediction for the remaining rows.
needs_prediction = test_new["Sentiment"].isnull()
test_new.loc[needs_prediction, "Sentiment"] = test_new.loc[needs_prediction, "pred"]
test_new["Sentiment"] = test_new["Sentiment"].astype(int)

submission_columns = ["PhraseId", "Sentiment"]
test_new[submission_columns].to_csv("submission.csv", index=False)

reference: https://www.kaggle.com/shubhammank/movie-sentiment