In [2]:
import warnings
import pandas as pd
from tqdm import trange, tqdm
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
warnings.filterwarnings('ignore')
nltk.download('wordnet')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

df1 = pd.read_csv('./datasets/labeledTrainData.tsv', delimiter="\t")
df1 = df1.drop(['id'], axis=1)
df2 = pd.read_csv('./datasets/imdb_master.csv',encoding="latin-1")
df2['review'] = df2.review.apply(lambda x: clean_text(x))
df2 = df2[df2.label != 'unsup']
df2['label'].replace('neg', 0, inplace=True)
df2['label'].replace('pos', 1, inplace=True)
df2 = df2.drop(columns=[df2.keys()[0], df2.keys()[4]])
df2 = df2.rename(columns={'label':'sentiment'})
df2_train, df2_test = df2[df2['type']=='train'].drop(columns=['type']), df2[df2['type']=='test'].drop(columns=['type'])
df2_train, df2_test = df2_train.reset_index(drop=True), df2_test.reset_index(drop=True)
df2_train.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\86155\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,review,sentiment
0,story man ha unnatural feel pig start open sce...,0
1,airport 77 start brand new luxury 747 plane lo...,0
2,film lack something couldnt put finger first c...,0
3,sorry everyone know suppose art film wow hand ...,0
4,wa little parent take along theater see interi...,0


In [17]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df2_train['review'])
list_tokenized_train = tokenizer.texts_to_sequences(df2_train['review'])

maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = df2_train['sentiment']

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 100
epochs = 8
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.callbacks.History at 0x247029c2fd0>

In [4]:
from sklearn.metrics import f1_score, confusion_matrix
df1_test=pd.read_csv("./datasets/testData.tsv",header=0, delimiter="\t", quoting=3)
df1_test["sentiment"] = df1_test["id"].map(lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)

def model_performance_test(tested_model, df):
    df['review'] = df.review.apply(lambda x: clean_text(x))
    y_test = df["sentiment"]
    list_sentences_test = df["review"]
    list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
    X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)
    prediction = tested_model.predict(X_te)
    y_pred = (prediction > 0.5)
    print('F1-score: {0}'.format(f1_score(y_pred, y_test)))
    print('Confusion matrix:')
    print(confusion_matrix(y_pred, y_test))
    return f1_score(y_pred, y_test), confusion_matrix(y_pred, y_test)

model_performance_test(model, df2_test)

F1-score: 0.8327383543752722
Confusion matrix:
[[11594  2936]
 [  906  9564]]


(0.8327383543752722,
 array([[11594,  2936],
        [  906,  9564]], dtype=int64))

In [5]:
import gensim.downloader as api
similar_word_generator = api.load('glove-twitter-25')

In [6]:
import numpy as np
def word_existence_oracle(sentence, word):
    if word in sentence:
        return True
    else:
        return False

def check_existence_in_train(word, df):
    result = 0
    for sentence in df['review']:
        result = word_existence_oracle(sentence, word)
    return result

def pick_similar_word(word, df):
    for similar in similar_word_generator.most_similar(word, topn=10):
        if not check_existence_in_train(similar[0], df):
            return similar[0]
    return None

def compute_effect_of_word(word, df, df_train, tested_model):
    if pick_similar_word(word, df_train) is None:
        return None
    else:
        similar = pick_similar_word(word, df_train)
    df_chosen = np.array(df['review'][[word in i for i in df['review']]])
    prediction1 = model.predict(pad_sequences(tokenizer.texts_to_sequences(df_chosen), maxlen=maxlen))
    for i, sentence in enumerate(df_chosen):
        df_chosen[i] = sentence.replace(word, similar)
    prediction2 = model.predict(pad_sequences(tokenizer.texts_to_sequences(df_chosen), maxlen=maxlen))
    return prediction1 - prediction2

movie_difference = compute_effect_of_word('movie', df2_test, df2_train, model)
shot_difference = compute_effect_of_word('shot', df2_test, df2_train, model)
contain_difference = compute_effect_of_word('contain', df2_test, df2_train, model)
us_difference = compute_effect_of_word('us', df2_test, df2_train, model)
it_difference = compute_effect_of_word('it', df2_test, df2_train, model)
there_difference = compute_effect_of_word('there', df2_test, df2_train, model)
be_difference = compute_effect_of_word('be', df2_test, df2_train, model)
with_difference = compute_effect_of_word('with', df2_test, df2_train, model)
house_difference = compute_effect_of_word('house', df2_test, df2_train, model)

print('mean, var of \'contain\': ', np.mean(contain_difference), np.var(contain_difference))
print('mean, var of \'shot\': ', np.mean(shot_difference), np.var(shot_difference))
print('mean, var of \'movie\': ', np.mean(movie_difference), np.var(movie_difference))
print('mean, var of \'us\': ', np.mean(us_difference), np.var(us_difference))
print('mean, var of \'it\': ', np.mean(it_difference), np.var(it_difference))
print('mean, var of \'there\': ', np.mean(there_difference), np.var(there_difference))
print('mean, var of \'be\': ', np.mean(be_difference), np.var(be_difference))
print('mean, var of \'with\': ', np.mean(with_difference), np.var(with_difference))
print('mean, var of \'house\': ', np.mean(house_difference), np.var(house_difference))

mean, var of 'contain':  -0.00038689174 0.00010324158
mean, var of 'shot':  0.0011186124 0.00010881303
mean, var of 'movie':  -0.04385187 0.008850057
mean, var of 'us':  0.00069197395 0.0035085266
mean, var of 'it':  0.0041775773 0.0048676785
mean, var of 'there':  -0.008323542 0.001021028
mean, var of 'be':  0.010613689 0.0045949183
mean, var of 'with':  0.0063804802 0.00035058393
mean, var of 'house':  -0.00023695549 0.00013440657


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

words_with_no_meaning = ['it', 'this', 'are', 'is', 'was', 'will', 'that', 'my', 'there', 'be', 'with', 'in',
                         'out', 'on', 'under', 'how', 'what', 'why', 'may', 'have', 'where', 'he', 'she', 'do',
                         'when', 'were', 'these', 'those', 'can', 'could', 'has', 'had', 'them', 'would', 'which']

nouns_can_be_removed = ['movie', 'house', 'film', 'car', 'tree']

biased_words = words_with_no_meaning

def word_existence_list(word, df):
    tmp_list = np.zeros(len(df['review']))
    for count, sentence in enumerate(df['review']):
        tmp_list[count] = float(word in sentence)
    return tmp_list

biased_features_with_no_meaning = pd.DataFrame()
biased_features_with_no_meaning_test = pd.DataFrame()
for words in words_with_no_meaning:
    biased_features_with_no_meaning[words] = word_existence_list(words, df2_train)
    biased_features_with_no_meaning_test[words] = word_existence_list(words, df2_test)
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(biased_features_with_no_meaning, df2_train['sentiment'])
rf_prediction_without_probability = clf.predict(biased_features_with_no_meaning)
rf_prediction_without_probability_test = clf.predict(biased_features_with_no_meaning_test)
print('Train accuracy is ',
      np.sum([y==df2_train['sentiment'][count]
              for count, y in enumerate(rf_prediction_without_probability)])/len(rf_prediction_without_probability))
print('Test accuracy is ',
      np.sum([y==df2_test['sentiment'][count]
              for count, y in enumerate(rf_prediction_without_probability_test)])/len(rf_prediction_without_probability_test))
rf_prediction_with_probability = cross_val_predict(clf, biased_features_with_no_meaning, df2_train['sentiment'],
                                                       method='predict_proba', verbose=3, n_jobs=1)

propensity = np.array([rf_prediction_with_probability[i, y[i]] for i in range(len(rf_prediction_with_probability))])
print(np.mean(np.log(propensity)))
np.save('propensity.npy', propensity)

# propensity = np.load("propensity.npy")
prob_1_l = np.array([(propensity[i] if y[i] == 1 else (1-propensity[i]))
          for i in range(len(y))])
prob_0_l = 1 - prob_1_l


def calculate_weight_fraction(prob_1):
    prob_0 = 1 - prob_1
    w1 = 1 / (prob_0 * prob_1_l / (prob_0 * prob_1_l + prob_1 * prob_0_l))
    w0 = 1 / (prob_1 * prob_0_l / (prob_0 * prob_1_l + prob_1 * prob_0_l))
    return sum(w1[i] for i in range(len(y)) if y[i] == 1) / sum(w0[i] for i in range(len(y)) if y[i] == 0)


prior_fraction = np.sum(y) / (len(y) - np.sum(y))
l, r = 0, 1
thr = 0.00000000001
step = 100
# while l + thr < r:
for _ in range(step):
    m1 = l + (r- l) / 2
    if calculate_weight_fraction(m1) < prior_fraction:
        l = m1
    else:
        r = m1

m0 = 1 - m1
w1 = 1 / (m0 * prob_1_l / (m0 * prob_1_l + m1 * prob_0_l))
w0 = 1 / (m1 * prob_0_l / (m0 * prob_1_l + m1 * prob_0_l))
weight_for_training_set = np.array([(w1[i] if y[i] == 1 else w0[i]) for i in range(len(y))])
weight_for_training_set = weight_for_training_set / np.mean(weight_for_training_set)

Train accuracy is  0.6118
Test accuracy is  0.6058
-0.6676435663774646


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.6s finished


In [23]:
practical_model = Sequential()
practical_model.add(Embedding(max_features, embed_size))
practical_model.add(Bidirectional(LSTM(32, return_sequences = True)))
practical_model.add(GlobalMaxPool1D())
practical_model.add(Dense(20, activation="relu"))
practical_model.add(Dropout(0.05))
practical_model.add(Dense(1, activation="sigmoid"))
practical_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

practical_model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2, sample_weight=weight_for_training_set)

Train on 20000 samples, validate on 5000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.callbacks.History at 0x24749af2748>

In [27]:

model_performance_test(model, df2_train), model_performance_test(practical_model, df2_train)
model_performance_test(model, df2_test), model_performance_test(practical_model, df2_test)

F1-score: 0.9380404065966509
Confusion matrix:
[[12411  1380]
 [   89 11120]]
F1-score: 0.9339925625422583
Confusion matrix:
[[12387  1449]
 [  113 11051]]
F1-score: 0.8203021354902212
Confusion matrix:
[[11217  2916]
 [ 1283  9584]]
F1-score: 0.8208572161861579
Confusion matrix:
[[11217  2905]
 [ 1283  9595]]


((0.8203021354902212,
  array([[11217,  2916],
         [ 1283,  9584]], dtype=int64)),
 (0.8208572161861579,
  array([[11217,  2905],
         [ 1283,  9595]], dtype=int64)))