In [None]:
import warnings
import pandas as pd
from tqdm import trange, tqdm
# Modules for data manipulation
import numpy as np
import pandas as pd
import re

# Modules for visualization
import matplotlib.pyplot as plt
import seaborn as sb

# Tools for preprocessing input data
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Tools for creating ngrams and vectorizing input data
from gensim.models import Word2Vec, Phrases

# Tools for building a model
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences

# Tools for assessing the quality of model prediction
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Silence every warning for the rest of the session. This keeps cell output
# short but also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')
# Fetch the WordNet corpus that WordNetLemmatizer needs.
nltk.download('wordnet')

# Single shared lemmatizer instance; clean_text() below reads this global.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise a raw review into a list of lower-case, lemmatized tokens.

    Every character that is neither a word character nor whitespace is
    removed, the text is lower-cased and split on single spaces, and each
    token is lemmatized twice: first with the lemmatizer's default part of
    speech, then as a verb ("v").

    Args:
        text: Raw review string.

    Returns:
        list[str]: One lemma per space-separated chunk of the cleaned text.
    """
    # ``flags`` must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``. A positional re.UNICODE (integer value 32) would
    # cap the number of removed characters at 32 instead of setting a flag.
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    text = text.lower()
    # Split on a literal space (not on arbitrary whitespace), as before.
    tokens = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    return [lemmatizer.lemmatize(token, "v") for token in tokens]

# --- Labeled training split read from labeledTrainData.tsv -----------------
df1 = pd.read_csv('./datasets/labeledTrainData.tsv', delimiter="\t")
df1 = df1.drop(['id'], axis=1)
df1['review'] = df1.review.apply(clean_text)

# --- IMDB master file --------------------------------------------------------
df2 = pd.read_csv('./datasets/imdb_master.csv', encoding="latin-1")
# Drop the unlabeled rows *before* tokenizing so no work is wasted on reviews
# that are thrown away. .copy() makes the filtered frame independent, so the
# column assignments below are not chained assignments on a slice.
df2 = df2[df2.label != 'unsup'].copy()
df2['review'] = df2.review.apply(clean_text)
# Assign the encoded labels back explicitly. `df2['label'].replace(...,
# inplace=True)` mutates a temporary Series and is not guaranteed to reach
# df2. astype(int) fails fast if a label other than 'neg'/'pos' slips through.
df2['label'] = df2['label'].map({'neg': 0, 'pos': 1}).astype(int)
# Remove the first and fifth columns of the CSV (selected by position).
df2 = df2.drop(columns=[df2.keys()[0], df2.keys()[4]])
df2 = df2.rename(columns={'label': 'sentiment'})
# Split on the file's own 'type' column, then restart both indexes at 0 so
# positional and label-based indexing agree in later cells.
df2_train = df2[df2['type'] == 'train'].drop(columns=['type']).reset_index(drop=True)
df2_test = df2[df2['type'] == 'test'].drop(columns=['type']).reset_index(drop=True)

# --- Test split read from testData.tsv ---------------------------------------
# quoting=3 leaves the double quotes in place, hence the strip('"') below.
df1_test = pd.read_csv("./datasets/testData.tsv", header=0, delimiter="\t", quoting=3)
# The label is derived from the number after the underscore in the id
# (>= 5 -> 1). Presumably that number is the review's rating - TODO confirm.
df1_test["sentiment"] = df1_test["id"].map(lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)
df1_test = df1_test.drop(['id'], axis=1)
df1_test['review'] = df1_test.review.apply(clean_text)
# Font sizes (in points) shared by the figures in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIG_SIZE, LARGE_SIZE = 12, 14, 16, 20

# Matplotlib defaults, grouped by the part of the figure they affect.
params = {
    # whole figure
    'figure.figsize': (16, 8),
    'figure.titlesize': LARGE_SIZE,
    'font.size': SMALL_SIZE,
    # axes
    'axes.titlesize': MEDIUM_SIZE,
    'axes.labelsize': BIG_SIZE,
    # ticks and legend
    'xtick.labelsize': MEDIUM_SIZE,
    'ytick.labelsize': MEDIUM_SIZE,
    'legend.fontsize': BIG_SIZE,
}
plt.rcParams.update(params)

In [None]:
# Length of every cleaned training review (each review is a list of tokens).
df2_train['review_lenght'] = np.array([len(tokens) for tokens in df2_train['review']])
review_lengths = df2_train['review_lenght']

# Summary statistics that are drawn as vertical guide lines below.
median = review_lengths.median()
mean = review_lengths.mean()
mode = review_lengths.mode()[0]

fig, ax = plt.subplots()
sb.distplot(review_lengths, bins=review_lengths.max(), ax=ax,
            hist_kws={"alpha": 0.9, "color": "blue"},
            kde_kws={"color": "black", 'linewidth': 3})
# Cut the x-axis at the 95th percentile so the long tail does not squash the plot.
ax.set_xlim(left=0, right=np.percentile(review_lengths, 95))
ax.set_xlabel('Words in review')
ymax = 0.014
plt.ylim(0, ymax)
# One dashed full-height line per statistic, drawn in the order mode, mean, median.
for stat_name, stat_value in (('mode', mode), ('mean', mean), ('median', median)):
    ax.plot([stat_value, stat_value], [0, ymax], '--',
            label=f'{stat_name} = {stat_value:.2f}', linewidth=4)
ax.set_title('Words per review distribution', fontsize=20)
plt.legend()
plt.show()

In [None]:
import gensim
# Location of the pre-trained word2vec binary. The file name suggests
# 300-dimensional GoogleNews vectors - TODO confirm against the actual file.
model_file = './datasets/GoogleNews-vectors-negative300.bin'
print("Loading word2vec model......")
# binary=True: the file is parsed as the binary word2vec format, not as text.
wv_model = gensim.models.KeyedVectors.load_word2vec_format(model_file,binary=True)

In [None]:
from tqdm.contrib.concurrent import thread_map
def vectorize_data(data, vocab: dict) -> list:
    """Encode tokenized reviews as lists of vocabulary positions.

    Args:
        data: Iterable of reviews, each an iterable of word tokens.
        vocab: Mapping keyed by word. A word's code is its position in the
            mapping's iteration order. Tokens that are missing from the
            mapping, or whose mapped value is None, are dropped.

    Returns:
        list: One list of integer codes per review, in input order.
    """
    print('Vectorize sentences...', end='\r')
    # Build the word -> position table once. Looking every token up with
    # list.index() rescans the whole vocabulary per token, which is
    # prohibitively slow for a multi-million-word embedding vocabulary.
    position_of = {word: position for position, word in enumerate(vocab.keys())}

    def encode(review):
        # Same filter as before: keep only tokens with a non-None vocab entry.
        return [position_of[word] for word in review
                if vocab.get(word, None) is not None]

    vectorized = list(thread_map(encode, data, max_workers=2))
    print('Vectorize sentences... (done)')
    return vectorized

In [None]:
maxlen = 200


def _encode_and_pad(reviews):
    """Vectorize tokenized reviews and post-pad them to ``maxlen`` codes."""
    return pad_sequences(sequences=vectorize_data(reviews, vocab=wv_model.wv.vocab),
                         maxlen=maxlen, padding='post')


# Same processing order as before: popcorn test/train, then IMDB train/test.
pad_popcorn_test = _encode_and_pad(df1_test['review'])
pad_popcorn_train = _encode_and_pad(df1['review'])
pad_train = _encode_and_pad(df2_train['review'])
pad_test = _encode_and_pad(df2_test['review'])

# Cache the encoded arrays on disk so a later cell can reload them.
np.save("./datasets/pad_train.npy", pad_train)
np.save("./datasets/pad_test.npy", pad_test)
np.save("./datasets/pad_popcorn_train.npy", pad_popcorn_train)
np.save("./datasets/pad_popcorn_test.npy", pad_popcorn_test)

In [None]:
# Reload the four cached, padded encodings from disk.
pad_train, pad_test, pad_popcorn_train, pad_popcorn_test = (
    np.load(array_path)
    for array_path in (
        "./datasets/pad_train.npy",
        "./datasets/pad_test.npy",
        "./datasets/pad_popcorn_train.npy",
        "./datasets/pad_popcorn_test.npy",
    )
)
# Rebuild the two IMDB arrays row by row into fresh ndarrays.
pad_train = np.array(list(pad_train))
pad_test = np.array(list(pad_test))

In [None]:
maxlen = 200


def build_model(embedding_matrix: np.ndarray, input_length: int):
    """Assemble (without compiling) the bidirectional-LSTM sentiment classifier.

    Args:
        embedding_matrix: 2-D array of pre-trained word vectors; row count is
            used as the vocabulary size and column count as the vector size.
        input_length: Number of word codes in every input sequence.

    Returns:
        The uncompiled Sequential model. Its summary is printed as a side effect.
    """
    vocabulary_size = embedding_matrix.shape[0]
    vector_size = embedding_matrix.shape[1]
    # The pre-trained vectors are loaded as weights and kept frozen.
    frozen_embedding = Embedding(
        input_dim=vocabulary_size,
        output_dim=vector_size,
        input_length=input_length,
        weights=[embedding_matrix],
        trainable=False)
    model = Sequential([
        frozen_embedding,
        Bidirectional(LSTM(128, recurrent_dropout=0.1)),
        Dropout(0.25),
        Dense(64),  # no activation argument is given for this layer
        Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ])
    model.summary()
    return model

# Original (unweighted) model built on the loaded word2vec vectors.
model = build_model(wv_model.wv.vectors, maxlen)
model.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])

# Fit on the IMDB train split; the IMDB test split is passed as validation data.
history = model.fit(
    pad_train,
    df2_train['sentiment'],
    batch_size=100,
    epochs=6,
    validation_data=(pad_test, df2_test['sentiment']))

In [None]:
import pickle
# Persist the trained original model; a later cell reloads it from this path.
# NOTE(review): this relies on the installed Keras version supporting pickling
# of model objects - confirm, or consider the model's own save/load API.
with open('./models/org_model.pickle', 'wb') as file:
    pickle.dump(model, file)
print('Done')

In [None]:
import pickle
# Reload the original model from the pickle file, replacing the in-memory `model`.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load pickles that were produced locally / come from a trusted source.
with open('./models/org_model.pickle', 'rb') as file:
   model=pickle.load(file)

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

def model_performance_test(tested_model, pad, y):
    """Print and return F1, confusion matrix and accuracy of a binary classifier.

    Args:
        tested_model: Object whose ``predict(pad)`` returns one score per row;
            scores above 0.5 are counted as class 1.
        pad: Model input passed straight to ``predict``.
        y: True 0/1 labels, one per row of ``pad``.

    Returns:
        tuple: ``(f1, confusion, accuracy)``. The confusion matrix follows
        scikit-learn's convention: rows are true labels, columns are
        predicted labels.
    """
    prediction = tested_model.predict(pad)
    # Threshold at 0.5 and flatten (n, 1) model output to the 1-D integer
    # label vector the metric functions expect.
    y_pred = np.ravel(prediction > 0.5).astype(int)
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round transposes the confusion matrix, so false positives and false
    # negatives appear swapped. Each metric is now computed only once.
    f1 = f1_score(y, y_pred)
    confusion = confusion_matrix(y, y_pred)
    accuracy = accuracy_score(y, y_pred)
    print('F1-score: {0}'.format(f1))
    print('Confusion matrix:')
    print(confusion)
    print('Accuracy: {0}'.format(accuracy))
    return f1, confusion, accuracy

# model_performance_test(model, pad_test, df2_test['sentiment'])

In [None]:
def word_existence_oracle(sentence, word):
    """Return True when ``word`` is contained in ``sentence``.

    ``sentence`` may be any container supporting ``in``; for a token list this
    is an exact-token match.
    """
    return word in sentence


def check_existence_in_train(word, df):
    """Return True when at least one entry of ``df['review']`` contains ``word``.

    Args:
        word: Token to look for.
        df: Mapping / DataFrame with a ``'review'`` entry that iterates over
            tokenized reviews.

    Returns:
        bool: False for an empty review collection.
    """
    # any() stops at the first hit instead of counting matches over the whole
    # corpus just to compare the count with zero.
    return any(word_existence_oracle(sentence, word) for sentence in df['review'])

def compute_effect_of_word(word, df, pad, tested_model):
    """Measure how much masking one word shifts a model's predictions.

    Every review in ``df['review']`` that contains ``word`` is selected. The
    model predicts on the matching rows of ``pad`` as they are, and again
    with every occurrence of the word's code replaced by 0.

    Args:
        word: Token whose influence is measured.
        df: Frame with a ``'review'`` column of token lists, row-aligned
            with ``pad``.
        pad: 2-D array of encoded reviews (one row per review in ``df``).
        tested_model: Object with a ``predict`` method.

    Returns:
        ``predict(original rows) - predict(masked rows)``, or None when no
        review in ``df`` contains the word or the word has no entry in the
        embedding vocabulary.
    """
    # Look the word up in the frame that was passed in. The previous check
    # always consulted the global df2_test, whatever ``df`` was.
    index_chosen = [index for index, review in enumerate(df['review']) if word in review]
    if not index_chosen:
        return None

    vocab = wv_model.wv.vocab
    if word not in vocab:
        # No code to mask; report "no effect data" instead of raising ValueError.
        return None
    # The word's code is its position in vocabulary iteration order. Found
    # without materialising the whole key list on every call.
    embedding = next(position for position, key in enumerate(vocab) if key == word)

    original_pad = np.asarray(pad)[index_chosen]
    masked_pad = np.where(original_pad == embedding, 0, original_pad)
    prediction1 = tested_model.predict(original_pad)
    prediction2 = tested_model.predict(masked_pad)
    return prediction1 - prediction2

def word_summary(word, df, pad, tested_model):
    """Print mean and variance of the prediction shift caused by masking ``word``.

    Args:
        word: Token to evaluate.
        df, pad, tested_model: Forwarded unchanged to compute_effect_of_word.
    """
    difference = compute_effect_of_word(word, df, pad, tested_model)
    if difference is None:
        # compute_effect_of_word returns None when it has nothing to measure;
        # np.mean(None) would raise a TypeError here.
        print('mean, var of \''+word+'\': ', 'n/a (word not found)')
        return
    print('mean, var of \''+word+'\': ', np.mean(difference), np.var(difference))
    
# Report the masking effect of a handful of words on the original model,
# measured on the IMDB test split.
probe_words = ['movie', 'car', 'us', 'man', 'review', 'house']
for probe_word in probe_words:
    word_summary(probe_word, df2_test, pad_test, model)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

# Function words that are treated as carrying no sentiment of their own.
words_with_no_meaning = [
    'it', 'this', 'are', 'is', 'was', 'will', 'that', 'my', 'there', 'be', 'with', 'in',
    'out', 'on', 'under', 'how', 'what', 'why', 'may', 'have', 'where', 'he', 'she', 'do',
    'when', 'were', 'these', 'those', 'can', 'could', 'has', 'had', 'them', 'would', 'which',
]

nouns_can_be_removed = ['movie', 'house', 'film', 'car', 'tree']

# Keep only the candidate words that occur in at least one training review.
existence = [check_existence_in_train(word, df2_train) for word in words_with_no_meaning]
biased_words = [word for word, present in zip(words_with_no_meaning, existence) if present]

In [None]:
# Training labels. Code below indexes them as y[i] for i in range(len(y)),
# which relies on df2_train having a 0..n-1 index.
y = df2_train['sentiment']
def word_existence_list(word, df):
    """Build a 0/1 indicator vector telling which reviews contain ``word``.

    Args:
        word: Token to look for.
        df: Mapping / DataFrame whose ``'review'`` entry iterates over reviews.

    Returns:
        np.ndarray: float array with one entry per review, 1.0 where the
        review contains the word and 0.0 elsewhere.
    """
    indicators = [float(word in sentence) for sentence in df['review']]
    return np.array(indicators, dtype=float)

# Indicator features: one column per candidate word, 1.0 where the review contains it.
biased_features_with_no_meaning = pd.DataFrame(
    {word: word_existence_list(word, df2_train) for word in words_with_no_meaning})
biased_features_with_no_meaning_test = pd.DataFrame(
    {word: word_existence_list(word, df2_test) for word in words_with_no_meaning})

# Random forest that predicts sentiment from those word indicators alone.
clf = RandomForestClassifier(max_depth=6, random_state=0)
clf.fit(biased_features_with_no_meaning, df2_train['sentiment'])
rf_prediction_without_probability = clf.predict(biased_features_with_no_meaning)
rf_prediction_without_probability_test = clf.predict(biased_features_with_no_meaning_test)

train_hits = np.sum(rf_prediction_without_probability == df2_train['sentiment'].values)
print('Train accuracy is ', train_hits / len(rf_prediction_without_probability))
test_hits = np.sum(rf_prediction_without_probability_test == df2_test['sentiment'].values)
print('Test accuracy is ', test_hits / len(rf_prediction_without_probability_test))

# Cross-validated class probabilities for every training row.
rf_prediction_with_probability = cross_val_predict(
    clf, biased_features_with_no_meaning, df2_train['sentiment'],
    method='predict_proba', verbose=3, n_jobs=1)

# Propensity = probability the forest assigned to each row's own label.
propensity = np.array([class_probabilities[y[row]]
                       for row, class_probabilities in enumerate(rf_prediction_with_probability)])
print(np.mean(np.log(propensity)))
np.save('propensity.npy', propensity)

# Probability of class 1 (and its complement) per row, recovered from the propensity.
prob_1_l = np.where(y == 1, propensity, 1 - propensity)
prob_0_l = 1 - prob_1_l


def calculate_weight_fraction(prob_1):
    """Ratio of total class-1 weight to total class-0 weight for a given ``prob_1``.

    Reads the module-level ``prob_1_l``, ``prob_0_l`` and ``y``. Per-row
    weights ``w1`` / ``w0`` are computed from ``prob_1`` and its complement;
    ``w1`` is summed over rows labelled 1 and ``w0`` over rows labelled 0.

    Args:
        prob_1: Candidate probability for class 1 (its complement is used for class 0).

    Returns:
        Sum of ``w1`` over class-1 rows divided by sum of ``w0`` over class-0 rows.
    """
    prob_0 = 1 - prob_1
    normaliser = prob_0 * prob_1_l + prob_1 * prob_0_l
    w1 = 1 / (prob_0 * prob_1_l / normaliser)
    w0 = 1 / (prob_1 * prob_0_l / normaliser)
    # Summed sequentially in row order.
    positive_total = sum(weight for weight, label in zip(w1, y) if label == 1)
    negative_total = sum(weight for weight, label in zip(w0, y) if label == 0)
    return positive_total / negative_total


# Ratio of positive to negative labels in the training data.
positive_count = np.sum(y)
prior_fraction = positive_count / (len(y) - positive_count)
l, r = 0, 1
thr = 0.00000000001
step = 100

# Bisection on [0, 1]: move the lower bound up while the weighted class ratio
# is still below the label ratio, otherwise pull the upper bound down.
for _ in range(step):
    m1 = l + (r - l) / 2
    l, r = (m1, r) if calculate_weight_fraction(m1) < prior_fraction else (l, m1)

# Per-row weights at the last midpoint, picked by each row's label.
m0 = 1 - m1
mixture = m0 * prob_1_l + m1 * prob_0_l
w1 = 1 / (m0 * prob_1_l / mixture)
w0 = 1 / (m1 * prob_0_l / mixture)
weight_for_training_set = np.where(y == 1, w1, w0)
# Rescale so the weights average to 1.
weight_for_training_set = weight_for_training_set / np.mean(weight_for_training_set)
print(pd.DataFrame(weight_for_training_set).describe())

In [None]:
# Second model with the same architecture, trained with per-sample weights.
new_model = build_model(wv_model.wv.vectors, maxlen)
new_model.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])

# Same data and schedule as the original model; only sample_weight is added.
new_history = new_model.fit(
    pad_train,
    df2_train['sentiment'],
    batch_size=100,
    epochs=6,
    sample_weight=weight_for_training_set,
    validation_data=(pad_test, df2_test['sentiment']))

In [None]:
f1_score_org, accuracy_org = [], []
f1_score_new, accuracy_new = [], []

# Evaluation sets, in the order the scores are stored:
# popcorn test, popcorn train, IMDB test.
evaluation_sets = [
    (pad_popcorn_test, df1_test['sentiment']),
    (pad_popcorn_train, df1['sentiment']),
    (pad_test, df2_test['sentiment']),
]
# Each model with the two lists that collect its scores.
scoreboards = [
    (model, f1_score_org, accuracy_org),
    (new_model, f1_score_new, accuracy_new),
]
# For every dataset, score the original model first, then the reweighted one.
for eval_pad, eval_labels in evaluation_sets:
    for evaluated_model, f1_scores, accuracies in scoreboards:
        a, _, c = model_performance_test(evaluated_model, eval_pad, eval_labels)
        f1_scores.append(a)
        accuracies.append(c)

In [None]:
# Dataset labels in the same order the accuracies were collected.
dataset_names = [
    'Bag of Words Meets Bags of Popcorn (test)',
    'Bag of Words Meets Bags of Popcorn (train)',
    'IMDB Review Dataset (test)',
]
# Long-format table: the three original-model rows first, then the debiased ones.
accuracy_data = pd.DataFrame({
    'Accuracy': accuracy_org + accuracy_new,
    'Model': ['Original Model'] * 3 + ['Debiased Model'] * 3,
    'Datasets': dataset_names * 2,
})
sb.barplot(x='Datasets', y='Accuracy', hue='Model', data=accuracy_data)
plt.show()


In [None]:
# Mean prediction shift per probe word, for the original and the reweighted model.
difference_dict = {}
new_difference_dict = {}
for word in tqdm(['movie', 'house', 'film', 'car', 'tree', 'which', 'would', 'could']):
    for target_dict, evaluated_model in ((difference_dict, model),
                                         (new_difference_dict, new_model)):
        effect = compute_effect_of_word(word, df2_test, pad_test, evaluated_model)
        # compute_effect_of_word returns None when the word is absent from the
        # test reviews; store NaN instead of letting np.mean(None) raise a
        # TypeError, so every probe word keeps an entry in both dicts.
        target_dict[word] = np.nan if effect is None else np.mean(effect)

print(difference_dict)
print(new_difference_dict)

In [None]:
validate_word_list = ['movie', 'house', 'film', 'car', 'tree', 'which', 'would', 'could']
word_count = len(validate_word_list)

# Long-format table: all original-model rows first, then the debiased-model rows.
original_effects = [difference_dict[word] for word in validate_word_list]
debiased_effects = [new_difference_dict[word] for word in validate_word_list]
difference_dataframe = pd.DataFrame({
    'Words': validate_word_list * 2,
    'Prediction Difference': original_effects + debiased_effects,
    'Model': ['Original Model'] * word_count + ['Debiased Model'] * word_count,
})

plt.figure(figsize=(15, 11))
sb.set(font_scale=2.4, style='white', )
# One RGB colour per model, in hue order.
model_palette = [(68/256,114/256,196/256), (237/256,125/256,49/256)]
effect_axes = sb.barplot(x='Words', y='Prediction Difference', hue='Model',
                         data=difference_dataframe, palette=model_palette)
effect_axes.set_title('The Effect of Words on Prediction')
plt.savefig('./images/difference_comparison.png')
plt.show()

In [None]:
# Load the hotel review table.
reviews_df = pd.read_csv("./datasets/Hotel_Reviews.csv")
# Join the negative and positive parts of each review into one text.
reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df["Positive_Review"]
# Label is 1 when the reviewer score is below 9.75, otherwise 0.
# NOTE(review): for the IMDB data 1 encodes 'pos'; here 1 marks the *lower*
# scores - confirm this direction is intended.
reviews_df["sentiment"] = (reviews_df["Reviewer_Score"] < 9.75).astype(int)
# Keep only the two columns used downstream.
reviews_df = reviews_df[["review", "sentiment"]]
# Keep each row independently with probability 0.5 / 50 (about 1% of the rows).
keep_mask = np.random.uniform(0, 50, len(reviews_df)) < 0.5
reviews_df = reviews_df[keep_mask]
reviews_df['review'] = reviews_df.review.apply(clean_text)
reviews_df = reviews_df.reset_index(drop=True)
pad_extended_test = pad_sequences(
    sequences=vectorize_data(reviews_df['review'], vocab=wv_model.wv.vocab),
    maxlen=maxlen, padding='post')

In [None]:
# Score both models on the hotel-review sample prepared in the previous cell.
# The second call is the cell's last expression, so its returned tuple is
# also shown as the cell output.
model_performance_test(model, pad_extended_test, reviews_df['sentiment'])
model_performance_test(new_model, pad_extended_test, reviews_df['sentiment'])

In [None]:
# Mean prediction shift of the original model for every candidate function word.
# Words that cannot be evaluated are printed and skipped (best effort).
all_difference_dict = {}
for word in tqdm(words_with_no_meaning):
    try:
        effect = compute_effect_of_word(word, df2_test, pad_test, model)
    except Exception as error:
        # `except Exception` instead of a bare `except:` so that interrupting
        # the kernel (KeyboardInterrupt) still stops this long loop; the
        # reason for skipping the word is reported next to it.
        print(word, repr(error))
        continue
    if effect is None:
        # The word does not occur in the test reviews: nothing to average.
        print(word)
        continue
    all_difference_dict[word] = np.mean(effect)

In [None]:
width = 0.2
plt.rcParams['figure.figsize'] = (16.0, 12.0)
# One bar per evaluated word: absolute value of its mean prediction shift.
evaluated_words = list(all_difference_dict.keys())
plt.bar(range(len(all_difference_dict)), np.abs(np.array(list(all_difference_dict.values()))),
        width=2*width, label='Original Model', fc='blue',
        tick_label=evaluated_words)  # name each bar; bare positions are unreadable
# The plotted values are differences between model predictions (with and
# without the word), not accuracies, so the axis is labelled accordingly.
plt.ylabel('Prediction Difference (absolute value)')
plt.savefig('./images/org_difference.png')
