In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler
#from make_pairwise_gold_metric_scores import compute_metrics 

# Show long text columns (comment bodies) almost untruncated in DataFrame output.
# The fully qualified option key is used because abbreviated keys rely on pandas'
# unique-substring matching, which can break if similarly named options are added.
pd.set_option('display.max_colwidth', 5000)

In [8]:
# Load word2vec model for this specific course.
# Context managers ensure both file handles are closed once the pickles are read.
# NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
with open("Models/w2v_matrix.p", "rb") as matrix_file:
  w2v_matrix = pickle.load(matrix_file)
with open("Models/vocab_w2v.p", "rb") as vocab_file:
  vocab = np.array(pickle.load(vocab_file))

In [22]:
# Display the loaded embedding matrix (a bare last expression renders its repr).
# NOTE(review): the saved output of this cell is an IndexError and its execution count (22)
# is out of sequence with the neighbouring cells, so the recorded output is presumably
# stale - re-run after a kernel restart to confirm.
w2v_matrix

IndexError: invalid index to scalar variable.

In [7]:
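# Sanity check for the word2vec model (currently disabled): the cosine similarity between
# "queen" and "king" - "man" + "woman" should be near 1.
# Note: the last line below takes a raw dot product; pass both vectors to cosinesimilarity() to get a cosine.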

# def cosinesimilarity(u, v):
#   # u - embedding (vector)
#   # v - embedding (vector)
#   return np.dot(u,v) / (np.sqrt(np.dot(u, u)) * np.sqrt(np.dot(v,v)))
# np.dot(w2v_matrix[np.where(vocab == "queen")[0][0]], w2v_matrix[np.where(vocab == "king")[0][0]] - w2v_matrix[np.where(vocab == "man")[0][0]] + w2v_matrix[np.where(vocab == "woman")[0][0]] )

In [9]:
# Hand-written toy comments: six near-duplicate comments on a religious theme that differ
# by a word or a sentence, followed by one comment on an unrelated topic.
# NOTE(review): no cell shown here references test_comments - presumably kept for manually
# eyeballing similarity scores; confirm or remove.
test_comments = [
  "Jesus is great. I love Jesus and the holy book. I think God is amazing. I agree with religion",
  "Jesus is great. I love Jesus and the holy book. I think God is amazing.",
  "Jesus is the best. I love Jesus and the holy book. I think God is amazing. I agree with religion",
  "Jesus is awesome. I love Jesus and the holy book. I think God is amazing. I agree with religion",
  "Christ is great. I love Christ and the holy book. I think God is amazing. I agree with religion",
  "Christ is great. I love Christ and the holy book. I agree with religion",
  "Donald Trump is a president with tons of electrical equipment and lightbulbs and random things.",
]

In [9]:
def embedding_tfidf(embedding_matrix, embedding_vocab, gold_data, gold_matrix, test_name):
  """Build tf-idf-weighted embedding vectors per comment and score their pairwise similarity.

  Each text in ``gold_data.body`` is turned into a tf-idf vector over
  ``embedding_vocab``, projected through ``embedding_matrix`` and L2-normalized.
  The pairwise dot products (cosine similarities) are min-max scaled, then averaged
  separately over the entries weighted by ``gold_matrix`` and by ``1 - gold_matrix``;
  both averages and their difference are printed.

  Args:
    embedding_matrix: Embedding table that the tf-idf matrix is multiplied with, so it
      needs one row per entry of ``embedding_vocab``.
    embedding_vocab: Fixed vocabulary handed to ``TfidfVectorizer``.
    gold_data: Object exposing the comment texts as ``gold_data.body.values``.
    gold_matrix: Pairwise gold labels, multiplied elementwise with the similarity matrix;
      the product must expose ``.values`` (presumably a 0/1 same-cluster DataFrame - TODO confirm).
    test_name: Label printed in front of the score line.

  Returns:
    The min-max scaled pairwise similarity matrix with shape (# comments, # comments).

  Raises:
    AssertionError: If any comment vector is not unit length after L2 normalization
      (for instance an all-zero row, which normalization cannot rescale).
  """
  # Get tfidf counts for each comment as a matrix C with shape (# comments, size of w2v (embedding) vocab)
  vec = TfidfVectorizer(vocabulary=embedding_vocab, stop_words='english')
  C = vec.fit_transform(gold_data.body.values)

  # Compute tfidf bag of words for each comment as matrix A with shape (# comments, embedding dimension)
  A = C.dot(embedding_matrix)
  A = normalize(A, norm='l2')

  # Verify that EVERY row of A is a unit vector. Checking only the mean of the row norms
  # would let a few non-unit (e.g. all-zero) rows slip through on large inputs.
  row_norms = np.linalg.norm(A, axis=1)
  assert np.allclose(row_norms, 1.0, rtol=0.0, atol=0.0001), \
    "embedding_tfidf: some comment vectors are not unit length after L2 normalization"

  # We compute pairwise cosine similarity with dot product since A is normalized.
  pairwise_cosine_similarity = np.dot(A, A.transpose())
  # NOTE(review): MinMaxScaler rescales each column independently, so the scaled matrix is
  # generally no longer symmetric. Kept as is because downstream scores depend on it.
  pairwise_cosine_similarity = MinMaxScaler().fit_transform(pairwise_cosine_similarity)

  # Compute avg cosine similarity for same cluster comments and different cluster comments and subtract.
  same_cluster_avg_score = np.multiply(pairwise_cosine_similarity, gold_matrix).values.sum() / gold_matrix.values.sum()
  diff_cluster_avg_score = np.multiply(pairwise_cosine_similarity, 1-gold_matrix).values.sum() / (1-gold_matrix).values.sum()
  print(test_name, "score:", same_cluster_avg_score, "-", diff_cluster_avg_score, "=", same_cluster_avg_score - diff_cluster_avg_score)

  return pairwise_cosine_similarity

In [10]:
# Gold-standard training split: the comment table and its pairwise gold matrix.
df_gold_train = pd.read_csv(
  'gold_data_train_HarvardX__HDS_3221_2X__1T2016.csv.gz',
  compression='gzip',
)
gold_matrix_train = pd.read_csv(
  'gold_matrix_train_HarvardX__HDS_3221_2X__1T2016.csv.gz',
  compression='gzip',
)

# Score the embedding/tf-idf similarity on the training split and keep the similarity matrix.
pairwise_cosine_similarity_train = embedding_tfidf(
  embedding_matrix=w2v_matrix,
  embedding_vocab=vocab,
  gold_data=df_gold_train,
  gold_matrix=gold_matrix_train,
  test_name="Train",
)

Train score: 0.567915692812 - 0.513427819284 = 0.0544878735283


In [11]:
# Gold-standard test split: the comment table and its pairwise gold matrix.
df_gold_test = pd.read_csv(
  'gold_data_test_HarvardX__HDS_3221_2X__1T2016.csv.gz',
  compression='gzip',
)
gold_matrix_test = pd.read_csv(
  'gold_matrix_test_HarvardX__HDS_3221_2X__1T2016.csv.gz',
  compression='gzip',
)

# Score the embedding/tf-idf similarity on the test split and keep the similarity matrix.
pairwise_cosine_similarity_test = embedding_tfidf(
  embedding_matrix=w2v_matrix,
  embedding_vocab=vocab,
  gold_data=df_gold_test,
  gold_matrix=gold_matrix_test,
  test_name="Test",
)

Test score: 0.585525126153 - 0.525324585547 = 0.0602005406061


In [12]:
# Evaluate the similarity matrices with compute_metrics, passing the train split first and
# the test split second, then show the returned scores as a one-column ("Score") table.
# NOTE(review): compute_metrics is neither defined nor imported in the cells shown - the only
# reference is the commented-out import from make_pairwise_gold_metric_scores in the import
# cell - so this cell raises NameError on a fresh kernel unless that import is restored.
metrics = compute_metrics(pairwise_cosine_similarity_train, pairwise_cosine_similarity_test, gold_matrix_train, df_gold_train, gold_matrix_test, df_gold_test)
pretty_metrics = pd.DataFrame(pd.Series(metrics), columns = ["Score"])
pretty_metrics

Median Quantile (Rank) Difference score: 0.671363553268 - 0.466419832811 = 0.204943720457
Pairwise Binary Logistic Regression Accuracy score: 0.815278216339

The next test uses parameter optimization over a random forest
classifier's parameters and may take 30s to 2 min to run.

Pairwise Binary Random Forest Accuracy score: 0.816372705668


Unnamed: 0,Score
logreg_acc_pairwise_binary,0.815278
median_quantile_diff,0.204944
random_forest_acc_pairwise_binary,0.816373


In [13]:
# Switching train and test: the same compute_metrics call, but with the test split passed in
# the train positions and the train split in the test positions.
# NOTE(review): this overwrites `metrics` and `pretty_metrics` from the previous evaluation, and
# compute_metrics is not imported in the cells shown (its import is commented out).

metrics = compute_metrics(pairwise_cosine_similarity_test, pairwise_cosine_similarity_train, gold_matrix_test, df_gold_test, gold_matrix_train, df_gold_train)
pretty_metrics = pd.DataFrame(pd.Series(metrics), columns = ["Score"])
pretty_metrics

Median Quantile (Rank) Difference score: 0.638136071736 - 0.468070575925 = 0.170065495812
Pairwise Binary Logistic Regression Accuracy score: 0.787785281487

The next test uses parameter optimization over a random forest
classifier's parameters and may take 30s to 2 min to run.

Pairwise Binary Random Forest Accuracy score: 0.779643632332


Unnamed: 0,Score
logreg_acc_pairwise_binary,0.787785
median_quantile_diff,0.170065
random_forest_acc_pairwise_binary,0.779644
