In [407]:
import sklearn, pandas as pd, numpy as np, seaborn as sns
from sklearn.decomposition import PCA, FastICA, KernelPCA
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from scipy import stats
from math import ceil
import time
import scipy.stats as stats
import numpy as np

In [725]:
# Read the corpus, one sentence per line. The context manager guarantees the
# handle is closed even if reading fails part-way.
with open("yelp/train.txt", "r", encoding = "utf8") as file:
    file_data = file.read()
lines = file_data.splitlines()

# Only use sentences that contain the keyword (plain substring match).
keyword = "pizza"
data = [line for line in lines if keyword in line]

# Append an out-of-domain probe sentence to see if it pops to the top of the
# sentence rankings. Later cells look it up by this exact text.
data.append("Colorless green ideas sleep furiously .")
#data.append("Écriteau lumineux avec écrit pizza Kebab .")
#data.append("Have a nice day !")
#data = formatting(data)

# Generate one embedding vector per sentence; `embed` is read as a global by
# the dim_reduce_* helpers defined below.
emb = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
#emb = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embed = emb.encode(data, show_progress_bar = True)

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

In [705]:
print(embed.shape)
print(data[-1])

(2146, 384)
Colorless green ideas sleep furiously .


In [706]:
def dim_reduce_kpca(num_comps, kernel = "linear", return_model=False):
    """Project the global `embed` matrix with Kernel PCA.

    The embeddings are standardised with StandardScaler first, then a
    KernelPCA (arpack solver, random_state=2023, copy_X=False) is fitted on
    the standardised matrix and used to transform that same matrix.

    Args:
        num_comps: number of components to keep.
        kernel: kernel name forwarded to KernelPCA.
        return_model: when True, also return the fitted model.

    Returns:
        The reduced array, or the list [reduced array, fitted model] when
        `return_model` is True.
    """
    scaled = StandardScaler().fit_transform(embed)
    reducer = KernelPCA(
        n_components=num_comps,
        kernel=kernel,
        random_state=2023,
        eigen_solver="arpack",
        copy_X=False,
    )
    fitted = reducer.fit(scaled)
    projected = fitted.transform(scaled)
    return [projected, fitted] if return_model else projected
def dim_reduce_PCA(num_comps, print_data=False, return_model=False):
    """Project the global `embed` matrix with PCA.

    Args:
        num_comps: number of principal components to keep.
        print_data: when True, print the standardised input and the reduced
            output as DataFrames.
        return_model: when True, also return the fitted PCA model.

    Returns:
        The reduced array, or the list [reduced array, fitted model] when
        `return_model` is True.
    """
    # Standardise first because this is what the reference paper does (they
    # say that PCA assumes a Gaussian distribution and that the features
    # must be normalized).
    scaled = StandardScaler().fit_transform(embed)
    fitted = PCA(n_components=num_comps, random_state = 2023).fit(scaled)
    projected = fitted.transform(scaled)

    if print_data:
        for heading, matrix in (("original: \n", scaled),
                                ("\n dimensionally reduced: \n", projected)):
            print(heading)
            print(pd.DataFrame(matrix))

    return [projected, fitted] if return_model else projected


def dim_reduce_ICA(num_comps, print_data=False, return_model=False):
    """Project the global `embed` matrix with FastICA.

    Unlike the PCA / Kernel PCA helpers, the embeddings are passed to the
    model as-is (no StandardScaler step).

    Args:
        num_comps: number of independent components to extract.
        print_data: when True, print the input and the reduced output as
            DataFrames.
        return_model: when True, also return the fitted FastICA model.

    Returns:
        The reduced array, or the list [reduced array, fitted model] when
        `return_model` is True.
    """
    settings = dict(
        n_components = num_comps,
        whiten = 'arbitrary-variance',
        max_iter = 300,
        random_state = 2023,  # fixed seed so the results are reproducible
    )
    fitted = FastICA(**settings).fit(embed)
    projected = fitted.transform(embed)

    if print_data:
        for heading, matrix in (("original: \n", embed),
                                ("\n dimensionally reduced: \n", projected)):
            print(heading)
            print(pd.DataFrame(matrix))

    return [projected, fitted] if return_model else projected


In [707]:
#embed_red_kpca_cos, kpca_model_cos = dim_reduce_kpca(num_comps_pca, kernel="cosine", return_model=True)

In [708]:
def num_bins_Scotts(data):
    """Number of histogram bins for each column of `data` via Scott's rule.

    For every column the bin width is 3.49 * std * n**(-1/3), where n is the
    number of rows, and the bin count is ceil(range / width).

    Args:
        data: 2-D array, one column per feature.

    Returns:
        List with one int per column.
    """
    n_rows = data.shape[0]
    num_bins = []
    for i in range(data.shape[1]):
        column = data[:, i]
        value_range = np.max(column) - np.min(column)
        if value_range == 0:
            # A constant column has zero std, hence zero bin width; the
            # original 0/0 division produced NaN and ceil() raised. A single
            # bin is the only meaningful histogram for such a column.
            num_bins.append(1)
            continue
        bin_width = 3.49 * np.std(column) * (n_rows ** (-1 / 3))
        num_bins.append(ceil(value_range / bin_width))
    return num_bins

In [709]:
def num_bins_Freedman_Diaconis(data):
    """Number of histogram bins for each column via the Freedman-Diaconis rule.

    For every column the bin width is 2 * IQR * n**(-1/3), where n is the
    number of rows, and the bin count is ceil(range / width).

    Degenerate columns, which previously made ceil() raise on inf/NaN:
      * constant column (range == 0): 1 bin;
      * IQR == 0 but range > 0 (at least half of the values identical):
        falls back to Sturges' rule, ceil(log2(n) + 1).

    Args:
        data: 2-D array, one column per feature.

    Returns:
        List with one int per column.
    """
    n_rows = data.shape[0]
    num_bins = []
    for i in range(data.shape[1]):
        column = data[:, i]
        value_range = np.max(column) - np.min(column)
        if value_range == 0:
            num_bins.append(1)
            continue
        h = 2 * stats.iqr(column) * (n_rows ** (-1 / 3))
        if h == 0:
            # The width estimate collapsed; use a rule that only depends on n.
            num_bins.append(ceil(np.log2(n_rows) + 1))
        else:
            num_bins.append(ceil(value_range / h))
    return num_bins

In [710]:
def histogram(data, bins, frequency = False):
    """Build one np.histogram per column of `data`.

    Args:
        data: 2-D array, one column per feature.
        bins: per-column `bins` argument for np.histogram (bins[i] is used
            for column i).
        frequency: when True return raw counts; when False (default) return
            density-normalised values.

    Returns:
        Tuple (all_hist, all_bins): two lists holding, per column, the
        histogram values and the corresponding bin edges.
    """
    use_density = not frequency
    per_column = [
        np.histogram(data[:, col], bins = bins[col], density = use_density)
        for col in range(data.shape[1])
    ]
    all_hist = [values for values, _ in per_column]
    all_bins = [edges for _, edges in per_column]
    return all_hist, all_bins

In [711]:
#hists_kpca_cos, bins_kpca_cos = histogram(data_reduce_kpca_cos, num_bins_Freedman_Diaconis(data_reduce_kpca_cos))

In [712]:
def find_bin(bins, value):
    """Return the index of the histogram bin that contains `value`.

    Follows np.histogram's convention: every bin is half-open [left, right)
    except the last one, which also includes its right edge. The previous
    linear scan put a value lying exactly on an interior edge into the bin
    to its left, i.e. a different bin from the one np.histogram counted it in.

    Args:
        bins: sorted sequence of bin edges (len(bins) - 1 bins).
        value: scalar to locate.

    Returns:
        int in [0, len(bins) - 2]. Values outside the edge range are clamped
        to the first / last bin (values above the last edge used to raise
        IndexError).
    """
    last_bin = len(bins) - 2
    # side="right" -> number of edges <= value; minus one gives the bin whose
    # left edge is the greatest edge not exceeding `value`.
    idx = int(np.searchsorted(bins, value, side="right")) - 1
    return min(max(idx, 0), last_bin)

In [713]:
def probability(histograms, bins, data):
    """Probability mass of the histogram bin each data value falls into.

    For column c and row r the result is histograms[c][k] * width_k, where k
    is the bin of bins[c] that contains data[r, c]. With density-normalised
    histograms this is the fraction of samples in that bin.

    Bin lookup follows np.histogram's convention (half-open bins, last bin
    closed on the right), so a value lying exactly on an interior edge is
    matched with the bin it was actually counted in. Values outside the edge
    range are clamped to the first / last bin.

    Args:
        histograms: per-column sequences of bin heights.
        bins: per-column sequences of bin edges.
        data: 2-D array of shape (rows, columns).

    Returns:
        Array of shape (rows, columns) with one probability per entry.
    """
    n_rows, n_cols = data.shape
    probs = np.empty((n_rows, n_cols), dtype=float)
    for col in range(n_cols):
        edges = np.asarray(bins[col])
        heights = np.asarray(histograms[col])
        widths = np.diff(edges)
        # Vectorised lookup over the whole column: count of edges <= value,
        # minus one, is the index of the containing bin.
        idx = np.searchsorted(edges, data[:, col], side="right") - 1
        idx = np.clip(idx, 0, len(heights) - 1)
        probs[:, col] = heights[idx] * widths[idx]
    return probs

In [714]:
#probs_kpca_cos = probability(hists_kpca_cos, bins_kpca_cos, data_reduce_kpca_cos)

In [715]:
def sentence_surprisals(probs):
    """Total surprisal (in bits) of each row of `probs`.

    The surprisal of a row is the sum over its columns of -log2(p).

    Args:
        probs: 2-D array of probabilities, shape (rows, columns).

    Returns:
        1-D array with one surprisal value per row. A zero probability
        yields an infinite surprisal for that row.
    """
    # Single vectorised reduction instead of a Python loop over rows.
    return -np.log2(np.asarray(probs)).sum(axis=1)

In [716]:
#sen_surprisals_kpca_cos = sentence_surprisals(probs_kpca_cos)

In [70]:
# Sweep the number of ICA components (1..39) and record, for each setting,
# the rank of the probe sentence in the surprisal ordering (0 = most
# surprising).
start_all = time.time() #Can make this much better for PCA and KPCA
weird_sen_rankings = []
for i in range(1, 40):
    embed_red, model = dim_reduce_ICA(i, return_model=True)
    hists, bins = histogram(embed_red, num_bins_Freedman_Diaconis(embed_red))
    probs = probability(hists, bins, embed_red)
    sen_surprisals = sentence_surprisals(probs)
    # Sentence indices ordered from highest to lowest surprisal.
    rank_surprisal = np.flip(np.argsort(sen_surprisals))
    # The enumerate position *is* the rank, so there is no need to search
    # rank_surprisal a second time for the index just visited.
    for position, j in enumerate(rank_surprisal):
        if data[j] == "Colorless green ideas sleep furiously .":
            weird_sen_rankings.append(position)
            break
end_all = time.time()
print(end_all-start_all)



KeyboardInterrupt: 

In [17]:
# Time a 347-component cosine Kernel PCA reduction.
start = time.time()
embed_red, model = dim_reduce_kpca(347, kernel = "cosine", return_model = True)
end = time.time()
# Keep the elapsed seconds; assigning the result of print() stored None.
time_total = end - start
print(time_total)

# Rank all sentences from highest to lowest surprisal.
hists, bins = histogram(embed_red, num_bins_Freedman_Diaconis(embed_red))
probs = probability(hists, bins, embed_red)
sen_surprisals = sentence_surprisals(probs)
rank_surprisal = np.flip(np.argsort(sen_surprisals))
sen_rankings = [data[i] for i in rank_surprisal]
print(pd.DataFrame(sen_rankings))
# Rank of the probe sentence, then the sentence currently sitting at rank 10.
print(np.where(np.array(sen_rankings) == "Colorless green ideas sleep furiously .")[0][0])
print(sen_rankings[10])

17.81806707382202
                                                       0
0                          the food was primarily bleh .
1                        sorry posted wrong pic for food
2      great food with bojangles , krispy kreme and j...
3                                  : - ( but good food .
4      but the food is gggggggrrrrreeeeeeeaaaaaaatttt...
...                                                  ...
10924                                the food was good .
10925                                the food was good .
10926                                the food was good .
10927                                the food was good .
10928                                the food was good .

[10929 rows x 1 columns]
10
Colorless green ideas sleep furiously .


In [69]:
weird_sen_rankings = np.array(weird_sen_rankings)
print(np.where(weird_sen_rankings == 0)[0]) #for rbf have to look from bottom up?
print(np.argmin(weird_sen_rankings))
print(weird_sen_rankings)

[0]
0
[ 0  1  3  1  2  1  1  2  3  3  6  9  7 11 13 14 14 15 17 22 24 25 28 30
 32 34 33 34 34 34 34 35 35 35 35 36 36 37 38]


In [726]:
start = time.time()
embed_red, model = dim_reduce_PCA(75, return_model = True)
end = time.time()
print(end-start)
# print(np.mean(num_bins_Freedman_Diaconis(embed_red)))
# print(np.mean(num_bins_Scotts(embed_red)))

0.0789179801940918


In [727]:
# Score every sentence under the reduction currently held in `embed_red`
# (the PCA run, going by the output file name): per-component histograms with
# bin counts from num_bins_Scotts -> bin probabilities -> summed surprisal.
hists, bins = histogram(embed_red, num_bins_Scotts(embed_red))
probs = probability(hists, bins, embed_red)
sen_surprisals = sentence_surprisals(probs)
# Sentence indices from highest to lowest surprisal.
rank_surprisal = np.flip(np.argsort(sen_surprisals))
sen_rankings = [data[i] for i in rank_surprisal]
# One sentence per line. The `with` block closes the file on exit, so no
# explicit close() is needed.
# NOTE(review): written with the platform default encoding although the
# corpus is read as UTF-8 — confirm what the consumers of this file expect.
with open("PCA_Sentence_Rankings", 'w') as f:
    f.writelines(sentence + "\n" for sentence in sen_rankings)

In [728]:
# Rank of every sentence of `data` (in `data` order) within `sen_rankings`,
# used as input to Kendall's tau. Duplicated sentence texts all receive the
# rank of their first occurrence, as before. One lookup table replaces
# rebuilding and scanning a NumPy array for every sentence; a sentence missing
# from the ranking now raises KeyError instead of IndexError.
first_position = {}
for position, sentence in enumerate(sen_rankings):
    first_position.setdefault(sentence, position)
pca_tau_ranking = [first_position[sentence] for sentence in data]

In [730]:
print(np.where(np.array(sen_rankings) == "Colorless green ideas sleep furiously .")[0][0])
#print(np.where(np.array(sen_rankings) == "decidi comer uma pizza , j que eu tava com pouco dinheiro .")[0][0])
#print(np.where(np.array(sen_rankings) == "Écriteau lumineux avec écrit pizza Kebab .")[0][0])
#print(np.where(np.array(sen_rankings) == "Have a nice day !")[0][0])

0


In [731]:
start = time.time()
embed_red, model = dim_reduce_ICA(75, return_model = True)
end = time.time()
print(end-start)

0.4031641483306885


In [734]:
# Score every sentence under the reduction currently held in `embed_red`
# (the ICA run, going by the output file name): per-component histograms with
# bin counts from num_bins_Scotts -> bin probabilities -> summed surprisal.
hists, bins = histogram(embed_red, num_bins_Scotts(embed_red))
probs = probability(hists, bins, embed_red)
sen_surprisals = sentence_surprisals(probs)
# Sentence indices from highest to lowest surprisal.
rank_surprisal = np.flip(np.argsort(sen_surprisals))
sen_rankings = [data[i] for i in rank_surprisal]
# One sentence per line. The `with` block closes the file on exit, so no
# explicit close() is needed.
# NOTE(review): written with the platform default encoding although the
# corpus is read as UTF-8 — confirm what the consumers of this file expect.
with open("ICA_Sentence Rankings", 'w') as f:
    f.writelines(sentence + "\n" for sentence in sen_rankings)

In [735]:
# Rank of every sentence of `data` (in `data` order) within `sen_rankings`,
# used as input to Kendall's tau. Duplicated sentence texts all receive the
# rank of their first occurrence, as before. One lookup table replaces
# rebuilding and scanning a NumPy array for every sentence; a sentence missing
# from the ranking now raises KeyError instead of IndexError.
first_position = {}
for position, sentence in enumerate(sen_rankings):
    first_position.setdefault(sentence, position)
ica_tau_ranking = [first_position[sentence] for sentence in data]

In [736]:
tau, p_value = stats.kendalltau(pca_tau_ranking, ica_tau_ranking)
print(tau, p_value)

0.544957682516551 0.0


In [737]:
print(np.where(np.array(sen_rankings) == "Colorless green ideas sleep furiously .")[0][0])
#print(np.where(np.array(sen_rankings) == "decidi comer uma pizza , j que eu tava com pouco dinheiro .")[0][0])
#print(np.where(np.array(sen_rankings) == "Écriteau lumineux avec écrit pizza Kebab .")[0][0])
#print(np.where(np.array(sen_rankings) == "Have a nice day !")[0][0])

0


In [738]:
start = time.time()
embed_red, model = dim_reduce_kpca(75, kernel="poly", return_model = True)
end = time.time()
print(end-start)

0.2610464096069336


In [739]:
# Score every sentence under the reduction currently held in `embed_red`
# (the polynomial Kernel PCA run, going by the output file name):
# per-component histograms with bin counts from num_bins_Scotts ->
# bin probabilities -> summed surprisal.
hists, bins = histogram(embed_red, num_bins_Scotts(embed_red))
probs = probability(hists, bins, embed_red)
sen_surprisals = sentence_surprisals(probs)
# Sentence indices from highest to lowest surprisal.
rank_surprisal = np.flip(np.argsort(sen_surprisals))
sen_rankings = [data[i] for i in rank_surprisal]
# One sentence per line. The `with` block closes the file on exit, so no
# explicit close() is needed.
# NOTE(review): written with the platform default encoding although the
# corpus is read as UTF-8 — confirm what the consumers of this file expect.
with open("Polynomial KPCA Sentence Rankings", 'w') as f:
    f.writelines(sentence + "\n" for sentence in sen_rankings)

In [740]:
# Rank of every sentence of `data` (in `data` order) within `sen_rankings`,
# used as input to Kendall's tau. Duplicated sentence texts all receive the
# rank of their first occurrence, as before. One lookup table replaces
# rebuilding and scanning a NumPy array for every sentence; a sentence missing
# from the ranking now raises KeyError instead of IndexError.
first_position = {}
for position, sentence in enumerate(sen_rankings):
    first_position.setdefault(sentence, position)
poly_tau_ranking = [first_position[sentence] for sentence in data]

In [741]:
print(np.where(np.array(sen_rankings) == "Colorless green ideas sleep furiously .")[0][0])
#print(np.where(np.array(sen_rankings) == "decidi comer uma pizza , j que eu tava com pouco dinheiro .")[0][0])
#print(np.where(np.array(sen_rankings) == "Écriteau lumineux avec écrit pizza Kebab .")[0][0])
#print(np.where(np.array(sen_rankings) == "Have a nice day !")[0][0])

1


In [742]:
tau, p_value = stats.kendalltau(pca_tau_ranking, poly_tau_ranking)
print(tau, p_value)

0.6154716143297657 0.0


In [686]:
start = time.time()
embed_red, model = dim_reduce_kpca(8, kernel="rbf", return_model = True)
end = time.time()
print(end-start)

0.005132198333740234


In [687]:
# Score every sentence under the reduction currently held in `embed_red`
# (the RBF Kernel PCA run, going by the output file name): per-component
# histograms with bin counts from num_bins_Freedman_Diaconis ->
# bin probabilities -> summed surprisal.
hists, bins = histogram(embed_red, num_bins_Freedman_Diaconis(embed_red))
probs = probability(hists, bins, embed_red)
sen_surprisals = sentence_surprisals(probs)
# NOTE(review): ascending surprisal here (no np.flip), unlike the other
# ranking cells — presumably deliberate for the RBF kernel; confirm.
rank_surprisal = np.argsort(sen_surprisals)
sen_rankings = [data[i] for i in rank_surprisal]
# One sentence per line. The `with` block closes the file on exit, so no
# explicit close() is needed.
# NOTE(review): written with the platform default encoding although the
# corpus is read as UTF-8 — confirm what the consumers of this file expect.
with open("Gaussian RBF KPCA Sentence Rankings", 'w') as f:
    f.writelines(sentence + "\n" for sentence in sen_rankings)

In [688]:
#print(sen_rankings)
# Rank of every sentence of `data` (in `data` order) within `sen_rankings`,
# used as input to Kendall's tau. Duplicated sentence texts all receive the
# rank of their first occurrence, as before. One lookup table replaces
# rebuilding and scanning a NumPy array for every sentence; a sentence missing
# from the ranking now raises KeyError instead of IndexError.
first_position = {}
for position, sentence in enumerate(sen_rankings):
    first_position.setdefault(sentence, position)
rbf_tau_ranking = [first_position[sentence] for sentence in data]

In [690]:
print(np.where(np.array(sen_rankings) == "Colorless green ideas sleep furiously .")[0][0])
#print(np.where(np.array(sen_rankings) == "decidi comer uma pizza , j que eu tava com pouco dinheiro .")[0][0])
#print(np.where(np.array(sen_rankings) == "Écriteau lumineux avec écrit pizza Kebab .")[0][0])
print(np.where(np.array(sen_rankings) == "Have a nice day !")[0][0])

35
26


In [691]:
tau, p_value = stats.kendalltau(pca_tau_ranking, rbf_tau_ranking)
print(tau, p_value)

-0.2682926829268293 0.013472587734971984


In [692]:
start = time.time()
embed_red, model = dim_reduce_kpca(8, kernel="sigmoid", return_model = True)
end = time.time()
print(end-start)

0.0039980411529541016


In [693]:
# Score every sentence under the reduction currently held in `embed_red`
# (the sigmoid Kernel PCA run, going by the output file name): per-component
# histograms with bin counts from num_bins_Freedman_Diaconis ->
# bin probabilities -> summed surprisal.
hists, bins = histogram(embed_red, num_bins_Freedman_Diaconis(embed_red))
probs = probability(hists, bins, embed_red)
sen_surprisals = sentence_surprisals(probs)
# Sentence indices from highest to lowest surprisal.
rank_surprisal = np.flip(np.argsort(sen_surprisals))
sen_rankings = [data[i] for i in rank_surprisal]
# One sentence per line. The `with` block closes the file on exit, so no
# explicit close() is needed.
# NOTE(review): written with the platform default encoding although the
# corpus is read as UTF-8 — confirm what the consumers of this file expect.
with open("Sigmoid KPCA Sentence Rankings", 'w') as f:
    f.writelines(sentence + "\n" for sentence in sen_rankings)

In [694]:
# Rank of every sentence of `data` (in `data` order) within `sen_rankings`,
# used as input to Kendall's tau. Duplicated sentence texts all receive the
# rank of their first occurrence, as before. One lookup table replaces
# rebuilding and scanning a NumPy array for every sentence; a sentence missing
# from the ranking now raises KeyError instead of IndexError.
first_position = {}
for position, sentence in enumerate(sen_rankings):
    first_position.setdefault(sentence, position)
sig_tau_ranking = [first_position[sentence] for sentence in data]

In [695]:
print(np.where(np.array(sen_rankings) == "Colorless green ideas sleep furiously .")[0][0])
#print(np.where(np.array(sen_rankings) == "decidi comer uma pizza , j que eu tava com pouco dinheiro .")[0][0])
#print(np.where(np.array(sen_rankings) == "Écriteau lumineux avec écrit pizza Kebab .")[0][0])
print(np.where(np.array(sen_rankings) == "Have a nice day !")[0][0])

5
6


In [696]:
tau, p_value = stats.kendalltau(pca_tau_ranking, sig_tau_ranking)
print(tau, p_value)

0.3024390243902439 0.0053441835203513555


In [697]:
start = time.time()
embed_red, model = dim_reduce_kpca(8, kernel="cosine", return_model = True)
end = time.time()
print(end-start)

0.003999233245849609


In [698]:
# Score every sentence under the reduction currently held in `embed_red`
# (the cosine Kernel PCA run, going by the output file name): per-component
# histograms with bin counts from num_bins_Freedman_Diaconis ->
# bin probabilities -> summed surprisal.
hists, bins = histogram(embed_red, num_bins_Freedman_Diaconis(embed_red))
probs = probability(hists, bins, embed_red)
sen_surprisals = sentence_surprisals(probs)
# Sentence indices from highest to lowest surprisal.
rank_surprisal = np.flip(np.argsort(sen_surprisals))
sen_rankings = [data[i] for i in rank_surprisal]
# One sentence per line. The `with` block closes the file on exit, so no
# explicit close() is needed.
# NOTE(review): written with the platform default encoding although the
# corpus is read as UTF-8 — confirm what the consumers of this file expect.
with open("Cosine KPCA Sentence Rankings", 'w') as f:
    f.writelines(sentence + "\n" for sentence in sen_rankings)

In [699]:
# Rank of every sentence of `data` (in `data` order) within `sen_rankings`,
# used as input to Kendall's tau. Duplicated sentence texts all receive the
# rank of their first occurrence, as before. One lookup table replaces
# rebuilding and scanning a NumPy array for every sentence; a sentence missing
# from the ranking now raises KeyError instead of IndexError.
first_position = {}
for position, sentence in enumerate(sen_rankings):
    first_position.setdefault(sentence, position)
cos_tau_ranking = [first_position[sentence] for sentence in data]

In [700]:
print(np.where(np.array(sen_rankings) == "Colorless green ideas sleep furiously .")[0][0])
#print(np.where(np.array(sen_rankings) == "decidi comer uma pizza , j que eu tava com pouco dinheiro .")[0][0])
#print(np.where(np.array(sen_rankings) == "Écriteau lumineux avec écrit pizza Kebab .")[0][0])
print(np.where(np.array(sen_rankings) == "Have a nice day !")[0][0])

18
21


In [525]:
tau, p_value = stats.kendalltau(pca_tau_ranking, cos_tau_ranking)
print(tau, p_value)

0.41243737596624896 1.671373235731683e-180


In [456]:
print(sen_rankings)

['we decided to split a pizza .', 'twin a had ordered a special salad to go with his pizza .', "but fiori 's is pizza made in heaven .", 'we started with the white pizza and boy did it hit the spot .', "nick 's is standard pizza .", 'one of the best slices of pizza in charlotte .', 'i split the bbq pizza with a friend and we enjoyed it .', 'just a good straight forward mom and pop pizza pie .', 'the other half got the prosciutto and arugula pizza .', 'call me weird , but i love canned mushroom on my pizza .', "oh those pepperoni rolls are better than mancini 's pizza rolls .", "just had my first pizza from fiori 's .", 'my husband loves china sea , whereas i would rather order pizza : )', 'had this pizza been cooked , i feel like it would be subpar .', 'the pizza slices are as long as a newborn baby .', "tony 's is the best real ny pizza in charlotte , period .", 'sausage pizza was very good from brother .', 'pizza bagels are a must !', 'first , the medium pizza is huge .', 'hands down

In [325]:
# Load the IDF-based ranking, one sentence per line, with surrounding
# whitespace (including the trailing newline) stripped. The context manager
# closes the file, which was previously left open.
with open('IDF Sentence_Rankings', 'r') as f:
    idf_ranking = [line.strip() for line in f]

In [326]:
# Rank of every sentence of `data` (in `data` order) within `idf_ranking`,
# used as input to Kendall's tau. Duplicated sentence texts all receive the
# rank of their first occurrence, as before. One lookup table replaces
# rebuilding and scanning a NumPy array for every sentence; a sentence missing
# from the ranking now raises KeyError instead of IndexError.
first_position = {}
for position, sentence in enumerate(idf_ranking):
    first_position.setdefault(sentence, position)
idf_tau_ranking = [first_position[sentence] for sentence in data]

In [327]:
tau, p_value = stats.kendalltau(idf_tau_ranking, pca_tau_ranking)
print(tau, p_value)

0.3338353863716508 3.0830902758990844e-33


In [328]:
tau, p_value = stats.kendalltau(idf_tau_ranking, ica_tau_ranking)
print(tau, p_value)

0.41913658831053174 2.16109637248936e-51


In [329]:
tau, p_value = stats.kendalltau(idf_tau_ranking, poly_tau_ranking)
print(tau, p_value)

0.11429476490171117 3.914972066328048e-05


In [239]:
tau, p_value = stats.kendalltau(idf_tau_ranking, rbf_tau_ranking)
print(tau, p_value)

0.2313307967609387 4.7797211898347926e-58


In [240]:
tau, p_value = stats.kendalltau(idf_tau_ranking, sig_tau_ranking)
print(tau, p_value)

0.2884466904855942 3.228914871721298e-89


In [241]:
tau, p_value = stats.kendalltau(idf_tau_ranking, cos_tau_ranking)
print(tau, p_value)

0.21610093455182658 6.943093560983525e-51


In [245]:
# Load the GPT-2 ranking, one sentence per line, with surrounding whitespace
# (including the trailing newline) stripped. The context manager closes the
# file, which was previously left open.
with open('GPT-2-for-Psycholinguistic-Applications/GPT-2 Sentence_Rankings', 'r') as f:
    gpt_ranking = [line.strip() for line in f]

In [246]:
# Rank of every sentence of `data` (in `data` order) within `gpt_ranking`,
# used as input to Kendall's tau. Duplicated sentence texts all receive the
# rank of their first occurrence, as before. One lookup table replaces
# rebuilding and scanning a NumPy array for every sentence; a sentence missing
# from the ranking now raises KeyError instead of IndexError.
first_position = {}
for position, sentence in enumerate(gpt_ranking):
    first_position.setdefault(sentence, position)
gpt_tau_ranking = [first_position[sentence] for sentence in data]

In [247]:
tau, p_value = stats.kendalltau(gpt_tau_ranking, pca_tau_ranking)
print(tau, p_value)

0.13538422901972644 5.47638083605189e-21


In [131]:
tau, p_value = stats.kendalltau(gpt_tau_ranking, ica_tau_ranking)
print(tau, p_value)

-0.05384615384615385 0.624598563467762


In [132]:
tau, p_value = stats.kendalltau(gpt_tau_ranking, poly_tau_ranking)
print(tau, p_value)

-0.05128205128205128 0.6411858797379664


In [135]:
tau, p_value = stats.kendalltau(gpt_tau_ranking, rbf_tau_ranking)
print(tau, p_value)

0.06666666666666667 0.5446118423948019


In [134]:
tau, p_value = stats.kendalltau(gpt_tau_ranking, sig_tau_ranking)
print(tau, p_value)

-0.08461538461538463 0.44191214062063056


In [113]:
tau, p_value = stats.kendalltau(gpt_tau_ranking, cos_tau_ranking)
print(tau, p_value)

0.1794871794871795 0.10285976142339351


In [114]:
tau, p_value = stats.kendalltau(gpt_tau_ranking, idf_tau_ranking)
print(tau, p_value)

0.5641025641025641 2.952482596295399e-07


In [115]:
# Load the manually produced ranking, one sentence per line, with surrounding
# whitespace (including the trailing newline) stripped. The context manager
# closes the file, which was previously left open.
with open('Manual Sentence Rankings.txt', 'r') as f:
    man_ranking = [line.strip() for line in f]

In [116]:
# Rank of every sentence of `data` (in `data` order) within `man_ranking`,
# used as input to Kendall's tau. Duplicated sentence texts all receive the
# rank of their first occurrence, as before. One lookup table replaces
# rebuilding and scanning a NumPy array for every sentence; a sentence missing
# from the ranking now raises KeyError instead of IndexError.
first_position = {}
for position, sentence in enumerate(man_ranking):
    first_position.setdefault(sentence, position)
man_tau_ranking = [first_position[sentence] for sentence in data]

In [117]:
tau, p_value = stats.kendalltau(pca_tau_ranking, man_tau_ranking)
print(tau, p_value)

0.08717948717948719 0.4282030561707131


In [118]:
tau, p_value = stats.kendalltau(ica_tau_ranking, man_tau_ranking)
print(tau, p_value)

-0.06666666666666667 0.5446118423948019


In [119]:
tau, p_value = stats.kendalltau(poly_tau_ranking, man_tau_ranking)
print(tau, p_value)

-0.08974358974358974 0.4147447293562837


In [120]:
tau, p_value = stats.kendalltau(rbf_tau_ranking, man_tau_ranking)
print(tau, p_value)

0.06923076923076923 0.5292470190010741


In [121]:
tau, p_value = stats.kendalltau(sig_tau_ranking, man_tau_ranking)
print(tau, p_value)

0.1435897435897436 0.19192149639323952


In [122]:
tau, p_value = stats.kendalltau(cos_tau_ranking, man_tau_ranking)
print(tau, p_value)

0.13589743589743591 0.21682690771792557


In [123]:
tau, p_value = stats.kendalltau(idf_tau_ranking, man_tau_ranking)
print(tau, p_value)

0.38717948717948725 0.0004338099435909231


In [124]:
tau, p_value = stats.kendalltau(gpt_tau_ranking, man_tau_ranking)
print(tau, p_value)

0.5512820512820513 5.444563244176376e-07
