In [43]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import json
import re 
import random
import numpy as np


In [2]:
# Load the full dataset. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector, and the
# explicit encoding avoids depending on the platform default.
with open("Oppositional_thinking_analysis_dataset.json", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)

In [5]:
# Draw the sample from a dedicated, seeded generator so that re-running the
# notebook selects the same instances every time; using a private Random
# instance leaves the global `random` state untouched.
SAMPLE_SEED = 42
SAMPLE_SIZE = 15
selected_instances = random.Random(SAMPLE_SEED).sample(data, SAMPLE_SIZE)

In [7]:
def preprocessing(text: str, lem_tag: bool = True) -> list:
    """Normalise a raw text and return its content tokens.

    The text is lower-cased, stripped of digits, URLs, punctuation and the
    stand-alone word "com", tokenised with NLTK, filtered against the English
    stop-word list and, optionally, lemmatised.

    Args:
        text: Raw input string.
        lem_tag: When true (the default) each token is passed through the
            WordNet lemmatizer; when false the tokens are returned as-is.

    Returns:
        A list of token strings.
    """
    # Lowercasing
    text = text.lower()

    # The order of the substitutions below matters and is kept deliberately.
    text = re.sub(r'\d+', '', text)  # Remove every run of digits
    text = re.sub(r'[\:\-\']', '', text)  # Remove specific punctuation
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    # All digits were already removed above, so this pattern can no longer
    # match anything; it is retained only to keep the pipeline unchanged.
    text = re.sub(r'\d+\.\d+', '', text)
    text = re.sub(r'\bcom\b', '', text, flags=re.IGNORECASE)  # Drop "com" as a whole word

    # Tokenization
    tokens = word_tokenize(text)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization is controlled by the caller's flag (the previous check
    # tested the builtin `bool` type, which is always truthy).
    if lem_tag:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

In [14]:
# preprocessing() takes the lemmatisation switch as its second argument;
# pass it explicitly so the call does not fail on a missing parameter.
preprocessed_texts = [preprocessing(instance['text'], lem_tag=True) for instance in selected_instances]


In [16]:
## Use GloVe vectors

# Load GloVe vectors (adjust the path to where your GloVe file is located)
def load_glove_vectors(glove_file):
    """Read a whitespace-separated embedding text file into a dictionary.

    Each usable line has the form ``<word> <coef_1> ... <coef_n>``.

    Args:
        glove_file: Path of the embedding file, opened as UTF-8 text.

    Returns:
        dict mapping each word to a float32 NumPy array of its coefficients.
    """
    embeddings_index = {}
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank line (e.g. a trailing newline) or a token with no
                # coefficients: nothing usable, and values[0] would raise
                # IndexError on an empty split.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index



In [17]:
def average_word_vectors(tokens, embeddings_index, dim=None):
    """Average the embedding vectors of the tokens found in the vocabulary.

    Args:
        tokens: Iterable of token strings.
        embeddings_index: Word-to-vector lookup supporting ``in`` and
            ``[]`` (a plain dict, or a model object with the same protocol).
        dim: Length of the zero vector returned when no token is known.
            When omitted it is inferred from ``embeddings_index``: first from
            a ``vector_size`` attribute, then from the length of one stored
            vector, and finally defaults to 100.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length ``dim`` when none of the tokens is in the vocabulary.
    """
    valid_vectors = [embeddings_index[word] for word in tokens if word in embeddings_index]
    if valid_vectors:
        return np.mean(valid_vectors, axis=0)

    # No known token: the fallback must match the embedding size, otherwise
    # stacking sentence vectors from a non-100-d model yields a ragged array.
    if dim is None:
        dim = getattr(embeddings_index, 'vector_size', None)
    if dim is None:
        try:
            dim = len(next(iter(embeddings_index.values())))
        except (AttributeError, StopIteration, TypeError):
            dim = 100  # Historical default (GloVe 100d)
    return np.zeros(dim)



In [47]:
# pandas is needed for the report below; importing it here keeps this cell
# runnable on a fresh kernel instead of relying on a later cell's import.
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


# NOTE(review): absolute, machine-specific path; adjust it to the local GloVe file.
glove_file = '/Users/erikrubinov/Desktop/SM24/NLP/glove.6B/glove.6B.100d.txt'  # Path to GloVe file
embeddings_index = load_glove_vectors(glove_file)

# One averaged embedding per preprocessed text, then the full pairwise
# similarity matrix computed by scikit-learn.
sentence_vectors = [average_word_vectors(tokens, embeddings_index) for tokens in preprocessed_texts]
cosine_similarities = cosine_similarity(sentence_vectors)


# Report each unordered pair once (upper triangle, diagonal excluded).
report = []
for i in range(len(selected_instances)):
    for j in range(i + 1, len(selected_instances)):
        report.append({
            'sentence1_id': selected_instances[i]['id'],
            'sentence2_id': selected_instances[j]['id'],
            'cosine_similarity': cosine_similarities[i, j]
        })

df_report = pd.DataFrame(report)
print("Cosine Similarity Report:")
print(df_report)






Cosine Similarity Report:
    sentence1_id sentence2_id  cosine_similarity
0          14037         1434           0.806338
1          14037         4431           0.886852
2          14037         6701           0.718611
3          14037        11120           0.842180
4          14037         4917           0.904956
..           ...          ...                ...
100        13169        10957           0.568681
101        13169        10311           0.530114
102         1072        10957           0.880561
103         1072        10311           0.845229
104        10957        10311           0.931125

[105 rows x 3 columns]


In [48]:
def cosine_similarities_man(vec1, vec2):
    """Compute the cosine similarity of two 1-D vectors by hand.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when either
        vector has zero norm (the ratio is undefined there and would
        otherwise evaluate to ``nan``).
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # An all-zero vector has no direction; report "no similarity"
        # instead of dividing by zero.
        return 0.0
    return dot_product / norm_product



import pandas as pd

# Rebuild the pairwise report, this time scoring every pair with the
# hand-written cosine function rather than the precomputed matrix.
report = []
n_instances = len(selected_instances)
for i in range(n_instances):
    first_id = selected_instances[i]['id']
    for j in range(i + 1, n_instances):
        cosine_sim = cosine_similarities_man(sentence_vectors[i], sentence_vectors[j])
        report.append(dict(
            sentence1_id=first_id,
            sentence2_id=selected_instances[j]['id'],
            cosine_similarity=cosine_sim,
        ))

# Tabulate and show the result
df_report = pd.DataFrame(report)
print(df_report)

    sentence1_id sentence2_id  cosine_similarity
0          14037         1434           0.806338
1          14037         4431           0.886852
2          14037         6701           0.718610
3          14037        11120           0.842180
4          14037         4917           0.904957
..           ...          ...                ...
100        13169        10957           0.568681
101        13169        10311           0.530114
102         1072        10957           0.880561
103         1072        10311           0.845229
104        10957        10311           0.931125

[105 rows x 3 columns]


In [57]:
# Sources of the pretrained embeddings used in this notebook.
# Word2Vec:
# https://drive.usercontent.google.com/download?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download&authuser=0

# GloVe:
# https://nlp.stanford.edu/projects/glove/

# Fetch the pretrained model registered under the name
# 'word2vec-google-news-300' through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use and reuses a
# local cache afterwards — confirm against the gensim downloader docs.
import gensim.downloader as api
word2vec_model = api.load('word2vec-google-news-300')

In [41]:
import pickle

# Persist the loaded model to disk. The `with` block guarantees the file
# handle is closed even if pickling raises part-way through.
with open('word2vec-model.pkl', 'wb') as file:
    pickle.dump(word2vec_model, file)

In [50]:
# NOTE(review): absolute, machine-specific path. The cell that writes the
# pickle uses the relative name 'word2vec-model.pkl', so this only reads the
# same file when the working directory is this folder — confirm.
model_path = '/Users/erikrubinov/Desktop/SM24/NLP/word2vec-model.pkl'

# Load the model
# pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced locally or come from a trusted source.
with open(model_path, 'rb') as f:
    word2vec_model = pickle.load(f)

In [56]:
import pandas as pd

# Re-embed every preprocessed text with the Word2Vec model and recompute the
# scikit-learn similarity matrix for the new vectors.
sentence_vectors = [average_word_vectors(tokens, word2vec_model) for tokens in preprocessed_texts]
cosine_similarities = cosine_similarity(sentence_vectors)

# Score each unordered pair once with the hand-written cosine function.
report = []
n_instances = len(selected_instances)
for i in range(n_instances):
    first_id = selected_instances[i]['id']
    for j in range(i + 1, n_instances):
        cosine_sim = cosine_similarities_man(sentence_vectors[i], sentence_vectors[j])
        report.append(dict(
            sentence1_id=first_id,
            sentence2_id=selected_instances[j]['id'],
            cosine_similarity=cosine_sim,
        ))

# Tabulate and show the result
df_report = pd.DataFrame(report)
print(df_report)


    sentence1_id sentence2_id  cosine_similarity
0          14037         1434           0.594796
1          14037         4431           0.668467
2          14037         6701           0.515029
3          14037        11120           0.628776
4          14037         4917           0.694175
..           ...          ...                ...
100        13169        10957           0.406916
101        13169        10311           0.335479
102         1072        10957           0.737388
103         1072        10311           0.729938
104        10957        10311           0.820351

[105 rows x 3 columns]
