# Import required libraries

In [1]:
!pip install wget # to download data
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install gensim
!pip install nltk
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
/bin/bash: line 1: python: command not found
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [None]:
%matplotlib inline
import numpy as np
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
import wget
import spacy
from spacy.tokenizer import Tokenizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader as api
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
import scipy.stats
import tqdm

from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
import re
import os
from collections import defaultdict
import jsonlines

import zipfile

%matplotlib widget
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Dataset

In [None]:
# Read the preprocessed dataset from a JSON Lines file.
# Each record is read with .get(), so a record lacking a key contributes an
# empty list for that key instead of raising.
# NOTE(review): the values under "non_semantic" / "semantic" are presumably
# lists of tokens for one sentence -- confirm against the preprocessing step.
input_file = 'preprocessed_dataset_v2.jsonl'
# input_file = None  # uncomment to skip loading and keep both lists empty

non_semantic_tokens = []
semantic_tokens = []

if input_file is not None:
    with jsonlines.open(input_file, 'r') as reader:
        for line in tqdm(reader):
            non_semantic_tokens.append(line.get("non_semantic", []))
            semantic_tokens.append(line.get("semantic", []))

# Graph structure

In [None]:
import networkx as nx

def build_word_graph(corpus):
    """Build an undirected, weighted co-occurrence graph from a tokenised corpus.

    Every distinct token becomes a node. For every pair of positions (i, j)
    with i < j inside one sentence, the edge between the two tokens has its
    'weight' attribute increased by 1 (created with weight 1 on first sight).

    The corpus is traversed exactly once, so one-shot iterables (generators)
    are supported as well as lists.

    Parameters
    ----------
    corpus : iterable of iterable of hashable
        The sentences, each one a sequence of tokens.

    Returns
    -------
    networkx.Graph
        Graph whose edge attribute 'weight' holds the co-occurrence count.

    Notes
    -----
    Pairs are formed by position, not by distinct value: a token that appears
    twice in one sentence gets a self-loop, and its pairs with other tokens
    are counted once per occurrence.
    """
    G = nx.Graph()

    for sentence in tqdm(corpus):
        tokens = list(sentence)

        # Register the nodes first so single-token sentences still contribute
        # a node; add_nodes_from leaves already-known nodes in place.
        G.add_nodes_from(tokens)

        # Count every positional pair (i < j) of the sentence once.
        for i, first in enumerate(tokens):
            for second in tokens[i + 1:]:
                if G.has_edge(first, second):
                    G[first][second]['weight'] += 1
                else:
                    G.add_edge(first, second, weight=1)

    return G

# Toy corpus: three short "sentences" made of fruit names
custom_corpus = [
    ['apple', 'orange', 'banana'],
    ['apple', 'pear', 'kiwi'],
    ['apple', 'banana', 'kiwi', 'orange'],
]

# Turn the toy corpus into a co-occurrence graph
word_graph = build_word_graph(corpus=custom_corpus)

# The 'weight' attribute of an edge is the co-occurrence count of its two words
edge_weight = word_graph['apple']['orange']['weight']
print(f"Weight between 'apple' and 'orange': {edge_weight}")


In [None]:
import matplotlib.pyplot as plt

def display_graph(graph, seed=None):
    """Draw a weighted graph with node labels and edge-weight labels.

    The drawing goes onto the current matplotlib axes, which are cleared
    first, and is then shown with ``plt.show()``.

    Parameters
    ----------
    graph : networkx.Graph
        Graph to draw. Edges carrying a 'weight' attribute are labelled with it.
    seed : int, optional
        Seed forwarded to ``nx.spring_layout``. The spring layout starts from
        random positions, so pass a fixed seed to get the same picture on
        every run. The default (None) keeps the layout random.
    """
    # Clear whatever was drawn on the current axes by a previous call
    plt.cla()

    # Positions for all nodes
    pos = nx.spring_layout(graph, seed=seed)

    # Draw nodes
    nx.draw_networkx_nodes(graph, pos, node_size=700)

    # Draw edges
    nx.draw_networkx_edges(graph, pos, width=2, alpha=0.6)

    # Draw labels
    nx.draw_networkx_labels(graph, pos, font_size=10, font_family="sans-serif")

    # Draw edge labels (weights)
    edge_labels = nx.get_edge_attributes(graph, 'weight')
    nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels)

    # Display the graph
    plt.title("Word Similarity Graph")
    plt.axis('off')  # Turn off the axis
    plt.show()

# Render the current word graph
display_graph(graph=word_graph)


# PageRank

PageRank takes a graph and ranks its nodes according to their relative structural importance.

In [None]:
import networkx as nx

def pagerank(graph, alpha=0.85, max_iter=100, tol=1e-6):
    """Compute weighted PageRank scores for the nodes of an undirected graph.

    Each node splits its current score among its neighbours in proportion to
    the edge weights, so the scores stay a probability distribution (they sum
    to 1). Without that normalisation the scores grow at every iteration
    instead of converging.

    Parameters
    ----------
    graph : networkx.Graph (or any object offering ``nodes``,
        ``neighbors(node)`` and ``graph[u][v]`` returning the edge attributes)
        Edges without a 'weight' attribute count as weight 1.0.
    alpha : float
        Damping factor: probability of following an edge rather than jumping
        to a uniformly chosen node.
    max_iter : int
        Maximum number of update sweeps.
    tol : float
        Iteration stops once no score changed by ``tol`` or more in a sweep.

    Returns
    -------
    dict
        Mapping node -> score. Empty for an empty graph.
    """
    nodes = list(graph.nodes)
    node_count = len(nodes)
    if node_count == 0:
        return {}

    # Total weight of the edges incident to each node; a node's score is
    # shared among its neighbours relative to this total.
    strength = {
        node: sum(graph[node][neighbor].get('weight', 1.0)
                  for neighbor in graph.neighbors(node))
        for node in nodes
    }
    # Nodes that cannot pass their score along any edge.
    dangling = [node for node in nodes if not strength[node]]

    # Start from the uniform distribution
    scores = dict.fromkeys(nodes, 1.0 / node_count)

    for _ in range(max_iter):
        prev_scores = scores

        # Teleport share plus the score of dangling nodes, spread uniformly
        # so that no probability mass is lost.
        dangling_mass = sum(prev_scores[node] for node in dangling)
        base = (1.0 - alpha) / node_count + alpha * dangling_mass / node_count

        scores = {}
        for node in nodes:
            rank_sum = sum(
                graph[neighbor][node].get('weight', 1.0)
                * prev_scores[neighbor] / strength[neighbor]
                for neighbor in graph.neighbors(node)
                if strength[neighbor]
            )
            scores[node] = base + alpha * rank_sum

        # Check for convergence
        if all(abs(scores[node] - prev_scores[node]) < tol for node in nodes):
            break

    return scores

# Score every node of the current word graph
word_scores = pagerank(graph=word_graph)

# Report one "word: score" line per node
print("Word Scores:")
for word in word_scores:
    score = word_scores[word]
    print(f"{word}: {score}")


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def pagerank_similarity(word1, word2, graph):
    """Cosine similarity between the co-occurrence vectors of two words.

    The vector of a word is its row of the weighted adjacency matrix: one
    entry per neighbour, holding the weight of the connecting edge. Two words
    are similar when they co-occur with the same words in similar
    proportions; they do not have to be adjacent themselves.

    Comparing only the single edge weight between the two words (as an
    earlier version did) always yields 1.0, because an undirected edge has
    the same weight seen from either end.

    Despite the name, no PageRank score is involved.

    Parameters
    ----------
    word1, word2 : hashable
        Nodes of ``graph``.
    graph : networkx.Graph (or any mapping node -> {neighbour: edge attributes})
        Edges without a 'weight' attribute count as weight 1.0, the same
        convention ``nx.to_numpy_array`` uses.

    Returns
    -------
    float
        Cosine similarity; 0.0 when either word has no weighted neighbours.

    Raises
    ------
    KeyError
        If ``word1`` or ``word2`` is not a node of ``graph``.
    """
    # Sparse rows of the adjacency matrix: neighbour -> edge weight
    row1 = {neighbor: attrs.get('weight', 1.0) for neighbor, attrs in graph[word1].items()}
    row2 = {neighbor: attrs.get('weight', 1.0) for neighbor, attrs in graph[word2].items()}

    # Only neighbours shared by both words contribute to the dot product
    dot = sum(weight * row2[neighbor] for neighbor, weight in row1.items() if neighbor in row2)
    norm1 = sum(weight * weight for weight in row1.values()) ** 0.5
    norm2 = sum(weight * weight for weight in row2.values()) ** 0.5

    # A word without weighted neighbours has a zero vector: no similarity
    if norm1 == 0 or norm2 == 0:
        return 0.0

    return dot / (norm1 * norm2)

def get_pagerank_similarity_scores(graph):
    """Cosine similarity between the co-occurrence vectors of all word pairs.

    The vector of a word is its row of the weighted adjacency matrix of
    ``graph``.

    Parameters
    ----------
    graph : networkx.Graph
        Graph whose edge attribute 'weight' holds the edge weight.

    Returns
    -------
    dict
        Maps each unordered pair of distinct nodes, stored as a sorted tuple
        ``(a, b)`` with ``a <= b``, to the cosine similarity of their rows.
        Sorting makes the key independent of node insertion order, which is
        what ``pagerank_similarity_from_scores`` looks up.

    Notes
    -----
    Builds a dense N x N matrix and a dictionary with N * (N - 1) / 2 entries,
    so memory grows quadratically with the vocabulary size.
    Nodes must be mutually orderable (e.g. all strings) for the key sort.
    """
    words = list(graph.nodes)

    # Fewer than two nodes: there is no pair to compare
    if len(words) < 2:
        return {}

    # Extracting edge weights as a dense array whose rows follow `words`.
    # nx.to_numpy_matrix was removed in NetworkX 3.0; to_numpy_array is its
    # replacement and returns a plain ndarray.
    edge_weights_matrix = nx.to_numpy_array(graph, nodelist=words, weight='weight')

    # Computing cosine similarity between word vectors
    similarity_matrix = cosine_similarity(edge_weights_matrix)

    # Filling the dictionary with similarity scores, one entry per pair
    word_similarity = {}
    for i, first in enumerate(words):
        for j in range(i + 1, len(words)):
            pair = tuple(sorted((first, words[j])))
            word_similarity[pair] = similarity_matrix[i, j]

    return word_similarity

def pagerank_similarity_from_scores(word1, word2, word_similarity_scores):
    """Look up the precomputed similarity of two words.

    The lookup does not depend on the order in which the pair was stored:
    the alphabetically sorted key is tried first, then both explicit orders.
    This matters because a score table keyed in graph-node order is not
    necessarily keyed in alphabetical order.

    Parameters
    ----------
    word1, word2 : str
        The two words to compare.
    word_similarity_scores : dict
        Mapping ``(word_a, word_b) -> similarity``.

    Returns
    -------
    The stored similarity, or 0.0 when the pair is not present (this includes
    asking for a word paired with itself, unless such a key was stored).
    """
    candidate_keys = (
        tuple(sorted([word1, word2])),
        (word1, word2),
        (word2, word1),
    )

    for word_pair in candidate_keys:
        # Membership test rather than .get() so a stored 0.0 is returned as is
        if word_pair in word_similarity_scores:
            return word_similarity_scores[word_pair]

    return 0.0  # Return 0 if the pair is not found (no similarity)

# Compare two words of the current word graph
word1, word2 = 'apple', 'orange'
similarity_score = pagerank_similarity(word1, word2, graph=word_graph)

print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")


# Try the simple PageRank implementation on our corpus 

In [None]:
# Co-occurrence graph over the non-semantic token lists (replaces the toy graph)
word_graph = build_word_graph(corpus=non_semantic_tokens)

# Similarity lookup table for the word pairs of that graph
pagerank_scores = get_pagerank_similarity_scores(graph=word_graph)

In [None]:
# Query the precomputed table for one word pair
word1, word2 = 'old', 'new'
similarity_score = pagerank_similarity_from_scores(
    word1, word2, word_similarity_scores=pagerank_scores
)

print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")