In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from itertools import combinations
from multiprocessing import Pool
import time

In [20]:
# Load only the Title column and attach an empty book_id column (all None)
# that is filled in later by the ID-assignment step.
books = pd.read_csv("data_sets/Books_Raw.csv", usecols=['Title']).assign(book_id=None)
print(books.head(5))

                                               Title book_id
0                     Its Only Art If Its Well Hung!    None
1                           Dr. Seuss: American Icon    None
2              Wonderful Worship in Smaller Churches    None
3                      Whispers of the Wicked Saints    None
4  Nation Dance: Religion, Identity and Cultural ...    None


In [11]:
def generate_ngrams(text, n=3):
    """Return the n-grams of ``text`` after dropping space characters.

    Parameters
    ----------
    text : str or sequence of str, or None/NaN
        The text to split. Every element equal to ``' '`` is discarded before
        windowing, so n-grams run across word boundaries. Only the space
        character is removed; tabs and newlines are kept.
    n : int, default 3
        Window width, counted in elements that remain after removing spaces.

    Returns
    -------
    list of str
        One string per window, in left-to-right order. The list is empty when
        ``text`` is missing (None or float NaN), when fewer than ``n`` elements
        remain, or when ``n`` is smaller than 1.
    """
    # A missing value (None, or float NaN which is the only float that is not
    # equal to itself) cannot be iterated; it simply has no n-grams.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if n < 1:
        return []

    tokens = [token for token in text if token != ' ']  # Remove spaces
    # Sliding window of width n; range() is empty when len(tokens) < n.
    return [''.join(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]


In [16]:
from nltk.util import ngrams
from collections import defaultdict

def assign_book_ids(df, n=3, threshold=0.7):
    """Give near-duplicate titles a shared ``book_id``.

    Every non-missing value of ``df['Title']`` is lower-cased and reduced to
    its set of character n-grams (spaces and punctuation included). Rows are
    visited in order and each title is compared with the n-gram sets already
    seen, in first-seen order, using

        similarity = len(common n-grams) / max(len(earlier set), len(this set))

    The first earlier set whose similarity is strictly greater than
    ``threshold`` supplies the ID. If none qualifies, the title receives the
    next unused ID (IDs start at 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'Title'`` column. The frame is modified in place:
        ``book_id`` is written for each row that has a title, while rows with a
        missing title are skipped and keep whatever ``book_id`` they had.
    n : int, default 3
        n-gram length in characters. A title shorter than ``n`` characters is
        treated as a single n-gram consisting of the whole title, so identical
        short titles still match each other.
    threshold : float, default 0.7
        Similarity that must be exceeded for two titles to share an ID.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.

    Notes
    -----
    Each title is compared against every distinct n-gram set seen before it,
    so the running time grows quadratically with the number of distinct titles.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    next_id = 1
    # n-gram set -> book ID. A plain dict keeps first-seen order, and frozenset
    # keys make identical sets collapse into one entry regardless of the order
    # in which their members happen to be iterated.
    seen = {}

    for index, title in df['Title'].items():
        # Check for NaN values
        if pd.isna(title):
            continue

        # str() keeps non-string titles (e.g. purely numeric ones) from crashing.
        text = str(title).lower()
        if len(text) < n:
            # Too short for even one n-gram: use the whole title so the set is
            # never empty and the similarity ratio is always defined.
            title_ngrams = frozenset([text])
        else:
            title_ngrams = frozenset(
                text[start:start + n] for start in range(len(text) - n + 1)
            )

        # Look for the first previously seen title that is similar enough.
        assigned_id = None
        for known_ngrams, known_id in seen.items():
            overlap = len(title_ngrams & known_ngrams)
            similarity = overlap / max(len(known_ngrams), len(title_ngrams))
            if similarity > threshold:
                # Read the ID without removing it so later titles can still
                # match this entry.
                assigned_id = known_id
                break

        if assigned_id is None:
            # If no similar title found, assign a new book ID
            assigned_id = next_id
            next_id += 1

        # Remember this n-gram set; an already-known set keeps its first ID.
        seen.setdefault(title_ngrams, assigned_id)

        # Assign the book ID to the dataframe
        df.at[index, 'book_id'] = assigned_id

    return df


In [29]:
# Try the matcher on the first 100 titles only. assign_book_ids writes
# book_id into the frame it is handed and returns that same frame.
df = assign_book_ids(df=books.head(n=100))
print(df)

                                                Title book_id
0                      Its Only Art If Its Well Hung!       1
1                            Dr. Seuss: American Icon       2
2                              Whispers of the Wicked       3
3                       Whispers of the Wicked Saints       3
4   Nation Dance: Religion, Identity and Cultural ...       4
..                                                ...     ...
95           Woburn: Forgotten Tales & Untold Stories      95
96  God was in this place & I, i did not know: Fin...      96
97                    Cascade Point and Other Stories      97
98                    The soul of man under socialism      98
99  Prep for PRAXIS: PRAXIS II Exam 2003 (Arco Mas...      99

[100 rows x 2 columns]


In [28]:
# Overwrite the title in the third row (position 2, first column) with a
# truncated variant of the title in the row below it, then show the first rows.
books.iat[2, 0] = 'Whispers of the Wicked'
print(books.head(5))

                                               Title book_id
0                     Its Only Art If Its Well Hung!       1
1                           Dr. Seuss: American Icon       2
2                             Whispers of the Wicked       3
3                      Whispers of the Wicked Saints       3
4  Nation Dance: Religion, Identity and Cultural ...       4
