In [2]:
# All packages

from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import pandas as pd
import numpy as np

#mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
## Dataset ##

# loading dataset
dataset = pd.read_csv('/content/drive/MyDrive/AI Project/Bangla books details data/books.csv', on_bad_lines='skip')

dataset = dataset.rename(columns={"urlID|Title|Author|Publisher|Price|DiscountedPrice|Discount|Category|ISBN|Edition|Pages|Country|Language|Ratings|RatingsNum|Reviews": "Column"})

# splitting into multiple column
dataset[['urlID', 'Title', 'Author', 'Publisher', 'Price', 'DiscountedPrice', 'Discount', 'Category', 'ISBN', 'Edition', 'Pages', 'Country', 'Language', 'Ratings', 'RatingsNum', 'Reviews']] = dataset['Column'].str.split('|', 16, expand=True)

# removing duplicate occurance of same book
dataset = dataset.drop_duplicates(subset=['Title', 'Author'])

# removing information about books which are not in Bangla
dataset = dataset[dataset['Language'] == 'Bangla']

# removing unnecessary columns
dataset = dataset.drop(columns=['Column', 'ISBN', 'Edition', 'Language', 'urlID', 'DiscountedPrice', 'Discount', 'Reviews'])

# dropping the rows having null values
dataset = dataset.dropna()



# # making numeric
dataset['Ratings'] = pd.to_numeric(dataset['Ratings'], errors='coerce')
dataset['RatingsNum'] = pd.to_numeric(dataset['RatingsNum'], errors='coerce')
dataset['Price'] = pd.to_numeric(dataset['Price'], errors='coerce')
dataset['Pages'] = pd.to_numeric(dataset['Pages'], errors='coerce')
dataset['RatingsNum'] = dataset['RatingsNum'].multiply(10)

## Remove outliers
dataset = dataset[(dataset['Pages'] >= 0) & (dataset['Pages'] <= 20000)]
dataset = dataset[(dataset['Price'] >= 0) & (dataset['Price'] <= 50000)]

## Handeling Missing values
dataset['Price'] = dataset['Price'].replace([np.inf, -np.inf, np.nan], 0)
priceSum = dataset['Price'].sum()
meanPrice = priceSum / dataset[dataset['Price'] != 0].shape[0]
meanPrice = meanPrice.astype(int)
dataset.loc[dataset['Price'] == 0, 'Price'] = meanPrice

dataset['Pages'] = dataset['Pages'].replace([np.inf, -np.inf, np.nan], 0)
pagesSum = dataset['Pages'].sum()
meanPages = pagesSum / dataset[dataset['Pages'] != 0].shape[0]
meanPages = meanPages.astype(int)
dataset.loc[dataset['Pages'] == 0, 'Pages'] = meanPages

dataset['Ratings'] = dataset['Ratings'].replace([np.inf, -np.inf, np.nan], 0)
RatingsSum = dataset['Ratings'].sum()
meanRatings = RatingsSum / dataset[dataset['Ratings'] != 0].shape[0]
meanRatings = meanRatings.round(1).astype(float)
dataset.loc[dataset['Ratings'] == 0, 'Ratings'] = meanRatings

dataset['RatingsNum'] = dataset['RatingsNum'].replace([np.inf, -np.inf, np.nan], 0)
dataset['RatingsNum'] = dataset['RatingsNum'].astype(int)
RatingsNumSum = dataset['RatingsNum'].sum()
meanRatingsNum = RatingsNumSum / dataset[dataset['RatingsNum'] != 0].shape[0]
meanRatingsNum = meanRatingsNum.astype(int)
dataset.loc[dataset['RatingsNum'] == 0, 'RatingsNum'] = meanRatingsNum

# # df_sorted = dataset.sort_values('RatingsNum')
# # df_sorted

from google.colab import files
dataset.to_csv('output.csv', encoding = 'utf-8-sig')
files.download('output.csv')


  dataset[['urlID', 'Title', 'Author', 'Publisher', 'Price', 'DiscountedPrice', 'Discount', 'Category', 'ISBN', 'Edition', 'Pages', 'Country', 'Language', 'Ratings', 'RatingsNum', 'Reviews']] = dataset['Column'].str.split('|', 16, expand=True)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
dataset

Unnamed: 0,Title,Author,Publisher,Price,Category,Pages,Country,Ratings,RatingsNum
0,সুশাসনের সন্ধানে,আতিউর রহমান,অন্যপ্রকাশ,276,প্রসঙ্গ বাংলাদেশ,215,Bangladesh,5.0,30
1,শেষের কবিতা,রবীন্দ্রনাথ ঠাকুর,বিশ্বসাহিত্য ভবন,90,চিরায়ত উপন্যাস,78,Bangladesh,4.6,100
2,নৌকাডুবি,রবীন্দ্রনাথ ঠাকুর,বিশ্বসাহিত্য ভবন,158,চিরায়ত উপন্যাস,156,Bangladesh,5.0,40
3,গোরা,রবীন্দ্রনাথ ঠাকুর,বিশ্বসাহিত্য ভবন,255,চিরায়ত উপন্যাস,239,Bangladesh,5.0,30
4,চোখের বালি,রবীন্দ্রনাথ ঠাকুর,বিশ্বসাহিত্য ভবন,158,চিরায়ত উপন্যাস,156,Bangladesh,5.0,40
...,...,...,...,...,...,...,...,...,...
142344,জীবনানন্দ দাশের শ্রেষ্ঠ গল্প,মামুনুর রহমান,সমাচার,350,বইমেলা ২০১৮,352,Bangladesh,4.9,20
142345,তনু-মন,ইব্রাহিম,তুষারধারা,120,বইমেলা ২০১৮,177,Bangladesh,4.9,20
142395,ও নদীরে,আসাদ চৌধুরী,প্রজ্জ্বলন প্রকাশ,135,বইমেলা ২০১৮,177,Bangladesh,4.9,20
142396,স্রোত,মামুনুল ইসলাম,প্রজ্জ্বলন প্রকাশ,180,বইমেলা ২০১৮,177,Bangladesh,4.9,20


In [6]:
feature_weights = {
    'Title':0.9,
    'Author': 0.8,
    'Category': 0.75,
    'Ratings': 0.65,
    'RatingsNum': 0.65,
    'Price': 0.6,
    'Publisher': 0.5,
    'Pages': 0.4,
    'Country': 0.2
}

sorted_features = sorted(feature_weights, key=feature_weights.get, reverse=True)

selected_columns = sorted_features + [col for col in dataset.columns if col not in sorted_features]
processed_dataset = dataset[selected_columns]

processed_dataset


Unnamed: 0,Title,Author,Category,Ratings,RatingsNum,Price,Publisher,Pages,Country
0,সুশাসনের সন্ধানে,আতিউর রহমান,প্রসঙ্গ বাংলাদেশ,5.0,30,276,অন্যপ্রকাশ,215,Bangladesh
1,শেষের কবিতা,রবীন্দ্রনাথ ঠাকুর,চিরায়ত উপন্যাস,4.6,100,90,বিশ্বসাহিত্য ভবন,78,Bangladesh
2,নৌকাডুবি,রবীন্দ্রনাথ ঠাকুর,চিরায়ত উপন্যাস,5.0,40,158,বিশ্বসাহিত্য ভবন,156,Bangladesh
3,গোরা,রবীন্দ্রনাথ ঠাকুর,চিরায়ত উপন্যাস,5.0,30,255,বিশ্বসাহিত্য ভবন,239,Bangladesh
4,চোখের বালি,রবীন্দ্রনাথ ঠাকুর,চিরায়ত উপন্যাস,5.0,40,158,বিশ্বসাহিত্য ভবন,156,Bangladesh
...,...,...,...,...,...,...,...,...,...
142344,জীবনানন্দ দাশের শ্রেষ্ঠ গল্প,মামুনুর রহমান,বইমেলা ২০১৮,4.9,20,350,সমাচার,352,Bangladesh
142345,তনু-মন,ইব্রাহিম,বইমেলা ২০১৮,4.9,20,120,তুষারধারা,177,Bangladesh
142395,ও নদীরে,আসাদ চৌধুরী,বইমেলা ২০১৮,4.9,20,135,প্রজ্জ্বলন প্রকাশ,177,Bangladesh
142396,স্রোত,মামুনুল ইসলাম,বইমেলা ২০১৮,4.9,20,180,প্রজ্জ্বলন প্রকাশ,177,Bangladesh


In [7]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
# Download the 'punkt' resource
nltk.download('punkt')

import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**WhiteSpace Removal**

In [11]:
! pip install bnlp_toolkit
! pip install banglanltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bnlp_toolkit
  Downloading bnlp_toolkit-3.3.1-py3-none-any.whl (22 kB)
Collecting sentencepiece (from bnlp_toolkit)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting sklearn-crfsuite (from bnlp_toolkit)
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting ftfy (from bnlp_toolkit)
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji==1.7.0 (from bnlp_toolkit)
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing m

In [16]:
processed_dataset['Title'].apply(lambda x:[i.replace(r"\s+","")for i in x])

0          [স, ু, শ, া, স, ন, ে, র,  , স, ন, ্, ধ, া, ন, ে]
1                         [শ, ে, ষ, ে, র,  , ক, ব, ি, ত, া]
2                                  [ন, ৌ, ক, া, ড, ু, ব, ি]
3                                              [গ, ো, র, া]
4                            [চ, ো, খ, ে, র,  , ব, া, ল, ি]
                                ...                        
142344    [জ, ী, ব, ন, া, ন, ন, ্, দ,  , দ, া, শ, ে, র, ...
142345                                   [ত, ন, ু, -, ম, ন]
142395                                [ও,  , ন, দ, ী, র, ে]
142396                                      [স, ্, র, ো, ত]
142397                 [ত, ু, ম, ি, ও,  , ক, া, ঁ, দ, ব, ে]
Name: Title, Length: 102327, dtype: object

In [None]:
###tokenization + TF-IDF matrix#####
##matrix e shob value zero ashe :( ####


def preprocess_bangla_text(text):
    # Remove special characters and symbols specific to Bangla language
    text = re.sub(r'[^\w\sঀ-৾]', '', text)

    # Remove leading and trailing whitespace
    text = text.strip()

    return text

# Apply preprocessing function to the columns before tokenization
dataset['Title'] = dataset['Title'].apply(preprocess_bangla_text)
dataset['Author'] = dataset['Author'].apply(preprocess_bangla_text)
dataset['Category'] = dataset['Category'].apply(preprocess_bangla_text)
dataset['Publisher'] = dataset['Publisher'].apply(preprocess_bangla_text)
# # Tokenize the Bangla text columns using nltk
dataset['Tokenized_Title'] = dataset['Title'].apply(lambda x: word_tokenize(x))
dataset['Tokenized_Author'] = dataset['Author'].apply(lambda x: word_tokenize(x))
dataset['Tokenized_Category'] = dataset['Category'].apply(lambda x: word_tokenize(x))
dataset['Tokenized_Publisher'] = dataset['Publisher'].apply(lambda x: word_tokenize(x))
# # Add more columns as needed
# # Combine the tokenized text columns into a single column
dataset['Tokenized_Text'] = dataset['Tokenized_Title'] + dataset['Tokenized_Author']+ dataset['Tokenized_Category']+ dataset['Tokenized_Publisher']
# Modify the concatenation as per your column requirements

# # Convert the tokenized text into a string representation
dataset['Tokenized_Text'] = dataset['Tokenized_Text'].apply(lambda x: ' '.join(x))

# # Apply TF-IDF on the tokenized text
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(dataset['Tokenized_Text'])

# # Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# # Display the TF-IDF matrix
print(tfidf_df)


         10   12   13   15   16   18  1857   19  1947খ  19702010  ...  ৯৪৬৪  \
0       0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   
1       0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   
2       0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   
3       0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   
4       0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   
...     ...  ...  ...  ...  ...  ...   ...  ...    ...       ...  ...   ...   
102322  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   
102323  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   
102324  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   
102325  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   
102326  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0    0.0       0.0  ...   0.0   

         ৯৫  ৯৫৯৬   ৯৬   ৯৭  ৯৭৯৮   ৯৮   ৯৯  ৯৯৩  ৯

In [None]:
#####tokenization + count vectorization######
####tf-idf kore shob zero ashe tai, eti method try kora. ekhanew zero ashe :(( #####


from sklearn.feature_extraction.text import CountVectorizer
def preprocess_bangla_text(text):
    # Remove special characters and symbols specific to Bangla language
    text = re.sub(r'[^\w\sঀ-৾]', '', text)

    # Remove leading and trailing whitespace
    text = text.strip()

    return text

# Apply preprocessing function to the columns before tokenization
dataset['Title'] = dataset['Title'].apply(preprocess_bangla_text)
dataset['Author'] = dataset['Author'].apply(preprocess_bangla_text)
dataset['Category'] = dataset['Category'].apply(preprocess_bangla_text)
dataset['Publisher'] = dataset['Publisher'].apply(preprocess_bangla_text)

# Tokenize the Bangla text columns using nltk
dataset['Tokenized_Title'] = dataset['Title'].apply(lambda x: word_tokenize(x))
dataset['Tokenized_Author'] = dataset['Author'].apply(lambda x: word_tokenize(x))
dataset['Tokenized_Category'] = dataset['Category'].apply(lambda x: word_tokenize(x))
dataset['Tokenized_Publisher'] = dataset['Publisher'].apply(lambda x: word_tokenize(x))
# Add more columns as needed

# Combine the tokenized text columns into a single column
dataset['Tokenized_Text'] = dataset['Tokenized_Title'] + dataset['Tokenized_Author'] + dataset['Tokenized_Category'] + dataset['Tokenized_Publisher']
# Modify the concatenation as per your column requirements

# Convert the tokenized text into a string representation
dataset['Tokenized_Text'] = dataset['Tokenized_Text'].apply(lambda x: ' '.join(x))

# Apply count vectorization on the tokenized text
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(dataset['Tokenized_Text'])

# Convert the count matrix to a DataFrame
count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())

# Display the count matrix
print(count_df)

        10  12  13  15  16  18  1857  19  1947খ  19702010  ...  ৯৪৬৪  ৯৫  \
0        0   0   0   0   0   0     0   0      0         0  ...     0   0   
1        0   0   0   0   0   0     0   0      0         0  ...     0   0   
2        0   0   0   0   0   0     0   0      0         0  ...     0   0   
3        0   0   0   0   0   0     0   0      0         0  ...     0   0   
4        0   0   0   0   0   0     0   0      0         0  ...     0   0   
...     ..  ..  ..  ..  ..  ..   ...  ..    ...       ...  ...   ...  ..   
102322   0   0   0   0   0   0     0   0      0         0  ...     0   0   
102323   0   0   0   0   0   0     0   0      0         0  ...     0   0   
102324   0   0   0   0   0   0     0   0      0         0  ...     0   0   
102325   0   0   0   0   0   0     0   0      0         0  ...     0   0   
102326   0   0   0   0   0   0     0   0      0         0  ...     0   0   

        ৯৫৯৬  ৯৬  ৯৭  ৯৭৯৮  ৯৮  ৯৯  ৯৯৩  ৯৯৯  
0          0   0   0     0   0   0    0 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity
cosine_sim_matrix = cosine_similarity(count_matrix, count_matrix)

# Create a DataFrame from the cosine similarity matrix
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=dataset.index, columns=dataset.index)

# Display the cosine similarity matrix
print(cosine_sim_df)


NameError: ignored