In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the three QuillVerse CSV exports (paths are relative to the notebook's
# working directory).
author_data = pd.read_csv("quillverse author data.csv")
blog_rating = pd.read_csv("quillverse blog data.csv")
medium_blog_data = pd.read_csv("quillverse blog likes.csv")

# A cell only renders the value of its LAST expression, so the first two bare
# `.head()` calls used to be silently discarded. `display` (injected into the
# namespace by IPython/Jupyter) renders a preview of every frame instead.
display(author_data.head(), blog_rating.head(), medium_blog_data.head())

In [None]:
# Source CSVs; the position in this list is the position in df_list
file_paths = [
    "quillverse author data.csv",  # -> df_list[0]
    "quillverse blog data.csv",    # -> df_list[1]
    "quillverse blog likes.csv",   # -> df_list[2]
]

# Read every CSV into its own DataFrame, keeping the order above
df_list = [pd.read_csv(path) for path in file_paths]

# Step 1: outer-join df_list[1] with df_list[2] on "blog_id".
# Step 2: outer-join that result with df_list[0] on "author_id".
# how="outer" keeps every row from both sides of each join.
merged_df = (
    df_list[1]
    .merge(df_list[2], on="blog_id", how="outer")
    .merge(df_list[0], on="author_id", how="outer")
)

# Values accepted by the "how" parameter:
#   - "inner": keep only rows with matches in both DataFrames (pandas default)
#   - "outer": keep all rows from both DataFrames
#   - "left":  keep all rows from the left DataFrame and matching rows from the right
#   - "right": keep all rows from the right DataFrame and matching rows from the left

merged_df.info()

In [None]:
# Drop the columns that are not needed downstream ('blog_img')
merged_df_dropped = merged_df.drop(columns=['blog_img'])
merged_df_dropped.head()

In [None]:
# Show the 'topic' value of the first row
merged_df_dropped['topic'].head(1)

In [6]:
# A recommendation engine based on PAIR-WISE similarity needs a numeric vector
# for every blog. The 'topic' column holds plain strings, which a model cannot
# consume directly, so TF-IDF is used to turn them into a document-term matrix.

from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(
    min_df=3,                 # ignore terms found in fewer than 3 documents
    max_features=None,        # do not cap the vocabulary size
    strip_accents='unicode',
    analyzer='word',
    # Fixed typo: the pattern was r'\w(1,)', i.e. one word character followed by
    # the literal text "1,", which matches almost no real token and leaves the
    # vocabulary (nearly) empty. r'\w{1,}' means "one or more word characters";
    # punctuation is never part of a token.
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),       # unigrams, bigrams and trigrams
    stop_words='english',     # drop common English words such as "is", "the", "a"
)

# Fill NaNs with empty strings so the vectorizer only ever receives str values
merged_df_dropped['topic'] = merged_df_dropped['topic'].fillna('')

In [None]:
# Fit + transform.
# NOTE: this rebinds `tfv`, so the options passed to the earlier TfidfVectorizer
# (stop words, n-gram range, accent stripping, token pattern) are NOT used here;
# the vectorizer that is actually fitted only sets min_df=1.
tfv = TfidfVectorizer(min_df=1)

# fit_transform yields a sparse matrix (mostly zeros, very few non-zero entries)
tfv_matrix = tfv.fit_transform(merged_df_dropped['topic'])

tfv_matrix.shape  # (number of records, number of features)

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pair-wise sigmoid-kernel similarity of every row against every row.
# scikit-learn computes tanh(gamma * <x, y> + coef0), so scores lie between
# -1 and 1 (not 0 and 1). With a single argument the matrix is compared with itself.
sig = sigmoid_kernel(tfv_matrix)
sig[0]  # scores of the first row against all rows

In [None]:
# Lookup table: blog title -> row label of that blog in merged_df_dropped
indices = pd.Series(
    data=merged_df_dropped.index,
    index=merged_df_dropped['blog_title'],
)
indices

In [None]:
# Row label stored for this title
indices.loc['Rights of a Daughter to Ancestral Property']

In [None]:
# Similarity scores of row 13 against every row.
# NOTE(review): 13 is presumably the position returned by the title lookup in the
# previous cell - confirm, or index through `indices` instead of hard-coding it.
sig[13, :]

In [None]:
# (position, score) pairs for the chosen title, in row order
[(position, score) for position, score in enumerate(sig[indices['Rights of a Daughter to Ancestral Property']])]

In [None]:
# Same (position, score) pairs, ordered from highest to lowest score
sorted(
    enumerate(sig[indices['Rights of a Daughter to Ancestral Property']]),
    key=lambda pair: pair[1],
    reverse=True,
)

In [14]:
#The logic of the last four cells is wrapped into one function that serves as our Recommendation System
def give_rec(title, sig=sig, top_n=6):
  """Return the titles of the blogs most similar to ``title``.

  Args:
    title: Blog title to look up in the notebook-level ``indices`` Series.
    sig: Pair-wise similarity matrix; row ``i`` holds the scores of row ``i``
      of ``merged_df_dropped`` against every row. Defaults to the ``sig``
      matrix that exists when this function is defined.
    top_n: How many recommendations to return (default 6, as before).

  Returns:
    A ``blog_title`` Series taken from ``merged_df_dropped``, ordered from the
    highest to the lowest similarity score. The queried row itself is never
    part of the result.

  Raises:
    KeyError: If ``title`` is not present in ``indices``.
  """
  idx = indices[title]   # Row position of the requested blog
  # A title that occurs on several rows makes the lookup return a Series;
  # use its first row so that `sig[idx]` is a single row of scores.
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]

  # Exclude the queried row explicitly. The previous `[1:7]` slice assumed the
  # blog always ranks first against itself, which does not hold when scores tie
  # (e.g. rows with identical or empty text).
  sig_scores = [(pos, score) for pos, score in enumerate(sig[idx]) if pos != idx]
  # list.sort is stable, so tied scores keep their original row order
  sig_scores.sort(key=lambda pair: pair[1], reverse=True)

  blog_indices = [pos for pos, _ in sig_scores[:top_n]]   # positions of the top_n matches
  return merged_df_dropped['blog_title'].iloc[blog_indices]

In [None]:
# Try the recommender on a sample title
give_rec(title='Role of Productivity Apps')