In [None]:
pip install nltk
pip install TextBlob


In [None]:
import pandas as pd
from nltk.corpus import stopwords
import re
from textblob import TextBlob

def convert_to_float(price):
    """Convert a rupee price such as ``'₹1,099'`` into a float.

    Args:
        price: Price text, optionally carrying the ``₹`` symbol and comma
            thousands separators. Values that are already numeric are
            accepted as well.

    Returns:
        float: The numeric value of ``price``.

    Raises:
        ValueError: If the cleaned text cannot be parsed as a float.
    """
    # str() lets already-numeric cells pass through instead of failing on .replace().
    cleaned = str(price).replace('₹', '').replace(',', '')
    return float(cleaned)

_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not bundled with the nltk package; fetch it on first use.
            import nltk
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _STOP_WORDS_CACHE['english'] = frozenset(words)
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Normalise free text for vectorisation.

    Lower-cases ``text``, replaces every character that is not a letter,
    digit or whitespace with a space, drops English stop words and joins the
    remaining tokens with single spaces.

    Args:
        text (str): Raw text.

    Returns:
        str: The cleaned text (may be empty).
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Cached lookup: this function is applied row by row, so the corpus must not
    # be re-read on every call.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

def analyze_sentiment(text):
    """Label ``text`` as 'Positive', 'Negative' or 'Neutral' from its TextBlob polarity.

    A polarity above 0.1 is 'Positive', below -0.1 is 'Negative', and anything
    in between (both bounds included) is 'Neutral'.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:
        return 'Positive'
    if polarity < -0.1:
        return 'Negative'
    return 'Neutral'

def preprocess_data(data_file_path):
    """Load the product/review CSV and return a cleaned, typed DataFrame.

    Processing steps:
      * drop rows with any missing value, then exact duplicate rows, and
        renumber the index so labels equal row positions;
      * parse the price, discount, rating and rating-count columns to numbers
        (ratings that cannot be parsed become NaN);
      * clean the free-text columns with ``clean_text``;
      * add ``category_text`` (cleaned category string), turn ``category`` into
        a list split on ``'|'``, and add a ``sentiment`` label derived from the
        cleaned review text.

    Args:
        data_file_path: Path (or any source ``pd.read_csv`` accepts) of the CSV.

    Returns:
        pandas.DataFrame: The preprocessed data with a fresh 0..n-1 index.
    """
    df = pd.read_csv(data_file_path)
    # Without the reset, dropped rows leave gaps in the index, so label lookups
    # such as `df['col'][0]` and positional lookups (`.iloc`, similarity-matrix
    # rows) stop referring to the same rows.
    df = df.dropna().drop_duplicates().reset_index(drop=True)

    df['discounted_price'] = df['discounted_price'].apply(convert_to_float)
    df['actual_price'] = df['actual_price'].apply(convert_to_float)
    df['discount_percentage'] = df['discount_percentage'].str.replace('%', '', regex=False).astype(float)
    # regex=False: as a regular expression '|' matches the empty string, which
    # makes the replacement a no-op on pandas versions that default to regex.
    df['rating'] = pd.to_numeric(
        df['rating'].astype(str).str.replace('|', '', regex=False),
        errors='coerce',
    )
    df['rating_count'] = df['rating_count'].str.replace(',', '', regex=False).astype(int)

    df['product_name'] = df['product_name'].apply(clean_text)
    df['about_product'] = df['about_product'].apply(clean_text)
    df['review_content'] = df['review_content'].apply(clean_text)
    # Keep a cleaned text version of the category before it is split into a list.
    df['category_text'] = df['category'].apply(clean_text)

    df['category'] = df['category'].apply(lambda x: x.split('|') if pd.notnull(x) else x)
    df['sentiment'] = df['review_content'].apply(analyze_sentiment)

    return df

# Relative path of the raw CSV that is passed to preprocess_data().
filepath = 'amazon.csv'

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


def feature_engineering(data):
    """Build the text-similarity matrix and the per-product mean-rating table.

    Side effects:
        Adds (or overwrites) the ``combined_text`` and ``encoded_sentiment``
        columns on ``data`` itself.

    Args:
        data (pandas.DataFrame): Frame with ``product_id``, ``product_name``,
            ``category_text``, ``about_product``, ``review_content``,
            ``sentiment`` and ``rating`` columns.

    Returns:
        tuple: ``(cosine_sim, product_user_matrix)`` where ``cosine_sim`` is the
        pairwise cosine similarity between the rows' TF-IDF vectors and
        ``product_user_matrix`` is a frame indexed by ``product_id`` with a
        ``rating`` column holding the mean rating.
    """
    # Concatenate the text columns left to right, separated by single spaces.
    text_columns = ['product_name', 'category_text', 'about_product', 'review_content']
    combined = data[text_columns[0]]
    for column in text_columns[1:]:
        combined = combined + ' ' + data[column]
    data['combined_text'] = combined

    tfidf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2, ngram_range=(1, 1))
    tfidf_matrix = tfidf.fit_transform(data['combined_text'])

    # Integer-encoded sentiment label; stored on the frame but not returned.
    data['encoded_sentiment'] = LabelEncoder().fit_transform(data['sentiment'])

    cosine_sim = cosine_similarity(tfidf_matrix)

    mean_ratings = data.pivot_table(index='product_id', values='rating', aggfunc='mean')
    product_user_matrix = mean_ratings.fillna(data['rating'].mean())

    return cosine_sim, product_user_matrix

def hybrid_recommendation(data, product_id, top_n=10):
    """Recommend products that resemble ``product_id`` by text and by mean rating.

    Two candidate lists are built and merged:
      * content-based: the ``top_n`` rows whose TF-IDF text is most similar to
        the queried product's first row;
      * rating-based: the ``top_n`` products whose mean rating is closest to the
        queried product's mean rating (skipped when the product has no row in
        the rating table).

    Rows belonging to the queried product itself are never recommended, and
    the result keeps ranking order (content-based first) without duplicates.

    Args:
        data (pandas.DataFrame): Preprocessed frame accepted by
            ``feature_engineering``; it is modified by that call.
        product_id: Value to look up in ``data['product_id']``.
        top_n (int): Maximum number of candidates taken from each list.

    Returns:
        pandas.DataFrame: ``product_id``, ``product_name`` and ``rating`` of the
        recommended rows.

    Raises:
        IndexError: If ``product_id`` does not occur in ``data``.
    """
    product_ids = data['product_id'].tolist()

    # Map each product id to the POSITION of its first row. The similarity
    # matrix and .iloc are positional, so index labels must not be used here.
    first_row = {}
    for position, pid in enumerate(product_ids):
        first_row.setdefault(pid, position)

    if product_id not in first_row:
        raise IndexError(f"product_id {product_id!r} not found in data")
    idx = first_row[product_id]

    cosine_sim, product_user_matrix = feature_engineering(data)

    # Content-based filtering: rank all rows by similarity to the query row and
    # skip every row of the queried product explicitly, rather than assuming it
    # sorts first.
    row_scores = cosine_sim[idx]
    ranked_rows = sorted(range(len(row_scores)), key=lambda pos: row_scores[pos], reverse=True)
    content_rows = [pos for pos in ranked_rows if product_ids[pos] != product_id][:top_n]

    # Rating-based filtering: products whose mean rating is nearest to the query's.
    rating_rows = []
    if product_id in product_user_matrix.index:
        current_rating = product_user_matrix.loc[product_id, 'rating']
        other_ratings = product_user_matrix['rating'].drop(product_id)
        # Stable sort keeps ties in a deterministic order.
        closest_ids = (other_ratings - current_rating).abs().sort_values(kind='mergesort').index[:top_n]
        rating_rows = [first_row[pid] for pid in closest_ids]

    # dict.fromkeys de-duplicates while preserving ranking order.
    combined_rows = list(dict.fromkeys(content_rows + rating_rows))

    return data.iloc[combined_rows][['product_id', 'product_name', 'rating']].copy()

In [None]:
# Run the preprocessing pipeline on the raw CSV, then display the resulting frame.
df = preprocess_data(filepath)
df

In [None]:

# Product id used as the example query in the cells below.
sample_pid = 'B07JW9H4J1'
# Display the recommendations returned for that product.
hybrid_recommendation(df, sample_pid)

In [None]:
# Review-level view of the data: reviewer and review columns only.
ratings_df = df[['user_id', 'user_name', 'review_id', 'review_title', 'review_content']]

# Peek at the first review by position. A label-based `[0]` raises KeyError
# whenever the row labelled 0 was dropped during preprocessing.
ratings_df['review_content'].iloc[0]

In [None]:
# Display the review-level frame.
ratings_df

In [None]:
# Mean rating per product_id (the same aggregation feature_engineering() builds);
# any NaN means are filled with the overall mean rating.
df.pivot_table(index='product_id', values='rating', aggfunc='mean').fillna(df['rating'].mean())

In [None]:
# Product-level frame: drop the reviewer/review identifier columns and build the
# space-separated text field that the vectoriser is later fitted on.
products = (
    df
    .drop(columns=['user_id', 'user_name', 'review_id', 'review_title'])
    .assign(
        combined_text=lambda frame: (
            frame['product_name'] + ' ' + frame['category_text'] + ' '
            + frame['about_product'] + ' ' + frame['review_content']
        )
    )
)
products

In [None]:
# Preview the record (list-of-dicts) form on a few rows only; dumping every
# product as plain text floods the cell output.
products.head().to_dict('records')

In [None]:
# Order products from most- to least-reviewed and show the ten with the highest rating_count.
most_popular = products.sort_values(by='rating_count', ascending=False)
most_popular.head(10)

In [None]:
# `category` holds Python lists, which are unhashable and make drop_duplicates()
# raise TypeError; compare the category paths as tuples instead.
most_popular['category'].map(lambda path: tuple(path) if isinstance(path, list) else path).drop_duplicates()

In [None]:
# The parent category is the first element of each row's category list.
most_popular['parent_category'] = [path[0] for path in most_popular['category']]

In [None]:
# One row per parent_category: first() takes, for each column, the first non-null
# value within the group in most_popular's current row order.
most_popular.groupby('parent_category').first()

In [None]:
# Record (list-of-dicts) form of the ten most-reviewed products.
most_popular.head(10).to_dict(orient='records')

In [None]:
# All rows that share the product_id of the first row. The first row is taken by
# position (.iloc) so the lookup does not depend on a row labelled 0 existing.
df[df['product_id'] == df['product_id'].iloc[0]]

In [None]:
# Index label of the first df row whose product_id equals sample_pid.
df.index[df['product_id'] == sample_pid][0]

In [None]:
# Display the product-level frame.
products

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

# Fit a TF-IDF vocabulary on the products' combined text. `vectorizer` is reused
# by later cells to read the vocabulary and to transform new text.
vectorizer = TfidfVectorizer(stop_words='english')
X1 = vectorizer.fit_transform(products["combined_text"])
# Display the resulting document-term matrix.
X1

In [None]:
from sklearn.decomposition import PCA 
# Project the TF-IDF matrix onto two principal components (used by the elbow
# search below). toarray() densifies the sparse matrix first; random_state keeps
# the projection reproducible when a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
reduced_data = pca.fit_transform(X1.toarray())


In [None]:
X = X1

# Exploratory 10-cluster fit on the full TF-IDF matrix. Seeded so the cluster
# labels are the same on every run.
kmeans = KMeans(n_clusters=10, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(X)

# One dot per row: x = row position in X, y = assigned cluster label.
plt.plot(y_kmeans, ".")
plt.xlabel("Row position")
plt.ylabel("Cluster label")
plt.title("KMeans cluster assignment per row")
plt.show()

In [None]:
from sklearn.cluster import KMeans
# Elbow search: fit KMeans on the 2-D PCA projection for k = 1..14 and record
# each fit's inertia (within-cluster sum of squares).
# NOTE(review): this search runs on `reduced_data`, while the final model further
# down is fitted on the full TF-IDF matrix — confirm that is intended.
wcss = []

for i in range(1, 15):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(reduced_data)
    wcss.append(kmeans.inertia_)



In [None]:
# Elbow plot. The x-axis is derived from len(wcss) so it always matches the
# number of fits that were actually recorded.
plt.plot(range(1, len(wcss) + 1), wcss, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (inertia)')
plt.title('Elbow method')
plt.show()

In [None]:
def print_cluster(i, n_terms=10):
    """Print the top terms of cluster ``i``.

    Reads the notebook-level ``order_centroids`` (per-cluster term indices) and
    ``terms`` (vocabulary), which must be defined before this is called.

    Args:
        i (int): Cluster number (row of ``order_centroids``).
        n_terms (int): How many leading terms to print. Defaults to 10.
    """
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :n_terms]:
        print(' %s' % terms[ind])

In [None]:
true_k = 15

# Seeded: with a single initialisation (n_init=1) an unseeded fit numbers the
# clusters differently on every run, which breaks the hard-coded cluster id
# used further down.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)
model.fit(X1)

print("Top terms per cluster:")
# For each centroid, vocabulary column indices sorted by descending weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    print_cluster(i)

In [None]:
# Spot-check the vocabulary entry at index 12200.
# NOTE(review): this index is hard-coded and fails if the vocabulary has fewer entries.
terms[12200]

In [None]:
def show_recommendations(product):
    """Predict the cluster of a text, print it with the cluster's top terms, and return it.

    Uses the notebook-level ``vectorizer`` and ``model``.

    Args:
        product (str): Text to classify (e.g. a product name or keyword).

    Returns:
        The predicted cluster label.
    """
    cluster_labels = model.predict(vectorizer.transform([product]))
    print(cluster_labels)

    cluster_id = cluster_labels[0]
    print_cluster(cluster_id)
    return cluster_id

In [None]:
# Example: cluster predicted for the keyword "food", with that cluster's top terms.
show_recommendations("food")


In [None]:
# Assign every product to a cluster with one batched transform + predict call.
# Calling show_recommendations per row yields the same labels but prints the
# cluster and its top terms for every product.
products['cluster'] = model.predict(vectorizer.transform(products['product_name']))


In [None]:
# Display the cluster assigned to each product.
products['cluster']

In [None]:
# Display the product frame, now including the `cluster` column.
products

In [None]:
# Products assigned to cluster 7.
# NOTE(review): the id 7 is hard-coded; which topic it denotes depends on the KMeans fit.
product_in_same_cluster = products.loc[products['cluster'] == 7]
product_in_same_cluster

In [None]:
# Rank the products of the selected cluster from most- to least-reviewed.
product_in_same_cluster.sort_values('rating_count', ascending=False)

In [None]:
# Look up the row(s) for product id 'B09GFPN6TP'.
products[products['product_id'] == 'B09GFPN6TP']