In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

In [None]:

# Read the CSV file and limit to the first 5000 rows
items = pd.read_csv('train_users.csv')
# Count the number of occurrences for each category in the 'title_fa_category' column
category_counts = items['title_fa_category'].value_counts()
# Display the counts for each category
print(category_counts)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, StructType, StructField, StringType
from pyspark.ml.feature import VectorAssembler, StandardScaler
import numpy as np

# Initialize Spark session
spark = SparkSession.builder.appName("DistributedIncrementalCosineSimilarity").getOrCreate()

# Define schema for the sample data
schema = StructType([
    StructField("id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("rate", FloatType(), True)
])

# Hardcoded sample data
data = [
    ("1", "Item1", 4.0),
    ("2", "Item2", 5.0),
    ("3", "Item3", 3.0),
    ("4", "Item4", 2.0),
    ("5", "Item5", 4.5),
    ("6", "Item6", 3.5),
    ("7", "Item7", 4.0),
    ("8", "Item8", 5.0),
    ("9", "Item9", 2.5),
    ("10", "Item10", 3.5)
]

# Create DataFrame from sample data
df = spark.createDataFrame(data, schema=schema)

# Display the DataFrame
print("Initial DataFrame:")
df.show()

# Convert 'rate' column to FloatType (already correct in this case)
df = df.withColumn("rate", col("rate").cast(FloatType()))

# Vectorize the rate column
assembler = VectorAssembler(inputCols=["rate"], outputCol="features")
df = assembler.transform(df)

# Display DataFrame after vectorization
print("DataFrame after VectorAssembler:")
df.select("rate", "features").show()

# Normalize the ratings
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)

# Fit the scaler and transform the data
try:
    scaler_model = scaler.fit(df)
    df = scaler_model.transform(df)
except Exception as e:
    print(f"Error occurred during StandardScaler fit: {e}")

# Display DataFrame after scaling
print("DataFrame after StandardScaler:")
df.select("scaled_features").show()

# Function to calculate cosine similarity incrementally
def cosine_similarity_incremental(chunk_df, broadcast_chunk):
    similarities = []
    chunk_array = np.array(chunk_df.select("scaled_features").rdd.map(lambda x: x[0].toArray()).collect())
    broadcast_array = np.array(broadcast_chunk.value.select("scaled_features").rdd.map(lambda x: x[0].toArray()).collect())
    
    for i, vec1 in enumerate(chunk_array):
        for j, vec2 in enumerate(broadcast_array):
            if i != j:  # Skip self-similarity
                sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
                similarities.append((chunk_df.collect()[i]['id'], broadcast_chunk.value.collect()[j]['id'], sim))
    return similarities

# Split data into chunks and compute similarities
num_chunks = 2
total_rows = df.count()
chunk_size = total_rows // num_chunks

# Initialize a list to store all similarities
all_similarities = []

# Create chunks
for i in range(num_chunks):
    # Create a chunk DataFrame
    lower_bound = i * chunk_size
    upper_bound = (i + 1) * chunk_size
    chunk_df = df.limit(upper_bound).subtract(df.limit(lower_bound))
    
    # Broadcast the chunk for similarity computation
    broadcast_chunk = spark.sparkContext.broadcast(chunk_df.collect())
    
    # Compute similarities for the current chunk
    similarities = cosine_similarity_incremental(chunk_df, broadcast_chunk)
    all_similarities.extend(similarities)

# Print out a few examples of the computed similarities
print("Computed Similarities:")
for sim in all_similarities[:10]:  # Display first 10 similarities as an example
    print(f"Item {sim[0]} and Item {sim[1]} have similarity {sim[2]:.4f}")

# Stop the Spark session
spark.stop()


In [1]:
import pandas as pd

# Read the CSV files into DataFrames
comments_df = pd.read_csv('digikala/2-comments.csv')
products_df = pd.read_csv('digikala/5-products.csv',low_memory=False)

In [2]:
#removing unnecessary data
#products_df = products_df.drop(columns=['product_attributes'])

new_comments_df = comments_df.groupby('product_id', as_index=False).agg({
    'title': lambda x: ' '.join(str(i) for i in x.unique()),  # Convert to string and concatenate unique titles
    'comment': lambda x: ' '.join(str(i) for i in x),          # Convert to string and concatenate all comments
    'advantages': lambda x: ' '.join(str(i) for i in x),       # Convert to string and concatenate all advantages
    'disadvantages': lambda x: ' '.join(str(i) for i in x)    
})
print(new_comments_df['comment'][0])


راستش رو بخواین وقتی رفتم کارتریجش زرو شارژ کنم یه فردی که ۳۰ سال کارش پرینتر و از این جور چیزا بود بهم گفت پرینتر ماهی خریدی...
حالا دیگه تصمیم با شما برای کارهای اداری عالیه و به درد اونایی که زیاد پرینت میگیرن بیشتر میخوره حرف نداره برای کارای سنگین محشره بهترین پرینتری که تا به حال استفاده کردم
کم مصرف و بصرفه . لوازم مصرفی با پایینترین هزینه تعویض و سرویس میشه
در یک کلام بهترین برای پرینت تعداد بالا
اما قیمت الان بالاس
 4 سال پیش اکبند خریدم 480 تومن با کارتریج اصلی . سه تا کارتریج دیگه هم همراهش خریدم و تا الان فقط با همین کارتریجها استفاده کردم . فقط شارژ کردم و لوازم مصرفی عوض کردم و سرویس و مجدد پرینت زدم.
دفتر فنی و امور اینترنتی دارم و تعداد پرینتم بالاس . البته در کنارش یک دستگاه کپی شارپ هم دارم ولی میانگین در روز بیش از 200 برگ پرینت رو باهاش میزنم . فوق العادس
تنها ایرادش نداشتن wifi هست بسیار عالی
پرینتری بینظیر


In [3]:
new_comments_df.to_csv('new_comment.csv', index=False)

In [4]:

# Merge the DataFrames on 'product_id' from comments_df and 'id' from products_df
merged_df = pd.merge( products_df, new_comments_df,left_on='id', right_on='product_id')

# Optional: Save the resulting DataFrame to a new CSV file
merged_df.to_csv('merged_output.csv', index=False)


In [5]:
merged_df = pd.read_csv('merged_output.csv')

# Count the number of unique product IDs
unique_product_ids = merged_df['product_id'].nunique()

print(f"Number of unique product IDs: {unique_product_ids}")

Number of unique product IDs: 5708


In [6]:
merged_df = merged_df.fillna('')


In [7]:

import string

merged_df['combined_features'] = merged_df['product_title_en'] + ' ' + merged_df['category_title_fa']+ ' ' + merged_df['title_alt']+ ' ' + merged_df['category_keywords']+ ' ' + merged_df['brand_name_fa'] + ' ' + merged_df['brand_name_en']
#print(merged_df['combined_features'])
nan_product_id = merged_df['product_title_en'].isna().sum()
nan_combined_features = merged_df['combined_features'].isna().sum()
# Remove all punctuation
merged_df['combined_features'] = merged_df['combined_features'].str.replace(f'[{string.punctuation}]', '', regex=True)

print(nan_product_id, nan_combined_features)
print(merged_df['combined_features'])
print(merged_df.shape)

0 0
0       Savoy Burn Relief Spray 50ml کرم و ژل ترمیم کن...
1        ساعت مچی عقربه‌ای ساعت،ساعت زنانه،ساعت ارزان،...
2        کیف پول کیف ، کارت ، کارت ویزیت ، کارت بانکی ...
3        رو بالشی روبالشی ، کالای خواب ، روتختی ، کودک...
4        مایو مایو مردانه مایو ورزشیشلوارک ورزشیشلوارک...
                              ...                        
5703     اسپری اسپری،خوشبو کننده،بدن،مردانه،ضد تعریق،د...
5704    Unique 1727 Flask 06 Liter کلمن و فلاسک فلاسک ...
5705     آبکش و آبگیر  آبکش و آبگیر VirgoBasin متفرقه ...
5706     سایر لوازم تزئینی بادبزن بادبزن دستی باد بزن ...
5707     استند و پایه خنک کننده پایه خنک کننده سادیتا ...
Name: combined_features, Length: 5708, dtype: object
(5708, 16)


In [8]:

# Split the text into separate words
#merged_df['combined_features'] = merged_df['combined_features'].str.split()

#print(merged_df['combined_features'])

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['combined_features'])
print(tfidf_matrix.shape)
words = tfidf_vectorizer.get_feature_names_out()

# Print the words
print(words[9000])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=words)
#tfidf_df.to_csv('tfidf_matrix.csv', index=False)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Convert the cosine similarity matrix to a DataFrame for easy interpretation
similarity_df = pd.DataFrame(cosine_sim, index=merged_df['product_id'], columns=merged_df['product_id'])

# Sort each column individually in descending order


print(similarity_df)
print('723227' in similarity_df.index)


(5708, 14315)
جولری
product_id    723227    725786  725253    727088    724768    724514  \
product_id                                                             
723227      1.000000  0.000000     0.0  0.000000  0.000000  0.000000   
725786      0.000000  1.000000     0.0  0.030206  0.004061  0.774263   
725253      0.000000  0.000000     1.0  0.000000  0.000000  0.000000   
727088      0.000000  0.030206     0.0  1.000000  0.002981  0.000000   
724768      0.000000  0.004061     0.0  0.002981  1.000000  0.000000   
...              ...       ...     ...       ...       ...       ...   
785459      0.240759  0.008408     0.0  0.006170  0.015087  0.000000   
823647      0.000000  0.000000     0.0  0.000000  0.000000  0.000000   
732747      0.000000  0.008316     0.0  0.006103  0.004687  0.000000   
731861      0.000000  0.004249     0.0  0.003118  0.002395  0.000000   
773275      0.055109  0.000000     0.0  0.000000  0.000000  0.000000   

product_id    727638    724295    727310   

In [10]:
target_product_id = 726479

# Extract similarity scores for the target product_id
similarities = similarity_df[target_product_id]

# Drop the target product_id from the results (since it's the most similar to itself)
#similarities = similarities.drop(target_product_id)

# Sort the similarity scores in descending order and get the top 2 items
top_similar_items = similarities.sort_values(ascending=False).head(5)

# Print the top 2 most similar items
print("Top 2 most similar items to item", target_product_id)
print(top_similar_items)

Top 2 most similar items to item 726479
product_id
768343    1.0
827011    1.0
827018    1.0
725356    1.0
744605    1.0
Name: 726479, dtype: float64


In [11]:
###cleaning comment data
import re
# Replace 'nan' text with empty string
merged_df['advantages'] = merged_df['advantages'].str.replace('nan', '', regex=False)  # Remove 'nan'
merged_df['disadvantages'] = merged_df['disadvantages'].str.replace('nan', '', regex=False)

# Remove '\r' characters
merged_df['advantages'] = merged_df['advantages'].str.replace('r', '', regex=False)
merged_df['disadvantages'] = merged_df['disadvantages'].str.replace('r', '', regex=False)

merged_df['advantages'] = merged_df['advantages'].str.replace('\\', '', regex=False)
merged_df['disadvantages'] = merged_df['disadvantages'].str.replace('\\', '', regex=False)

merged_df['advantages'] = merged_df['advantages'].fillna('')
merged_df['disadvantages'] = merged_df['disadvantages'].fillna('')
# Display the cleaned DataFrame
merged_df['advantages'] = merged_df['advantages'].apply(lambda x: re.sub(r'[^\w\s,]', '', x))  # Remove all punctuation except ','
merged_df['disadvantages'] = merged_df['disadvantages'].apply(lambda x: re.sub(r'[^\w\s,]', '', x))  # Remove all punctuation except ','

merged_df['advantages'] = merged_df['advantages'].str.replace(',', ' ', regex=False)
merged_df['disadvantages'] = merged_df['disadvantages'].str.replace(',', ' ', regex=False)

print(merged_df['advantages'])

0                          
1                          
2                          
3        قیمت مناسب زیپ دار
4         قیمت عالی و کیفیت
               ...         
5703               ماندگاری
5704                   زیبا
5705                       
5706                   عالی
5707                       
Name: advantages, Length: 5708, dtype: object


In [12]:
merged_df['combined_text'] = merged_df['advantages'] + ' ' + merged_df['disadvantages'] + ' ' + merged_df['comment']

# Strip any leading/trailing whitespace
merged_df['combined_text'] = merged_df['combined_text'].str.strip()
print(merged_df['combined_text'])

0       این اسپری دارای فرمول انحصاری برای کشور انگلست...
1       با سلام چند وقتی خریداری کردم موتور داغون هنوز...
2                                     خوبه ارزش خرید داره
3       قیمت مناسب زیپ دار   دو تیکه پارچه به شکل کامل...
4             قیمت عالی و کیفیت  جنس و کیفیت وقیمتش عالیه
                              ...                        
5703    ماندگاری بوی معمولی این اسپری بوی همه چیز میده...
5704            زیبا ندارد خیلی خیلی آب رو گرم نگه میدارد
5705               خیلی چیز خوبیه راضیم خیلی به درد بخوره
5706    عالی   فوق العاده بی کیفیت سلام ممنون ازبابت ب...
5707    عایه من خریدم خیلی راضی هستم ازش سلام دوستان ک...
Name: combined_text, Length: 5708, dtype: object


In [13]:

from hazm import *

# Initialize Hazm components
normalizer = Normalizer()
word_tokenizer = WordTokenizer()
lemmatizer = Lemmatizer()
stopwords = stopwords_list()

# Step 1: Normalize the text in the 'combined_text' column
merged_df['normalized_text'] = merged_df['combined_text'].apply(lambda x: normalizer.normalize(x))

# Step 2: Tokenize the normalized text into words
merged_df['tokenized_text'] = merged_df['normalized_text'].apply(lambda x: word_tokenizer.tokenize(x))

# Step 3: Remove stopwords from the tokenized text
merged_df['filtered_text'] = merged_df['tokenized_text'].apply(lambda words: [word for word in words if word not in stopwords])

# Step 4: Lemmatize the filtered words
merged_df['lemmatized_text'] = merged_df['filtered_text'].apply(lambda words: [lemmatizer.lemmatize(word) for word in words])

# Step 5: Combine the lemmatized words back into a single string
merged_df['processed_text'] = merged_df['lemmatized_text'].apply(lambda words: ' '.join(words))

# Display the DataFrame with the new processed text
print(merged_df[['product_id', 'processed_text']])


      product_id                                     processed_text
0         723227  اسپری فرمول انحصاری کشور انگلستان هست فوق‌العا...
1         725786  سلام خریداری موتور داغون یکهفته نشده خواب میمو...
2         725253                                خوبه ارزش خرید داره
3         727088  قیمت زیپ تیکه پارچه شکل ساده بهم دوخته سرش یه ...
4         724768                  قیمت کیفیت جنس کیفیت وقیمتش عالیه
...          ...                                                ...
5703      785459  ماندگاری بوی معمولی اسپری بوی میده جزء دیزایر بلو
5704      823647                           زیبا آب گرم نگه داشت#دار
5705      732747                                خوبیه راض درد بخوره
5706      731861  فوق‌العاده بی‌کیفیت سلام ممنون ازباب موقع سفار...
5707      773275  عایه خرید#خر راضی #هست ازش سلام دوستان کیفیت ،...

[5708 rows x 2 columns]


In [14]:
tfidf_vectorizer_features = TfidfVectorizer()
tfidf_matrix_features = tfidf_vectorizer_features.fit_transform(merged_df['combined_features'])
print(f"TF-IDF matrix shape for combined_features: {tfidf_matrix_features.shape}")

# Vectorize comments
tfidf_vectorizer_comments = TfidfVectorizer()
tfidf_matrix_comments = tfidf_vectorizer_comments.fit_transform(merged_df['combined_text'])
print(f"TF-IDF matrix shape for comments: {tfidf_matrix_comments.shape}")

# Calculate cosine similarity for combined_features
cosine_sim_features = cosine_similarity(tfidf_matrix_features, tfidf_matrix_features)

# Calculate cosine similarity for comments
cosine_sim_comments = cosine_similarity(tfidf_matrix_comments, tfidf_matrix_comments)

# Weighting
title_category_weight = 0.6
comments_weight = 0.4

# Normalize the weights (assuming we want the weights to add up to 1)
combined_similarity = (title_category_weight * cosine_sim_features +
                       comments_weight * cosine_sim_comments)

# Create a DataFrame from the combined similarity matrix
similarity_df2 = pd.DataFrame(combined_similarity, index=merged_df['product_id'], columns=merged_df['product_id'])

# Display the combined similarity matrix
print(similarity_df2)

TF-IDF matrix shape for combined_features: (5708, 14315)
TF-IDF matrix shape for comments: (5708, 17504)
product_id    723227    725786    725253    727088    724768    724514  \
product_id                                                               
723227      1.000000  0.000000  0.000000  0.011409  0.000000  0.000000   
725786      0.000000  1.000000  0.000000  0.028762  0.002437  0.464558   
725253      0.000000  0.000000  1.000000  0.056827  0.000000  0.019641   
727088      0.011409  0.028762  0.056827  1.000000  0.016512  0.002790   
724768      0.000000  0.002437  0.000000  0.016512  1.000000  0.000000   
...              ...       ...       ...       ...       ...       ...   
785459      0.183208  0.005045  0.000000  0.003702  0.009052  0.000000   
823647      0.004057  0.000000  0.000000  0.000000  0.000000  0.000000   
732747      0.000000  0.004989  0.000000  0.022956  0.002812  0.000000   
731861      0.019250  0.010817  0.000000  0.013338  0.038923  0.000000   
773275 

In [15]:
target_product_id = 726479

# Extract similarity scores for the target product_id
similarities = similarity_df2[target_product_id]

# Drop the target product_id from the results (since it's the most similar to itself)
#similarities = similarities.drop(target_product_id)

# Sort the similarity scores in descending order and get the top 2 items
top_similar_items = similarities.sort_values(ascending=False).head(5)

# Print the top 2 most similar items
print("Top 5 most similar items to item", target_product_id)
print(top_similar_items)

Top 5 most similar items to item 726479
product_id
726479    1.000000
744555    0.714731
725421    0.699731
827018    0.681821
768343    0.661208
Name: 726479, dtype: float64
