In [25]:
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Load datasets
interactions = pd.read_csv('dataset/User_interaction.csv')
meta = pd.read_csv('datasetMetadata.csv')


In [27]:
# Preprocessing
meta['category'] = meta['category_name'].fillna('General')
meta['reading_time'] = meta['reading_time'].fillna(meta['reading_time'].median())


In [28]:
# Time-based split
interactions = interactions.sort_values('updated_at')
train = interactions.iloc[:int(0.75*len(interactions))]
test = interactions.iloc[int(0.75*len(interactions)):]


In [30]:
# Train SVD model
reader = Reader(rating_scale=(0, 100))
data = Dataset.load_from_df(train[['user_id', 'pratilipi_id', 'read_percent']], reader)
trainset = data.build_full_trainset()


In [31]:
svd = SVD(n_factors=50, n_epochs=20)
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2940c95c8b0>

In [32]:
# Create content features
tfidf = TfidfVectorizer()
category_matrix = tfidf.fit_transform(meta['category_name'])


In [33]:
scaler = MinMaxScaler()
meta['norm_reading_time'] = scaler.fit_transform(meta[['reading_time']])


In [46]:
class HybridRecommender:
    def __init__(self, svd_model, tfidf_matrix, meta):
        self.svd = svd_model
        self.tfidf_matrix = tfidf_matrix
        self.meta = meta
        self.pratilipi_ids = meta['pratilipi_id'].values
        
    def recommend(self, user_id, top_n=5):
        # Collaborative predictions
        collab_scores = np.array([self.svd.predict(user_id, pid).est for pid in self.pratilipi_ids])
        
        # Content-based scores
        user_history = train[train['user_id'] == user_id]['pratilipi_id']
        content_scores = self._get_content_scores(user_history)
        
        # Combine scores
        hybrid_scores = 0.7 * collab_scores + 0.3 * content_scores
        top_indices = hybrid_scores.argsort()[-top_n:][::-1]
        
        return self.pratilipi_ids[top_indices]
    
    def _get_content_scores(self, user_history):
        if user_history.empty:
            return np.zeros(len(self.pratilipi_ids))
            
        # Get indices of user history
        history_idx = [np.where(self.pratilipi_ids == pid)[0][0] for pid in user_history]
        
        # Calculate similarity
        sim_scores = self.tfidf_matrix[history_idx].mean(axis=0)
        return sim_scores + self.meta['norm_reading_time'].values


In [47]:
# Initialize recommender
recommender = HybridRecommender(svd, category_matrix, meta)

In [48]:
recommender.recommend(1)

array([1377786224233118, 1377786225918211, 1377786225918489,
       1377786225918759, 1377786225918845], dtype=int64)

In [38]:
def cold_start_recommendations(top_n=5):
    # Use content-based popularity (reading time + category diversity)
    popular = meta.sort_values(['reading_time', 'category'], ascending=[False, True])
    return popular['pratilipi_id'].head(top_n).tolist()

print("Popular recommendations:", cold_start_recommendations())

Popular recommendations: [1300795932448456, 291449283974856, 291449283974856, 940266639496904, 1090208461919944]


In [50]:
def generate_all_test_recommendations(test_data, top_n=5):
    """
    Generate recommendations for all users in test data
    Returns DataFrame with columns: [user_id, recommended_pratilipis]
    """
    results = []
    
    # Get unique test users
    test_users = test_data['user_id'].unique()
    
    for user_id in test_users:
        try:
            # Get recommendations
            recommendations = recommender.recommend(user_id, top_n)
            
            # Handle new users with cold start
            if len(recommendations) == 0:
                recommendations = get_cold_start_recommendations(top_n)
                
            results.append({
                'user_id': user_id,
                'recommended_pratilipis': recommendations
            })
            
        except Exception as e:
            print(f"Error processing user {user_id}: {str(e)}")
            results.append({
                'user_id': user_id,
                'recommended_pratilipis': []
            })
    
    return pd.DataFrame(results)

def get_cold_start_recommendations(top_n=5):
    """Fallback recommendations for new users"""
    return meta.sort_values('reading_time', ascending=False)['pratilipi_id'].head(top_n).tolist()

# Generate recommendations for all test users
test_recommendations = generate_all_test_recommendations(test)

# Save to CSV
test_recommendations.to_csv('test_recommendations.csv', index=False)

# Display sample results
print("Sample recommendations:")
print(test_recommendations.head())

KeyboardInterrupt: 