# Recommendation

### Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os

### Load dataset (user access)

In [None]:


# Directory containing the user-access CSV files
input_folder = 'C:/Users/eng3/Documents/pos/Tech Challenger 5/dataset/files/treino'

# Read every CSV in the folder. os.listdir() returns entries in arbitrary
# order, so the names are sorted to make the combined row order reproducible.
dataframes = []

for file_name in sorted(os.listdir(input_folder)):
    if file_name.endswith('.csv'):
        file_path = os.path.join(input_folder, file_name)
        df = pd.read_csv(file_path)
        dataframes.append(df)

# pd.concat() raises a generic "No objects to concatenate" ValueError on an
# empty list; raise the same exception type with a message naming the folder.
if not dataframes:
    raise ValueError(f"No CSV files found in {input_folder!r}")

combined_df = pd.concat(dataframes, ignore_index=True)

# Keep only the rows where every history-related column is present.
required_columns = ['history', 'timestampHistory', 'numberOfClicksHistory', 'timeOnPageHistory',
                    'scrollPercentageHistory', 'pageVisitsCountHistory']

# .copy() gives df_user_acc its own data, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning on the filtered frame.
df_user_acc = combined_df.dropna(subset=required_columns).copy()

df_user_acc.head()


### Load dataset (news metadata)

In [None]:


# Folder holding the news-metadata CSV files
input_folder = 'C:/Users/eng3/Documents/pos/Tech Challenger 5/dataset/itens/itens/filtered_output'

# Collect one DataFrame per CSV file; anything that is not a .csv is skipped
dataframes = []
for file_name in os.listdir(input_folder):
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(input_folder, file_name)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Stack all files into a single table with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)
df_news_acc = combined_df

df_news_acc.head()


In [None]:
# Step 1: Build a page -> article-type lookup from df_news_acc
# (when a page appears more than once, the last row wins)
page_to_article_type = dict(zip(df_news_acc['page'], df_news_acc['article-type']))

# Step 2: Define a faster function using the preprocessed dictionary
def process_values(row, mapping=None):
    """Translate a comma-separated string of page IDs into article types.

    Args:
        row: Comma-separated page IDs. All spaces are removed before the
            string is split on ",".
        mapping: Optional dict from page ID to article type. When omitted,
            the module-level ``page_to_article_type`` lookup is used.

    Returns:
        The article types joined with "," in the same order as the IDs.
        An ID that is missing from the mapping, or whose stored type is not
        a string, contributes "Unknown".
    """
    if mapping is None:
        mapping = page_to_article_type

    # Remove spaces and split history into IDs
    news_ids = row.replace(" ", "").split(',')

    article_types = []
    for news in news_ids:
        article_type = mapping.get(news, "Unknown")
        # str.join raises TypeError on non-string items (e.g. a NaN read from
        # an empty CSV cell). Treat such entries as "Unknown" individually
        # instead of discarding the types of the whole history.
        if not isinstance(article_type, str):
            article_type = "Unknown"
        article_types.append(article_type)

    # Join the result as a comma-separated string
    return ",".join(article_types)


# Step 3: Derive the article types for each row's 'history' string
history_column = df_user_acc['history']
df_user_acc['article_types'] = history_column.map(process_values)

print(df_user_acc.head())


In [None]:
# Write the enriched user table to disk (row index is not written)
output_path = 'proceded.csv'
df_user_acc.to_csv(output_path, index=False)

print(f"DataFrame exported to '{output_path}'")

#### Content-Based Recommendation

In [5]:
def _to_list(value, strip_spaces=False):
    """Split a comma-separated string into a list of items.

    Values that are not strings are returned unchanged, so running this cell
    a second time (when the columns already hold lists) is harmless.
    """
    if not isinstance(value, str):
        return value
    if strip_spaces:
        value = value.replace(" ", "")
    return value.split(',')


# 'history' is tokenized the same way process_values() does it (drop spaces,
# then split on ','), so the IDs line up whether the separator is ", " or ",".
df_user_acc['history'] = df_user_acc['history'].apply(
    lambda x: _to_list(x, strip_spaces=True)
)
# 'article_types' was joined with ',' only; spaces inside a type are kept.
df_user_acc['article_types'] = df_user_acc['article_types'].apply(_to_list)

In [None]:
# Preprocess multi-value columns



# Content-based recommender: unread articles whose type the user has read before
def recommend_content_based(user_id, users_data, articles_data):
    """Recommend unread articles that match the article types a user has read.

    Args:
        user_id: Value looked up in the 'userId' column of ``users_data``.
        users_data: DataFrame with 'userId', 'history' and 'article_types'
            columns; the latter two hold an iterable per row.
        articles_data: DataFrame with 'page', 'url', 'article-type' and
            'modified' columns.

    Returns:
        A list of dicts with the keys 'page', 'url' and 'article-type',
        ordered by descending 'modified'. Empty when the user is not found.
    """
    matches = users_data.loc[users_data['userId'] == user_id]
    if matches.empty:
        return []

    # Only the first matching row is used as the user's profile
    profile = matches.iloc[0]
    already_read = set(profile['history'])
    liked_types = set(profile['article_types'])

    # Keep articles that are unread AND of a type the user has read before
    is_unread = ~articles_data['page'].isin(already_read)
    has_liked_type = articles_data['article-type'].isin(liked_types)
    candidates = articles_data[is_unread & has_liked_type]

    # Highest 'modified' value first (presumably the most recently modified)
    ranked = candidates.sort_values(by='modified', ascending=False)

    return ranked[['page', 'url', 'article-type']].to_dict(orient='records')





# Try the recommender on a single user
# Example of a real ID:
#user_id = "f98d1132f60d46883ce49583257104d15ce723b3bbda2147c1e31ac76f0bf069"
user_id = "fake1"
recommendations = recommend_content_based(
    user_id=user_id,
    users_data=df_user_acc,
    articles_data=df_news_acc,
)
print("Recommended articles:", recommendations)

In [None]:
# Convert the delimited string columns of `data` into Python lists
# NOTE(review): `data` is not defined in the visible part of this notebook;
# confirm where it comes from before running this cell.
data['history'] = data['history'].apply(lambda visited: visited.split(', '))
data['timestampHistory'] = data['timestampHistory'].apply(
    lambda stamps: list(map(int, stamps.split(', ')))
)
# article_type is split on whitespace, not on commas
data['article_type'] = data['article_type'].apply(lambda types: types.split())

# Recommend articles that share at least one type with the user's article types
def recommend_articles(user_id, data, article_metadata):
    """Return IDs of unvisited articles whose types overlap the user's types.

    Args:
        user_id: Value looked up in the 'userId' column of ``data``.
        data: DataFrame with 'userId', 'history' and 'article_type' columns.
        article_metadata: Mapping of article ID to an iterable of its types.

    Returns:
        Article IDs in the iteration order of ``article_metadata``.
        Empty when the user is not found.
    """
    matches = data.loc[data['userId'] == user_id]
    if matches.empty:
        return []

    # Only the first matching row is used
    profile = matches.iloc[0]
    preferred_types = profile['article_type']
    seen = set(profile['history'])

    # One pass: keep an article when any of its types is preferred and the
    # user has not visited it yet
    return [
        article_id
        for article_id, types in article_metadata.items()
        if any(t in preferred_types for t in types) and article_id not in seen
    ]