In [1]:
import pandas as pd
import re
from datetime import datetime

# Load the raw behaviors log: a tab-separated file that carries no header row
behaviors_tsv = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/behaviors.tsv'
df = pd.read_csv(behaviors_tsv, sep='\t', header=None)

# Column labels, in file order, for the five raw columns
# NOTE(review): the public MIND description names the 4th behaviors.tsv column
# the user's click "History" rather than a displayed-news list -- confirm that
# the "Displayed News List" label is intended.
header = [
    "Impression ID",
    "User ID",
    "Timestamp",
    "Displayed News List",
    "Impression List (Clicked Status)",
]

# Attach the labels to the DataFrame
df.columns = header

# 1. Discard rows that lack any of the fields the later steps rely on
# NOTE(review): if the 4th column is really the click history, this also drops
# every user who has no history yet -- confirm that is wanted.
required_columns = ['User ID', 'Displayed News List', 'Impression List (Clicked Status)']
df.dropna(subset=required_columns, inplace=True)

# 2. Whatever is still missing (only the remaining columns can be) becomes 'Unknown'
df.fillna(value='Unknown', inplace=True)

# 3. Turn the space-separated news IDs in 'Displayed News List' into a comma-separated string
df['Displayed News List'] = df['Displayed News List'].str.replace(' ', ',', regex=False)

# 4. Convert 'Impression List (Clicked Status)' to a dictionary where 'NewsID' is key and click status is value
def convert_to_dict(impression_list: str) -> dict:
    """Parse an impression string into a ``{news_id: click_status}`` dict.

    The input is a whitespace-separated sequence of ``<news_id>-<status>``
    tokens, e.g. ``"N1-1 N2-0"``. Insertion order follows token order; if a
    news ID occurs more than once, the last status wins.

    Args:
        impression_list: The raw impression string. An empty or
            whitespace-only string yields an empty dict.

    Returns:
        A dict mapping each news ID (str) to its click status (int).

    Raises:
        ValueError: If a token contains no ``-`` separator or its status
            part is not an integer literal.
    """
    impression_dict = {}
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so stray spaces never produce empty tokens.
    for impression in impression_list.split():
        # Cut at the LAST hyphen only, so a news ID that itself contains '-'
        # still separates cleanly from its status.
        news_id, click_status = impression.rsplit('-', 1)
        impression_dict[news_id] = int(click_status)  # Convert click status to integer
    return impression_dict

# Parse each row's impression string into its {news_id: click_status} mapping
# and keep the result in a new 'Impression Dictionary' column
df['Impression Dictionary'] = df['Impression List (Clicked Status)'].map(convert_to_dict)

# 5. Split the dictionary into two columns: Clicked News IDs and Not-Clicked News IDs
def split_clicked_not_clicked(impression_dict):
    """Partition the news IDs of one impression by click status.

    Args:
        impression_dict: Mapping of news ID to click status.

    Returns:
        A two-element ``pd.Series``: position 0 holds the list of IDs whose
        status equals 1, position 1 the list of IDs whose status equals 0.
        IDs keep the dict's iteration order; any other status is left out
        of both lists.
    """
    clicked, skipped = [], []
    for news_id, status in impression_dict.items():
        if status == 1:
            clicked.append(news_id)
        elif status == 0:
            skipped.append(news_id)
    return pd.Series([clicked, skipped])

# Run the splitter on every row; each call returns a two-element Series, and the
# combined result is assigned to the two new columns
clicked_split = df['Impression Dictionary'].apply(split_clicked_not_clicked)
df[['Clicked News IDs', 'Not-Clicked News IDs']] = clicked_split


# Write the cleaned frame to disk as CSV, leaving out the row index
cleaned_csv = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/Cleaned Datasets/cleaned_behavior_dataset.csv'
df.to_csv(cleaned_csv, index=False)

print("Data has been cleaned and saved to 'cleaned_behavior_dataset.csv'")

Data has been cleaned and saved to 'cleaned_behavior_dataset.csv'
