Select all rows belonging to the first 300 distinct poster_id values in sample_310.csv and save the result in poster_test_300.csv

In [1]:
import pandas as pd

# Input CSV and the number of distinct posters to keep.
file_path = '/home/disk1/red_disk1/Multimodal_MKT/test/sample_310.csv'
num_posters = 300

# The output file name is derived from num_posters so the selection size,
# the file name and the log message below cannot drift apart.
output_name = f'poster_test_{num_posters}.csv'
output_path = '/home/disk1/red_disk1/test/' + output_name

# Load the CSV file
df = pd.read_csv(file_path)

# Take the first num_posters distinct poster_id values, in file order
distinct_poster_ids = df['poster_id'].drop_duplicates().head(num_posters)

# Keep every row whose poster_id is one of the selected values
# (a poster with several rows contributes all of them)
filtered_rows = df[df['poster_id'].isin(distinct_poster_ids)]

# Save the new DataFrame to a CSV file
filtered_rows.to_csv(output_path, index=False)

# Report the number of posters actually selected: it is smaller than
# num_posters when the input holds fewer distinct poster_id values.
print(f"New CSV file with rows corresponding to {len(distinct_poster_ids)} distinct poster_id saved as '{output_name}'")


New CSV file with rows corresponding to 300 distinct poster_id saved as 'poster_test_50.csv'


#### Keep fashion posts only

1. Filter rows containing fashion-related keywords in 'post_title' or 'post_content'
2. Remove posts that are missing any of post_date, post_like, post_collect, or post_comments
3. Set time range from 2023-06-01 to 2024-05-31 to ensure exactly one year

In [2]:
import pandas as pd
import re

# Define the path to the CSV file
file_path = '/home/disk1/red_disk1/Multimodal_MKT/test/sample_310.csv'

# Load the CSV file
df = pd.read_csv(file_path)

# Convert 'post_date' to datetime format; unparseable values become NaT
df['post_date'] = pd.to_datetime(df['post_date'], errors='coerce')

# Define the date range (both bounds inclusive)
start_date = '2023-06-01'
end_date = '2024-05-31'

# Compare on the calendar day rather than the raw timestamp: a bare
# `post_date <= '2024-05-31'` is evaluated against midnight and would drop
# every post made later on the final day when post_date carries a time.
# NaT rows compare False on both sides and are therefore removed here.
post_day = df['post_date'].dt.normalize()
df = df[(post_day >= start_date) & (post_day <= end_date)]

# Define fashion-related keywords
# NOTE(review): several entries are redundant (the case variants of #OOTD
# under IGNORECASE, hashtags that contain a shorter keyword such as "时尚");
# they are harmless and kept as-is.
fashion_keywords = [
    "时尚", "穿搭", "搭配", "潮流", "服装", "造型", "衣服", "新品", "裙子", "裤子", "连衣裙", 
    "上衣", "衬衫", "外套", "牛仔裤", "毛衣", "包包", "鞋子", "首饰", "帽子", "眼镜", "简约", 
    "休闲", "正式", "甜美", "酷", "时髦", "复古", "韩系", "日系", "轻奢", "#OOTD", "#Ootd", 
    "#ootd", "#时尚达人", "#穿搭指南", "#今日穿搭", "#潮流搭配", "#时尚博主", "#新品推荐", 
    "#街拍", "#每日穿搭", "#名牌", "夏装", "冬装", "春装", "秋装", "节日", "婚礼"
]

# Create a regex pattern for the fashion keywords. Each keyword is escaped so
# it is always matched literally, even if a future entry contains a regex
# metacharacter such as "+" or ".".
pattern = re.compile('|'.join(re.escape(keyword) for keyword in fashion_keywords), re.IGNORECASE)

# Filter rows containing fashion-related keywords in 'post_title' or 'post_content'
# (na=False: a missing title/content counts as "no match")
title_matches = df['post_title'].str.contains(pattern, na=False)
content_matches = df['post_content'].str.contains(pattern, na=False)
df_fashion = df[title_matches | content_matches]

# Remove rows with missing information in any of the specified columns
df_fashion_cleaned = df_fashion.dropna(subset=['post_date', 'post_like', 'post_collect', 'post_comments'])

# Count the number of distinct poster_id and post_id
num_distinct_poster_id = df_fashion_cleaned['poster_id'].nunique()
num_distinct_post_id = df_fashion_cleaned['post_id'].nunique()

# Print the counts
print(f"There are {num_distinct_poster_id} distinct poster_id in the filtered data.")
print(f"There are {num_distinct_post_id} distinct post_id in the filtered data.")

# Save the updated DataFrame to a new CSV file
output_path = '/home/disk1/red_disk1/Multimodal_MKT/test/poster_test_fashion.csv'
df_fashion_cleaned.to_csv(output_path, index=False)

print("Filtered data has been saved to", output_path)


There are 137 distinct poster_id in the filtered data.
There are 5482 distinct post_id in the filtered data.
Filtered data has been saved to /home/disk1/red_disk1/Multimodal_MKT/test/poster_test_fashion.csv


Select the data from the all_data folder based on poster_id and post_id in poster_test_fashion.csv

In [3]:
import pandas as pd
import os
import shutil

# Define the file paths
csv_file_path = '/home/disk1/red_disk1/Multimodal_MKT/test/poster_test_fashion.csv'
all_data_folder = '/home/disk1/red_disk1/all_data'
output_folder = '/home/disk1/red_disk1/Multimodal_MKT/test/data'

# Create the output folder if it doesn't exist (no separate existence check needed)
os.makedirs(output_folder, exist_ok=True)

# Load the CSV file. The ids are read as text: they are hex strings, and some
# (e.g. "6191e8000000000010007643") look like scientific notation, so they
# must never go through numeric type inference.
df = pd.read_csv(csv_file_path, dtype={'poster_id': str, 'post_id': str})

# One entry per distinct (poster_id, post_id) pair, so a folder is copied once
# even when the CSV has several rows for the same post. Rows with a missing id
# cannot name a folder and are skipped.
post_keys = df[['poster_id', 'post_id']].dropna().drop_duplicates()

copied_count = 0
missing_count = 0

# Iterate over each unique combination of poster_id and post_id
for poster_id, post_id in post_keys.itertuples(index=False, name=None):
    # Source: <all_data>/<poster_id>/<post_id>; destination mirrors that layout
    source_post_folder = os.path.join(all_data_folder, poster_id, post_id)
    dest_post_folder = os.path.join(output_folder, poster_id, post_id)

    if os.path.isdir(source_post_folder):
        # Copy the entire post_id folder; dirs_exist_ok makes a re-run after an
        # interruption overwrite in place instead of raising.
        shutil.copytree(source_post_folder, dest_post_folder, dirs_exist_ok=True)
        copied_count += 1
        print(f"Copied data for poster_id {poster_id}, post_id {post_id}")
    else:
        missing_count += 1
        print(f"Source folder for poster_id {poster_id}, post_id {post_id} does not exist")

print(f"Copied {copied_count} post folders; {missing_count} source folders were missing.")
print("Data selection completed.")

Copied data for poster_id 596e2a6c6a6a6906aa20764a, post_id 665985df00000000150103e9
Copied data for poster_id 596e2a6c6a6a6906aa20764a, post_id 66585111000000001303f660
Copied data for poster_id 596e2a6c6a6a6906aa20764a, post_id 6657236f000000000c0195f0
Copied data for poster_id 596e2a6c6a6a6906aa20764a, post_id 66568683000000001303dfbf
Copied data for poster_id 596e2a6c6a6a6906aa20764a, post_id 6653eb88000000001501251c
Copied data for poster_id 596e2a6c6a6a6906aa20764a, post_id 6652aac10000000015008a16
Copied data for poster_id 596e2a6c6a6a6906aa20764a, post_id 6652948d000000000c01a4f7
Copied data for poster_id 596e2a6c6a6a6906aa20764a, post_id 6651920f000000001303c673
Copied data for poster_id 596e2a6c6a6a6906aa20764a, post_id 665140e7000000000f00c620
Copied data for poster_id 6191e8000000000010007643, post_id 65cf8cac0000000007007b5c
Copied data for poster_id 5dff75100000000001001bb3, post_id 661f89140000000003020f2c
Copied data for poster_id 5dff75100000000001001bb3, post_id 65f94

KeyboardInterrupt: 

Count the number of poster_id directories in the all_data or test/data folder:

In [15]:
import os

# Folder to inspect; switch to the commented line to inspect the other folder
all_data_folder = '/home/disk1/red_disk1/all_data'
# all_data_folder = '/home/disk1/red_disk1/test/data'

# Get a list of all items in the folder
all_items = os.listdir(all_data_folder)

# Keep only directories (each one represents a poster_id)
poster_ids = [item for item in all_items if os.path.isdir(os.path.join(all_data_folder, item))]

# Count the number of poster_id directories
num_poster_ids = len(poster_ids)

# Collect the distinct post_ids found under all poster_id directories
unique_post_ids = set()

for poster_id in poster_ids:
    poster_id_path = os.path.join(all_data_folder, poster_id)
    post_files = os.listdir(poster_id_path)
    # An entry's post_id is its name with any file extension stripped
    unique_post_ids.update(os.path.splitext(post_file)[0] for post_file in post_files)

# Count the number of distinct post_ids
num_post_ids = len(unique_post_ids)

# Print the results, naming the folder that was actually scanned so the
# output stays correct whichever all_data_folder line is active.
print(f"There are {num_poster_ids} poster_id directories in {all_data_folder}.")
print(f"There are {num_post_ids} distinct post_id entries in {all_data_folder}.")



There are 10335 poster_id directories in the test/data folder.
There are 709613 distinct post_id files in the test/data folder.
