Select 200 distinct rows based on poster_id from poster_got_50w.csv and save the result in poster_test_200.csv

In [1]:
import pandas as pd

# Define the file path
file_path = '/home/disk1/red_disk1/poster_got_50w.csv'

# Load the CSV file
df = pd.read_csv(file_path)

# Select 50 distinct rows based on poster_id
distinct_rows = df.drop_duplicates(subset=['poster_id']).head(300)

# Save the new DataFrame to a CSV file
output_path = '/home/disk1/red_disk1/test/poster_test_300.csv'
distinct_rows.to_csv(output_path, index=False)

print("New CSV file with 200 distinct rows saved as 'poster_test_300.csv'")


New CSV file with 200 distinct rows saved as 'poster_test_300.csv'


#### Keep fashion posts only

1. Filter rows containing fashion-related keywords in 'post_title' or 'post_content'
2. Remove posts with no post_date or post_like or post_collect or post_comments
3. Set time range from 2023-06-01 to 2024-05-31 to ensure exactly one year

In [2]:
import pandas as pd
import re

# Define the path to the CSV file
file_path = '/home/disk1/red_disk1/test/poster_test_300.csv'

# Load the CSV file
df = pd.read_csv(file_path)

# Convert 'post_date' to datetime format
df['post_date'] = pd.to_datetime(df['post_date'], errors='coerce')

# Define the date range
start_date = '2023-06-01'
end_date = '2024-05-31'

# Filter rows based on the date range
df = df[(df['post_date'] >= start_date) & (df['post_date'] <= end_date)]

# Define fashion-related keywords
fashion_keywords = [
    "时尚", "穿搭", "搭配", "潮流", "服装", "造型", "衣服", "新品", "裙子", "裤子", "连衣裙", 
    "上衣", "衬衫", "外套", "牛仔裤", "毛衣", "包包", "鞋子", "首饰", "帽子", "眼镜", "简约", 
    "休闲", "正式", "甜美", "酷", "时髦", "复古", "韩系", "日系", "轻奢", "#OOTD", "#Ootd", 
    "#ootd", "#时尚达人", "#穿搭指南", "#今日穿搭", "#潮流搭配", "#时尚博主", "#新品推荐", 
    "#街拍", "#每日穿搭", "#名牌", "夏装", "冬装", "春装", "秋装", "节日", "婚礼"
]

# Create a regex pattern for the fashion keywords
pattern = re.compile('|'.join(fashion_keywords), re.IGNORECASE)

# Filter rows containing fashion-related keywords in 'post_title' or 'post_content'
df_fashion = df[df['post_title'].str.contains(pattern, na=False) | df['post_content'].str.contains(pattern, na=False)]

# Remove rows with missing information in the specified columns
df_fashion_cleaned = df_fashion.dropna(subset=['post_date', 'post_like', 'post_collect', 'post_comments'])

# Save the updated DataFrame to a new CSV file
output_path = '/home/disk1/red_disk1/test/poster_test_fashion.csv'
df_fashion_cleaned.to_csv(output_path, index=False)

print("Filtered data has been saved to", output_path)


Filtered data has been saved to /home/disk1/red_disk1/test/poster_test_fashion.csv


Select the data from the all_data folder based on poster_id and post_id in poster_test_fashion.csv

In [3]:
import pandas as pd
import os
import shutil

# Define the file paths
csv_file_path = '/home/disk1/red_disk1/test/poster_test_fashion.csv'
all_data_folder = '/home/disk1/red_disk1/all_data'
output_folder = '/home/disk1/red_disk1/test/data'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Iterate over each unique poster_id
for poster_id in df['poster_id'].unique():
    poster_id = str(poster_id)
    
    # Define the source folder path
    source_poster_folder = os.path.join(all_data_folder, poster_id)
    
    # Define the destination folder path
    dest_poster_folder = os.path.join(output_folder, poster_id)
    
    # Check if the source folder exists
    if os.path.exists(source_poster_folder):
        # Copy the entire poster_id folder to the destination
        shutil.copytree(source_poster_folder, dest_poster_folder, dirs_exist_ok=True)
        print(f"Copied all data for poster_id {poster_id}")
    else:
        print(f"Source folder for poster_id {poster_id} does not exist")

print("Data selection completed.")


Copied all data for poster_id 58ce075f5e87e757f8ea60c9
Copied all data for poster_id 57a346a67fc5b869366dd8f9
Copied all data for poster_id 5958d3d56a6a691265623f01
Copied all data for poster_id 5c454bb900000000070116a5
Copied all data for poster_id 5b9f6f33b2e7dc000145640f
Copied all data for poster_id 5b3c364911be10556aeb83ef
Copied all data for poster_id 579dc04982ec39466917e9c6
Copied all data for poster_id 58c116366a6a69510d4d5fca
Copied all data for poster_id 5a20fd8e4eacab7ff7591319
Copied all data for poster_id 5c54fd6d000000001803403f
Copied all data for poster_id 5b2b31fe6b58b76236e580ca
Copied all data for poster_id 59e0b13c20e88f68e8af29a0
Copied all data for poster_id 5a898d2ce8ac2b3ae82f7ee7
Copied all data for poster_id 55bd601267bc653fea5862da
Copied all data for poster_id 5483c528d6e4a91b71c94320
Copied all data for poster_id 595989005e87e7786f165159
Copied all data for poster_id 5945698250c4b465df6fac5d
Copied all data for poster_id 5b8a337bba87a80001c85604
Copied all

Count the number of poster_id directories in the all_data or test/data folder:

In [5]:
import os

# Define the path to the all_data folder
# all_data_folder = '/home/disk1/red_disk1/all_data'
all_data_folder = '/home/disk1/red_disk1/test/data'

# Get a list of all items in the all_data folder
all_items = os.listdir(all_data_folder)

# Filter the list to include only directories (which represent poster_id)
poster_ids = [item for item in all_items if os.path.isdir(os.path.join(all_data_folder, item))]

# Count the number of poster_id directories
num_poster_ids = len(poster_ids)

# print(f"There are {num_poster_ids} poster_id directories in the all_data folder.")
print(f"There are {num_poster_ids} poster_id directories in the test/data folder.")


There are 126 poster_id directories in the test/data folder.
