In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!gdown --id 1aA9o8PJ-9gYAcLaaxLmXUrxlrHtArlrh -O Dataset_Misinfo_Fake.zip
!gdown --id 1BhaOQU5wYDL8IxOzgvZM-IlgYfJf3HFD -O Dataset_Misinfo_True.zip

!unzip Dataset_Misinfo_Fake.zip
!unzip Dataset_Misinfo_True.zip

Downloading...
From (original): https://drive.google.com/uc?id=1aA9o8PJ-9gYAcLaaxLmXUrxlrHtArlrh
From (redirected): https://drive.google.com/uc?id=1aA9o8PJ-9gYAcLaaxLmXUrxlrHtArlrh&confirm=t&uuid=8fe92308-8733-4a7a-9be2-9688a235787d
To: /content/Dataset_Misinfo_Fake.zip
100% 45.1M/45.1M [00:01<00:00, 26.8MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1BhaOQU5wYDL8IxOzgvZM-IlgYfJf3HFD
From (redirected): https://drive.google.com/uc?id=1BhaOQU5wYDL8IxOzgvZM-IlgYfJf3HFD&confirm=t&uuid=751abef0-206e-4afb-a773-06f05bef8b5c
To: /content/Dataset_Misinfo_True.zip
100% 43.0M/43.0M [00:00<00:00, 44.3MB/s]
Archive:  Dataset_Misinfo_Fake.zip
  inflating: DataSet_Misinfo_FAKE.csv  
Archive:  Dataset_Misinfo_True.zip
  inflating: DataSet_Misinfo_TRUE.csv  


In [None]:
# Load the two CSVs extracted by the download cell (paths assume the /content working directory).
true_df = pd.read_csv('/content/DataSet_Misinfo_TRUE.csv')
fake_df = pd.read_csv('/content/DataSet_Misinfo_FAKE.csv')

In [None]:
true_df.head()

Unnamed: 0.1,Unnamed: 0,text
0,0,The head of a conservative Republican faction ...
1,1,Transgender people will be allowed for the fir...
2,2,The special counsel investigation of links bet...
3,3,Trump campaign adviser George Papadopoulos tol...
4,4,President Donald Trump called on the U.S. Post...


In [None]:
fake_df.head()

Unnamed: 0.1,Unnamed: 0,text
0,0,Donald Trump just couldn t wish all Americans ...
1,1,House Intelligence Committee Chairman Devin Nu...
2,2,"On Friday, it was revealed that former Milwauk..."
3,3,"On Christmas day, Donald Trump announced that ..."
4,4,Pope Francis used his annual Christmas Day mes...


In [None]:
# Add labels to indicate real and fake news (1 = true article, 0 = fake article)
true_df['label'] = 1
fake_df['label'] = 0

# Concatenate both datasets. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; without it every label from the fake file repeats a label already
# used by the true file, which makes any later label-based lookup ambiguous.
data = pd.concat([true_df, fake_df], ignore_index=True)
data.head()


Unnamed: 0.1,Unnamed: 0,text,label
0,0,The head of a conservative Republican faction ...,1
1,1,Transgender people will be allowed for the fir...,1
2,2,The special counsel investigation of links bet...,1
3,3,Trump campaign adviser George Papadopoulos tol...,1
4,4,President Donald Trump called on the U.S. Post...,1


In [None]:
# Data-quality snapshot before cleaning: frame size, null count per column,
# and how many rows repeat an earlier row's article text.
print(data.shape)
print("\nMissing values:")
print(data.isnull().sum())
print("\nDuplicates count: ", data.duplicated(subset=['text']).sum())

(78617, 3)

Missing values:
Unnamed: 0     0
text          29
label          0
dtype: int64

Duplicates count:  10012


In [None]:
# Remove rows without article text, then print the remaining null counts as a check.
data.dropna(subset=['text'], inplace=True)
print(data.isnull().sum())

# Remove repeated article texts. With the default keep='first', a text that appears in
# both source files keeps its row from the true file, because true_df is concatenated first.
data.drop_duplicates(subset=['text'], inplace=True)
print(data.duplicated(subset=['text']).sum())

Unnamed: 0    0
text          0
label         0
dtype: int64
0


In [None]:
# Assign a 1-based sequential id in the current row order (after cleaning), so that
# id k always sits at positional row k - 1; the labelling driver relies on this mapping.
data['id'] = np.arange(1, len(data) + 1)
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,id
0,0,The head of a conservative Republican faction ...,1,1
1,1,Transgender people will be allowed for the fir...,1,2
2,2,The special counsel investigation of links bet...,1,3
3,3,Trump campaign adviser George Papadopoulos tol...,1,4
4,4,President Donald Trump called on the U.S. Post...,1,5


In [None]:
# Fetch the article text stored under one specific id (single .loc selection: row mask + column).
text_for_id = data.loc[data['id'] == 68578, 'text'].iloc[0]
text_for_id

'NATO is developing a new plan of military operations in Europe that involves countering mythical "Russian aggression", the rapid deployment of troops from all over the continent and direct military support from the United States. Although Moscow has no plans to attack Europe, it is compelled to take defensive measures in light of NATO’s eastward expansion.'

### EDA

In [None]:
import pandas as pd
import requests
import time
import json
from typing import Dict, Any, Optional, List
import os
import numpy as np
import random

# Placeholder committed in place of a real key. It is not a credential and must never
# be sent to the API as a bearer token.
_API_KEY_PLACEHOLDER: str = 'MISTRAL_API_KEY'

# A real key may be pasted here for a local experiment, but prefer exporting the
# MISTRAL_API_KEY environment variable so the secret never lands in the notebook.
MISTRAL_API_KEY: Optional[str] = _API_KEY_PLACEHOLDER

# Fall back to the environment when the value is empty OR still the placeholder.
# (The placeholder is a non-empty string, so a plain truthiness test would accept it
# and the notebook would make real requests with a bogus key.)
if not MISTRAL_API_KEY or MISTRAL_API_KEY.strip() == _API_KEY_PLACEHOLDER:
    MISTRAL_API_KEY = os.getenv('MISTRAL_API_KEY') or None

API_MODEL_NAME: str = "pixtral-12b-2409"
API_ENDPOINT: str = "https://api.mistral.ai/v1/chat/completions"

# Without a usable key the classifier runs offline with keyword matching instead of HTTP calls.
IS_MOCK_MODE: bool = not MISTRAL_API_KEY
if IS_MOCK_MODE:
    print(f"WARN: MISTRAL_API_KEY is not set. Running in MOCK MODE.")
    print(f"WARN: To use real API, set MISTRAL_API_KEY = 'your_actual_key' in the script")
    print(f"WARN: Or set environment variable: export MISTRAL_API_KEY='your_actual_key'")
else:
    print("INFO: Running with provided MISTRAL_API_KEY. Actual API calls will be made.")

# process_range prints a progress line every this-many rows.
PROCESSING_CHUNK_LOG_INTERVAL = 100
# Seconds waited before the first retry after a timeout or HTTP 500/502/503; doubled after each retry.
INITIAL_RETRY_DELAY = 10
# Maximum number of POST attempts made for a single article.
MAX_RETRIES = 7
# Base pause (seconds) between consecutive requests; it is passed through add_jitter and
# only slept when the jittered value exceeds 0.1, so 0 means "no pause".
BASE_API_DELAY = 0

# Base wait (seconds) for HTTP 429: the k-th rate-limit hit for an article waits
# RATE_LIMIT_DELAY * 2**k (k starts at 1), capped by EXPONENTIAL_BACKOFF_MAX.
RATE_LIMIT_DELAY = 10
# Upper bound (seconds) applied to every backoff delay before jitter is added.
EXPONENTIAL_BACKOFF_MAX = 60
# Jitter amplitude as a fraction of the delay: 0.05 means +/-5%.
JITTER_RANGE = 0.05

# Name of the column that receives the predicted category.
output_column_name = 'subject_category'
# Name of the column holding each article's id; it is written to the result CSV.
ID_COLUMN_NAME = 'id'

def add_jitter(delay: float) -> float:
    """Return ``delay`` shifted by a random fraction of itself.

    The shift is ``delay * JITTER_RANGE * u`` with ``u`` uniform in [-1, 1), i.e. up to
    +/-JITTER_RANGE of the delay (5% with the value configured in this notebook), so that
    retries do not fire in lock-step. The result is never smaller than 0.1 seconds.
    """
    signed_unit = 2 * random.random() - 1  # uniform in [-1, 1)
    offset = delay * JITTER_RANGE * signed_unit
    jittered = delay + offset
    return max(0.1, jittered)

def get_prompt(article_text: str) -> str:
    """Build the single-label classification prompt for one article.

    Args:
        article_text: The article body; it is converted with ``str()`` first.

    Returns:
        A prompt that lists the allowed categories, asks for exactly one category name
        and embeds the article between two lines of triple double-quotes.
    """
    allowed_categories = (
        'politics', 'government', 'usNews', 'worldNews', 'middleEastNews',
        'technology', 'science', 'health', 'business', 'finance', 'sports',
        'entertainment', 'propaganda', 'socialIssues', 'environment',
        'education', 'crime', 'legal', 'other',
    )
    quoted_categories = ", ".join("'" + name + "'" for name in allowed_categories)

    fence = '"""'
    # The article is fenced with triple double-quotes, so any occurrence inside the text is
    # swapped for triple single-quotes to keep the fence unambiguous.
    safe_text = str(article_text).replace(fence, "'''")

    prompt_lines = [
        "Analyze the following news article snippet and classify its main subject into **exactly one** of the following categories:",
        quoted_categories + ".",
        "",
        "Choose the single most relevant category from the list. Your response should be ONLY the category name, nothing else.",
        "",
        "Article Snippet:",
        fence,
        safe_text,
        fence,
        "",
        "Category:",
    ]
    return "\n".join(prompt_lines)

# Ordered (category, keywords) rules for the offline mock classifier. The first rule with
# any keyword occurring as a SUBSTRING of the lower-cased article wins, so order matters.
# NOTE(review): substring matching is coarse (e.g. 'ai' also matches "said"); that is the
# existing mock behaviour and is kept as-is.
_MOCK_KEYWORD_RULES = (
    ("politics", ('election', 'vote', 'candidate', 'republican', 'democrat', 'senate', 'congress', 'political')),
    ("government", ('government', 'federal', 'administration', 'policy', 'official', 'agency')),
    ("usNews", ('united states', 'america', 'u.s.', 'american', 'domestic')),
    ("worldNews", ('international', 'world', 'global', 'foreign', 'country', 'nation')),
    ("middleEastNews", ('middle east', 'israel', 'palestine', 'syria', 'iraq', 'iran')),
    ("technology", ('technology', 'tech', 'software', 'computer', 'digital', 'ai', 'artificial intelligence')),
    ("science", ('science', 'research', 'study', 'scientist', 'discovery')),
    ("health", ('health', 'medical', 'hospital', 'doctor', 'disease', 'vaccine')),
    ("business", ('business', 'company', 'corporate', 'market', 'economy', 'economic')),
    ("finance", ('finance', 'financial', 'money', 'bank', 'stock', 'investment')),
    ("sports", ('sports', 'game', 'team', 'player', 'championship', 'athletic')),
    ("entertainment", ('entertainment', 'movie', 'music', 'celebrity', 'hollywood', 'film')),
    ("propaganda", ('propaganda', 'misinformation', 'fake news')),
    ("socialIssues", ('social', 'community', 'rights', 'equality', 'discrimination', 'protest')),
    ("environment", ('environment', 'climate', 'pollution', 'green', 'sustainability')),
    ("education", ('education', 'school', 'university', 'student', 'teacher')),
    ("crime", ('crime', 'criminal', 'police', 'arrest', 'investigation')),
    ("legal", ('legal', 'law', 'court', 'judge', 'lawsuit', 'attorney')),
)


def _mock_category(article_text: str) -> str:
    """Return the category of the first mock rule matching the text, or 'other'."""
    lowered = article_text.lower()
    for category, keywords in _MOCK_KEYWORD_RULES:
        if any(keyword in lowered for keyword in keywords):
            return category
    return "other"


def _extract_message_content(api_response_json) -> Optional[str]:
    """Return choices[0].message.content stripped, or None if the structure is not as expected."""
    choices = api_response_json.get("choices")
    if not (choices and isinstance(choices, list)):
        return None
    first_choice = choices[0]
    if not (first_choice and isinstance(first_choice, dict)):
        return None
    message = first_choice.get("message")
    if not (message and isinstance(message, dict)):
        return None
    content = message.get("content")
    if content and isinstance(content, str):
        return content.strip()
    return None


def get_subject_from_api(article_text: str, article_index_info: str) -> str:
    """Classify one article and return the category, or an error marker string.

    In mock mode (IS_MOCK_MODE) no request is made; a keyword table decides the category.
    Otherwise the prompt from get_prompt is POSTed to API_ENDPOINT, retrying up to
    MAX_RETRIES attempts on HTTP 429, HTTP 500/502/503 and timeouts with capped,
    jittered exponential backoff. No backoff sleep is performed after the final attempt.

    Args:
        article_text: Article body to classify.
        article_index_info: Human-readable row description, used only in log lines and
            in the parse-error marker.

    Returns:
        The stripped model reply on success, otherwise one of:
        - "N/A - Empty Input Text" for blank input;
        - "Error: Could not parse category ..." when the reply has an unexpected shape;
        - "API Error: ... (Max Retries)" when every attempt failed with a retryable error;
        - "... (Critical)" for non-retryable HTTP errors, request failures and
          unexpected exceptions.
        The function does not raise; callers detect failure through these markers.
    """
    if not article_text.strip():
        return "N/A - Empty Input Text"

    if IS_MOCK_MODE:
        time.sleep(0.01)
        return _mock_category(article_text)

    prompt = get_prompt(article_text)

    payload: Dict[str, Any] = {
        "model": API_MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.1,
        "max_tokens": 20,
        "top_p": 0.9
    }

    headers: Dict[str, str] = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {MISTRAL_API_KEY}'
    }

    current_retry_delay = INITIAL_RETRY_DELAY
    # Number of 429 responses seen so far for this article; drives the rate-limit backoff.
    consecutive_rate_limits = 0

    for attempt in range(MAX_RETRIES):
        # On the last attempt there is nothing left to wait for, so give up immediately
        # instead of sleeping a full backoff interval first.
        is_last_attempt = attempt == MAX_RETRIES - 1
        try:
            response = requests.post(API_ENDPOINT, json=payload, headers=headers, timeout=60)
            response.raise_for_status()

            api_response_json: Dict[str, Any] = response.json()

            content = _extract_message_content(api_response_json)
            if content is not None:
                return content

            error_msg = f"Error: Could not parse category (Article: {article_index_info}). Unexpected API structure."
            print(f"WARN: {error_msg} Response: {json.dumps(api_response_json, indent=2)}")
            return error_msg

        except requests.exceptions.HTTPError as http_err:
            status_code = http_err.response.status_code

            if status_code == 429:
                consecutive_rate_limits += 1

                if is_last_attempt:
                    print(f"ERROR: Max retries reached for Article {article_index_info} due to rate limiting.")
                    return "API Error: HTTP 429 Rate Limit (Max Retries)"

                rate_limit_delay = min(
                    RATE_LIMIT_DELAY * (2 ** consecutive_rate_limits),
                    EXPONENTIAL_BACKOFF_MAX
                )
                rate_limit_delay = add_jitter(rate_limit_delay)

                print(f"WARN: Rate limit hit for Article {article_index_info}. Attempt {attempt + 1}/{MAX_RETRIES}. "
                      f"Waiting {rate_limit_delay:.1f}s (consecutive rate limits: {consecutive_rate_limits})...")
                time.sleep(rate_limit_delay)

            elif status_code in (500, 502, 503):  # Transient server errors
                if is_last_attempt:
                    print(f"ERROR: Max retries reached for Article {article_index_info}. HTTP {status_code}.")
                    return f"API Error: HTTP {status_code} (Max Retries)"

                print(f"WARN: HTTP {status_code} for Article {article_index_info}. "
                      f"Attempt {attempt + 1}/{MAX_RETRIES}. Retrying in {current_retry_delay}s...")
                time.sleep(add_jitter(current_retry_delay))
                current_retry_delay = min(current_retry_delay * 2, EXPONENTIAL_BACKOFF_MAX)

            else:
                # Any other status (e.g. 401/400) will not improve on retry.
                try:
                    error_text = http_err.response.text[:200]
                except Exception:
                    error_text = "Could not read error response"
                print(f"ERROR: HTTPError {status_code} for Article {article_index_info}: {error_text}")
                return f"API Error: HTTP {status_code} (Critical)"

        except requests.exceptions.Timeout:
            if is_last_attempt:
                print(f"ERROR: Max retries reached for Article {article_index_info} due to Timeout.")
                return "API Error: Timeout (Max Retries)"

            print(f"WARN: Timeout for Article {article_index_info}. Attempt {attempt + 1}/{MAX_RETRIES}. "
                  f"Retrying in {current_retry_delay}s...")
            time.sleep(add_jitter(current_retry_delay))
            current_retry_delay = min(current_retry_delay * 2, EXPONENTIAL_BACKOFF_MAX)

        except requests.exceptions.RequestException as req_err:
            print(f"ERROR: RequestException for Article {article_index_info}: {req_err}")
            return "API Error: Request Failed (Critical)"

        except Exception as e:
            print(f"ERROR: Unexpected error for Article {article_index_info}: {e} (Type: {type(e).__name__})")
            return f"Error: Unexpected ({type(e).__name__}) (Critical)"

    # Only reachable when MAX_RETRIES <= 0 (the loop body never ran).
    return "API Error: Max Retries Reached (Exited Loop) (Critical)"

def process_range(df_full: pd.DataFrame, start_row_idx: int, end_row_idx: int, start_id: int, end_id: int):
    """
    Classify the rows of ``df_full`` at positions [start_row_idx, end_row_idx) and save the result.

    A copy of the slice is processed; ``df_full`` itself is never modified. Rows that already
    hold a category are skipped, while earlier error / "N/A" markers are cleared and retried.
    Processing stops early when get_subject_from_api returns a "(Max Retries)" or
    "(Critical)" marker. The ID column and the category column of the whole slice are then
    written to ``subjects_id_{start_id}_to_{end_id}.csv`` in the working directory.

    start_row_idx: 0-based integer start position (inclusive)
    end_row_idx: 0-based integer end position (exclusive)
    start_id: ID value of the first row to process (used for logging and the file name only)
    end_id: ID value of the last row to process (used for logging and the file name only)

    Returns None. Problems are reported with printed messages rather than exceptions.
    """
    if start_row_idx < 0 or end_row_idx > len(df_full) or start_row_idx >= end_row_idx:
        print(f"ERROR: Invalid row range. Start: {start_row_idx}, End: {end_row_idx}, Total Rows: {len(df_full)}")
        return

    # Fail fast: the result file is keyed by the ID column, so without it nothing could be
    # saved. Checking here avoids classifying every row only to discard the work.
    if ID_COLUMN_NAME not in df_full.columns:
        print(f"ERROR: ID column '{ID_COLUMN_NAME}' not found in DataFrame slice.")
        return

    df_slice = df_full.iloc[start_row_idx:end_row_idx].copy()

    if output_column_name not in df_slice.columns:
        df_slice[output_column_name] = pd.NA

    def _clear_failure_marker(value):
        """Map stored error / empty-text markers back to NA so those rows are retried."""
        if isinstance(value, str) and (
            value.startswith(("Error:", "API Error:"))
            or value in ("N/A - Empty Text", "N/A - Empty Input Text")
        ):
            return pd.NA
        return value

    df_slice[output_column_name] = df_slice[output_column_name].apply(_clear_failure_marker)

    try:
        slice_text_col_idx = df_slice.columns.get_loc('text')
        slice_output_col_idx = df_slice.columns.get_loc(output_column_name)
        slice_id_col_idx = df_slice.columns.get_loc(ID_COLUMN_NAME)
    except KeyError as e:
        print(f"ERROR: Required column missing in the slice: {e}")
        return

    num_rows_in_range = len(df_slice)
    print(f"\nProcessing range: IDs {start_id} to {end_id} (Total in this range: {num_rows_in_range})")

    stop_processing_flag = False
    processed_in_this_range = 0

    for i in range(num_rows_in_range):
        actual_df_index_label = df_slice.index[i]
        actual_df_row_number = start_row_idx + i + 1

        # Actual ID value stored in the frame (start_id/end_id are only labels).
        id_value = df_slice.iloc[i, slice_id_col_idx]

        if (processed_in_this_range % PROCESSING_CHUNK_LOG_INTERVAL == 0) or PROCESSING_CHUNK_LOG_INTERVAL == 1:
            print(f"\nProcessing item {processed_in_this_range + 1}/{num_rows_in_range} in current range (ID: {id_value})...")

        current_category_scalar = df_slice.iloc[i, slice_output_col_idx]
        if pd.notna(current_category_scalar):
            print(f"ID {id_value}: Already processed with '{current_category_scalar}'. Skipping.")
            processed_in_this_range += 1
            continue

        article_text_scalar_val = df_slice.iloc[i, slice_text_col_idx]
        article_index_info = f"ID {id_value} (Row {actual_df_row_number}, Index Label: {actual_df_index_label})"

        if pd.isna(article_text_scalar_val) or not str(article_text_scalar_val).strip():
            print(f"{article_index_info}: Text is empty/NaN. Marking 'N/A - Empty Text'.")
            df_slice.iloc[i, slice_output_col_idx] = "N/A - Empty Text"
        else:
            extracted_category = get_subject_from_api(str(article_text_scalar_val), article_index_info)
            df_slice.iloc[i, slice_output_col_idx] = extracted_category
            print(f"{article_index_info}: Extracted: '{extracted_category}'")

            if "(Max Retries)" in extracted_category or "(Critical)" in extracted_category:
                print(f"CRITICAL ERROR on {article_index_info}. Stopping processing for this range.")
                stop_processing_flag = True

        processed_in_this_range += 1

        if stop_processing_flag:
            break

        # Optional pacing between real API calls; skipped after an API error because the
        # retry logic has already waited.
        if not IS_MOCK_MODE:
            stored_value = df_slice.iloc[i, slice_output_col_idx]
            if not (isinstance(stored_value, str) and stored_value.startswith("API Error:")):
                delay = add_jitter(BASE_API_DELAY)
                if delay > 0.1:
                    print(f"Waiting {delay:.1f}s before next request...")
                    time.sleep(delay)

    range_output_filename = f"subjects_id_{start_id}_to_{end_id}.csv"
    try:
        df_to_save = df_slice[[ID_COLUMN_NAME, output_column_name]].copy()

        df_to_save.to_csv(range_output_filename, index=False, encoding='utf-8')

        status_msg = "STOPPED_DUE_TO_ERROR" if stop_processing_flag else "COMPLETED_RANGE"
        print(f"\n--- Processing for ID range {start_id}-{end_id} {status_msg}. ---")
        print(f"{processed_in_this_range} rows attempted in this range. Results saved to '{range_output_filename}'")
        print(f"CSV file contains columns: {list(df_to_save.columns)}")
        print(f"ID range in saved file: {df_to_save[ID_COLUMN_NAME].min()} to {df_to_save[ID_COLUMN_NAME].max()}")

    except Exception as e:
        print(f"ERROR: Could not save results for ID range {start_id}-{end_id} to '{range_output_filename}': {e}")

if __name__ == "__main__":
    # Driver for the labelling run. Validation failures raise SystemExit instead of calling
    # exit(): inside a notebook exit() shuts the kernel down (losing `data`), whereas
    # SystemExit only aborts the current cell.
    if not isinstance(globals().get('data'), pd.DataFrame):
        print("ERROR: DataFrame 'data' not found. Load it before running.")
        raise SystemExit(1)
    print(f"INFO: Using pre-existing DataFrame 'data' with {len(data)} rows.")

    if 'text' not in data.columns:
        print("ERROR: Your DataFrame 'data' must have a 'text' column.")
        raise SystemExit(1)
    if ID_COLUMN_NAME and ID_COLUMN_NAME not in data.columns:
        print(f"ERROR: Specified ID_COLUMN_NAME '{ID_COLUMN_NAME}' not found in 'data' DataFrame.")
        print(f"Available columns: {data.columns.tolist()}")
        print("If you want to use the DataFrame index, set ID_COLUMN_NAME = None.")
        raise SystemExit(1)

    # First ID to process. IDs are assumed to be 1..len(data) in row order, so the ID maps
    # to a 0-based row position by subtracting one.
    start_id_value = 1

    max_id = len(data)
    if start_id_value < 1 or start_id_value > max_id:
        print(f"ERROR: ID value {start_id_value} is out of range.")
        print(f"Valid ID range: 1 to {max_id}")
        raise SystemExit(1)

    manual_start_row_position = start_id_value - 1

    print(f"INFO: Requested start from ID value {start_id_value}, which is at DataFrame position {manual_start_row_position}.")

    # Upper bound on the number of rows handled in this run; the end position is clamped
    # to the frame length below.
    num_rows_to_process_manually = 68604
    manual_end_row_position = min(manual_start_row_position + num_rows_to_process_manually, len(data))

    start_id_actual = manual_start_row_position + 1
    end_id_actual = manual_end_row_position

    # The range check above guarantees the start position lies inside the frame, so there
    # is always at least one row to hand to process_range.
    if IS_MOCK_MODE:
        print(f"INFO: Running in MOCK MODE with enhanced keyword matching...")
    else:
        print(f"INFO: Starting processing with Mistral API (Pixtral-12B)...")
        print(f"INFO: Base delay between requests: {BASE_API_DELAY}s (with jitter)")
        print(f"INFO: Rate limit retry delay starts at: {RATE_LIMIT_DELAY}s (exponential backoff)")

    print(f"INFO: Processing ID range {start_id_actual} to {end_id_actual}")
    process_range(data, manual_start_row_position, manual_end_row_position, start_id_actual, end_id_actual)

    print("\n--- Script Finished ---")

INFO: Running with provided MISTRAL_API_KEY. Actual API calls will be made.
INFO: Using pre-existing DataFrame 'data' with 68604 rows.
INFO: Requested start from ID value 7103, which is at DataFrame position 7102.
INFO: Starting processing with Mistral API (Pixtral-12B)...
INFO: Base delay between requests: 0s (with jitter)
INFO: Rate limit retry delay starts at: 10s (exponential backoff)
INFO: Processing ID range 7103 to 10000

Processing range: IDs 7103 to 10000 (Total in this range: 2898)

Processing item 1/2898 in current range (ID: 7103)...
ID 7103 (Row 7103, Index Label: 7162): Extracted: 'politics'
ID 7104 (Row 7104, Index Label: 7163): Extracted: 'politics'
ID 7105 (Row 7105, Index Label: 7164): Extracted: 'politics'
ID 7106 (Row 7106, Index Label: 7165): Extracted: 'politics'
ID 7107 (Row 7107, Index Label: 7166): Extracted: 'politics'
ID 7108 (Row 7108, Index Label: 7167): Extracted: 'legal'
ID 7109 (Row 7109, Index Label: 7168): Extracted: 'politics'
ID 7110 (Row 7110, Index

In [None]:
!gdown --id 12YdHCYSO5Jj-smZIW0bVN2WHBHwOamR2 -O subjects_for_dataset.zip
!unzip subjects_for_dataset.zip

Downloading...
From: https://drive.google.com/uc?id=12YdHCYSO5Jj-smZIW0bVN2WHBHwOamR2
To: /content/subjects_for_dataset.zip
100% 168k/168k [00:00<00:00, 69.2MB/s]
Archive:  subjects_for_dataset.zip
   creating: subjects_for_dataset/
  inflating: subjects_for_dataset/subjects_id_10001_to_20000.csv  
  inflating: subjects_for_dataset/subjects_id_1_to_10000.csv  
  inflating: subjects_for_dataset/subjects_id_20001_to_25000.csv  
  inflating: subjects_for_dataset/subjects_id_25001_to_35000.csv  
  inflating: subjects_for_dataset/subjects_id_35001_to_45000.csv  
  inflating: subjects_for_dataset/subjects_id_45001_to_50000.csv  
  inflating: subjects_for_dataset/subjects_id_50001_to_53000.csv  
  inflating: subjects_for_dataset/subjects_id_53001_to_56000.csv  
  inflating: subjects_for_dataset/subjects_id_56001_to_60000.csv  
  inflating: subjects_for_dataset/subjects_id_60001_to_68604.csv  


In [None]:
import os
import glob
import re
import csv

# Directory holding the per-range result files.
SOURCE_DIRECTORY = "/content/subjects_for_dataset"
# Glob pattern selecting the files to merge inside SOURCE_DIRECTORY.
FILE_PATTERN = "subjects_id_*.csv"
# Path of the merged CSV (relative to the working directory).
OUTPUT_FILE = "concatenated_subjects.csv"

# Captures the start id (group 1) and end id (group 2) encoded in a result file name.
FILENAME_REGEX = r"subjects_id_(\d+)_to_(\d+)\.csv"

def get_start_id_from_filename(filename):
    """Extract the start ID encoded in a result file name.

    Only the base name of ``filename`` is searched with FILENAME_REGEX. Returns the first
    captured number as an int, or ``float('inf')`` when the name does not match, which
    places such files last when the value is used as a sort key.
    """
    base_name = os.path.basename(filename)
    found = re.search(FILENAME_REGEX, base_name)
    return float('inf') if found is None else int(found.group(1))

def main():
    """Merge every per-range result CSV into OUTPUT_FILE.

    Files matching FILE_PATTERN inside SOURCE_DIRECTORY are ordered by the start ID in
    their name and appended one after another. The header row is written once, taken from
    the first file that has one. A completely empty file is reported and skipped, and an
    error while reading one file is reported without aborting the remaining files. The
    final message counts only the files that were actually merged.
    """
    full_pattern_path = os.path.join(SOURCE_DIRECTORY, FILE_PATTERN)
    csv_files = glob.glob(full_pattern_path)

    if not csv_files:
        print(f"Không tìm thấy file CSV nào trong thư mục '{SOURCE_DIRECTORY}' khớp với mẫu '{FILE_PATTERN}'.")
        return

    csv_files.sort(key=get_start_id_from_filename)

    print(f"Tìm thấy {len(csv_files)} file để nối:")
    for f_path in csv_files:
        print(f"  - {os.path.basename(f_path)}")

    header_written = False
    merged_count = 0  # files whose content was appended without error

    with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as outfile:
        csv_writer = csv.writer(outfile)

        for filepath in csv_files:
            file_name = os.path.basename(filepath)
            try:
                with open(filepath, 'r', newline='', encoding='utf-8') as infile:
                    csv_reader = csv.reader(infile)

                    # A file with no rows at all has no header to consume.
                    header = next(csv_reader, None)
                    if header is None:
                        print(f"Cảnh báo: File {file_name} trống hoặc chỉ chứa tiêu đề. Bỏ qua phần thân file.")
                        continue

                    if not header_written:
                        csv_writer.writerow(header)
                        header_written = True

                    csv_writer.writerows(csv_reader)
                merged_count += 1
                print(f"Đã xử lý: {file_name}")
            except Exception as e:
                print(f"Lỗi khi xử lý file {file_name}: {e}")

    if header_written:
        print(f"\nĐã nối thành công {merged_count} file vào '{OUTPUT_FILE}'.")
    else:
        print(f"\nKhông có dữ liệu nào được ghi vào '{OUTPUT_FILE}'. Vui lòng kiểm tra các file đầu vào của bạn.")

if __name__ == "__main__":
    main()

Tìm thấy 10 file để nối:
  - subjects_id_1_to_10000.csv
  - subjects_id_10001_to_20000.csv
  - subjects_id_20001_to_25000.csv
  - subjects_id_25001_to_35000.csv
  - subjects_id_35001_to_45000.csv
  - subjects_id_45001_to_50000.csv
  - subjects_id_50001_to_53000.csv
  - subjects_id_53001_to_56000.csv
  - subjects_id_56001_to_60000.csv
  - subjects_id_60001_to_68604.csv
Đã xử lý: subjects_id_1_to_10000.csv
Đã xử lý: subjects_id_10001_to_20000.csv
Đã xử lý: subjects_id_20001_to_25000.csv
Đã xử lý: subjects_id_25001_to_35000.csv
Đã xử lý: subjects_id_35001_to_45000.csv
Đã xử lý: subjects_id_45001_to_50000.csv
Đã xử lý: subjects_id_50001_to_53000.csv
Đã xử lý: subjects_id_53001_to_56000.csv
Đã xử lý: subjects_id_56001_to_60000.csv
Đã xử lý: subjects_id_60001_to_68604.csv

Đã nối thành công 10 file vào 'concatenated_subjects.csv'.


In [None]:
# Load the merged per-article categories written by the concatenation cell.
subjects = pd.read_csv("/content/concatenated_subjects.csv")
subjects.head()

Unnamed: 0,id,subject_category
0,1,politics
1,2,socialIssues
2,3,politics
3,4,politics
4,5,business
