In [2]:
import pandas as pd
import re
from collections import Counter

# Paths to all CSV files you want to analyze
csv_files = [
    "gulf_arabic_transcripts_3_13_25.csv",
    "gulf_arabic_transcripts_3_13_25_with_transcript.csv",
    "egypt_arabic_transcripts_3_13_25_with_transcript.csv",
    "levant_output_with_ilr_3_12_25.csv",
    "levant_with_ilr_3_10_25_edit.csv",
    "maghrebi_arabic_transcripts_3_13_25_with_ilr.csv"
]

# Column you believe holds the transcripts/text
text_column = "transcript"

all_words = []

for file_path in csv_files:
    print(f"Reading {file_path} ...")
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        continue
    
    if text_column not in df.columns:
        print(f"'{text_column}' column not found in {file_path}, skipping.")
        continue
    
    # Collect text from the transcript column
    for text in df[text_column].dropna():
        # Convert to string in case of weird data types
        text = str(text)
        
        # Tokenize words by splitting on non-letter characters
        # This handles English or Arabic letters, ignoring punctuation, numbers, etc.
        tokens = re.findall(r"[a-zA-Z]+|[ء-ي]+", text, flags=re.IGNORECASE)
        
        # Add these tokens to our master list
        all_words.extend(tokens)

# 2. Count the frequency of each unique word
# Convert all words to a consistent case first (lowercase)
all_words_lower = [w.lower() for w in all_words]
word_counts = Counter(all_words_lower)

# 3. Build a DataFrame of word -> frequency
word_freq_df = pd.DataFrame(word_counts.items(), columns=["word", "frequency"])

# 4. Sort by frequency descending
word_freq_df.sort_values("frequency", ascending=False, inplace=True)

# 5. Write out to a CSV so you can inspect or edit
output_path = "offensive_candidate_words.csv"
word_freq_df.to_csv(output_path, index=False)
print(f"Word frequency list saved to {output_path}")


Reading gulf_arabic_transcripts_3_13_25.csv ...
Error reading gulf_arabic_transcripts_3_13_25.csv: [Errno 2] No such file or directory: 'gulf_arabic_transcripts_3_13_25.csv'
Reading gulf_arabic_transcripts_3_13_25_with_transcript.csv ...
Reading egypt_arabic_transcripts_3_13_25_with_transcript.csv ...
Reading levant_output_with_ilr_3_12_25.csv ...
Reading levant_with_ilr_3_10_25_edit.csv ...
Reading maghrebi_arabic_transcripts_3_13_25_with_ilr.csv ...
Word frequency list saved to offensive_candidate_words.csv


In [6]:
borderline_offensive_terms_arabic = [
    'اغتصاب', 'تحرش', 'تحقير', 'عاهرة', 'زنا', 'كافر', 'ديوث', 'حيوان', 'حمار',
    'كلب', 'وسخ', 'قذر', 'مجنون', 'غبي', 'تافه', 'أحمق', 'معتوه', 'فاسق', 'فاجر',
    'حقير', 'متخلف', 'زق', 'زب', 'كس', 'فرج', 'نكاح', 'شرموطة', 'لوطي', 'ملعون',
    'متسول', 'سارق', 'خبيث', 'وقح', 'منافق', 'كسلان', 'جاهل', 'مزعج', 'فاشل',
    'بذيء', 'ثرثار', 'كذاب', 'متملق', 'متسلق', 'متغطرس', 'متعجرف'
]



# Define a function to filter out rows that contain offensive words in the 'transcript' column
def filter_offensive_rows(file_path, offensive_words):
    # Load the CSV
    df = pd.read_csv(file_path, encoding="utf-8")

    # Ensure 'transcript' column exists
    if "transcript" in df.columns:
        # Remove rows where transcript contains any offensive word
        df_cleaned = df[~df["transcript"].astype(str).apply(lambda x: any(word in x for word in offensive_words))]

        return df_cleaned
    else:
        return None  # Return None if no 'transcript' column found

# Process all uploaded files
file_paths = [
    "cleaned_arabic_meetings_youtube_videos_with_transcript_with_ilr_cleaned.csv",
    "cleaned_arabic_socialize_youtube_videos_with_transcript_with_ilr_cleaned.csv",
    "egypt_arabic_transcripts_3_13_25_with_transcript_cleaned.csv",
    "gulf_arabic_transcripts_3_13_25_with_transcript_cleaned.csv",
    "levant_output_with_ilr_3_12_25.csv_cleaned",
    "levant_output_with_ilr_3_12_25 (Copy)_cleaned.csv",
    "levant_with_ilr_3_10_25_edit_cleaned.csv",
    "maghrebi_arabic_transcripts_3_13_25_with_ilr_cleaned.csv"
]

# Dictionary to store cleaned data
cleaned_dfs = {}

for path in file_paths:
    cleaned_df = filter_offensive_rows(path, borderline_offensive_terms_arabic)
    if cleaned_df is not None:
        cleaned_dfs[path] = cleaned_df

# Save cleaned versions
for path, df_cleaned in cleaned_dfs.items():
    cleaned_file_path = path.replace(".csv", "_cleaned.csv")
    df_cleaned.to_csv(cleaned_file_path, index=False, encoding="utf-8")

# Display confirmation
list(cleaned_dfs.keys())


['cleaned_arabic_meetings_youtube_videos_with_transcript_with_ilr.csv',
 'cleaned_arabic_socialize_youtube_videos_with_transcript_with_ilr.csv',
 'egypt_arabic_transcripts_3_13_25_with_transcript.csv',
 'gulf_arabic_transcripts_3_13_25_with_transcript.csv',
 'levant_output_with_ilr_3_12_25.csv',
 'levant_output_with_ilr_3_12_25 (Copy).csv',
 'levant_with_ilr_3_10_25_edit.csv',
 'maghrebi_arabic_transcripts_3_13_25_with_ilr.csv']

In [9]:
# Define the specific title to remove
title_to_remove = "صدمة زوجية: عندما يدخل رجل غريب حياة زوجتك في قصص"
# Process all uploaded files

df = pd.read_csv(file_paths, encoding="utf-8")
file_paths = [
    "cleaned_arabic_meetings_youtube_videos_with_transcript_with_ilr_cleaned.csv",
    "cleaned_arabic_socialize_youtube_videos_with_transcript_with_ilr_cleaned.csv",
    "egypt_arabic_transcripts_3_13_25_with_transcript_cleaned.csv",
    "gulf_arabic_transcripts_3_13_25_with_transcript_cleaned.csv",
    "levant_output_with_ilr_3_12_25.csv_cleaned",
    "levant_output_with_ilr_3_12_25 (Copy)_cleaned.csv",
    "levant_with_ilr_3_10_25_edit_cleaned.csv",
    "maghrebi_arabic_transcripts_3_13_25_with_ilr_cleaned.csv"
]
# Function to remove a row based on title column
def remove_specific_title(file_paths, title):
    df = pd.read_csv(file_paths, encoding="utf-8")

    # Ensure 'title' column exists
    if "title" in df.columns:
        df_cleaned = df[df["title"] != title]
        return df_cleaned
    else:
        return None  # Return None if no 'title' column found

# Process all uploaded files to remove the specified title
cleaned_dfs_title = {}

for path in file_paths:
    cleaned_df = remove_specific_title(path, title_to_remove)
    if cleaned_df is not None:
        cleaned_dfs_title[path] = cleaned_df

# Save cleaned versions without the specific title
for path, df_cleaned in cleaned_dfs_title.items():
    cleaned_file_path = path.replace(".csv", "_title_removed.csv")
    df_cleaned.to_csv(cleaned_file_path, index=False, encoding="utf-8")

# Display confirmation of cleaned files
list(cleaned_dfs_title.keys())

ValueError: Invalid file path or buffer object type: <class 'list'>

In [15]:
import pandas as pd

# Load the CSV file
file_path = "spanish_youtube_3_19_25_with_ilr_split2.csv"  # Change this to your actual file path
df = pd.read_csv(file_path)

# Split the DataFrame
df1 = df.iloc[:3000]  # First 7000 rows
df2 = df.iloc[3000:]  # The rest

# Save to new CSV files
df1.to_csv("spanish_youtube_3_19_25_with_ilr_split2a.csv", index=False)
df2.to_csv("spanish_youtube_3_19_25_with_ilr_split2b.csv", index=False)

print("CSV has been split into two files: split_part1.csv and split_part2.csv")


CSV has been split into two files: split_part1.csv and split_part2.csv
