In [5]:
import pandas as pd

# CSV file paths grouped by language.
# Runs of numbered "<stem>_partN.csv" files are generated from their filename
# stem; the resulting lists are in ascending part order.
_CSV_DIR = "ILR_CSVs"


def _parts(stem, count):
    """Return the paths ``ILR_CSVs/<stem>_part1.csv`` .. ``_part<count>.csv``, in order."""
    return [f"{_CSV_DIR}/{stem}_part{number}.csv" for number in range(1, count + 1)]


csv_files = {
    "arabic": [
        *_parts("arabic_news_articles_with_ilr_7_19_24", 1),
        *_parts("translated_arabic_news_07182024", 1),
        *_parts("translated_arabic_news_articles_with_ilr10012024", 2),
    ],
    "chinese": _parts("chinese_news_articles_with_ilr20241119", 11),
    "english": _parts("english_news_articles_with_ilr7182024", 1),
    "french": [
        *_parts("french_news_articles_with_ilr10012024", 7),
        *_parts("translated_french_news7192024", 1),
    ],
    "german": [
        *_parts("german_articles_with_ilr09202024", 10),
        *_parts("translated_german_news09092024", 1),
        *_parts("translated_german_news", 1),
    ],
    "russian": _parts("russian_news_articles_with_ilr10012024", 7),
    "spanish": [
        *_parts("spanish_news_articles_with_ilr10012024", 7),
        *_parts("spanish_news_articles_with_ilr7182024", 2),
        *_parts("spanish_news_articles_with_ilr7192024", 1),
        *_parts("translated_spanish_news_07192024", 2),
        *_parts("translated_spanish_news_articles_with_ilr10012024", 7),
    ],
    # The Turkish stem itself ends in "_part1"; the files are "..._part1_part1/2.csv".
    "turkish": _parts("turkish_news_part1", 2),
}

# Preview pass: print a banner per language group, then the first rows of
# every CSV in that group so the column layout can be inspected.
for lang_name, lang_paths in csv_files.items():
    banner = f"\n--- {lang_name.upper()} ARTICLES ---\n"
    print(banner)

    for csv_path in lang_paths:
        try:
            frame = pd.read_csv(csv_path)

            # File name first, then the head of the frame, then a spacer
            print(f"File: {csv_path}")
            print(frame.head())
            print("\n")
        except Exception as exc:
            # Best effort: report the failure and move on to the next file
            print(f"Error loading {csv_path}: {exc}\n")

# Keys this cell attaches to every record. A CSV column carrying one of these
# names (some of the files have their own "language" column) would otherwise
# overwrite the metadata when the row values are merged into the record.
RESERVED_KEYS = ("language", "file", "error")

# List to store results: one dict per previewed row, or one dict per file
# that failed to load (with the message under "error").
data = []

# Iterate over each language and its respective files
for language, files in csv_files.items():
    for file_path in files:
        try:
            # The whole file is parsed, not just the first rows, so a file
            # that is malformed further down is still recorded as an error.
            df = pd.read_csv(file_path)

            # Collect the first rows of the DataFrame, tagged with the group
            # key and file name. Colliding CSV columns are kept under a
            # "csv_" prefix instead of replacing the metadata.
            for record in df.head().to_dict(orient="records"):
                row_values = {
                    (f"csv_{column}" if column in RESERVED_KEYS else column): value
                    for column, value in record.items()
                }
                data.append({"language": language, "file": file_path, **row_values})
        except Exception as e:
            # Best effort: keep going and record the failure for this file
            data.append({"language": language, "file": file_path, "error": str(e)})

# Save the results to a new CSV file
output_df = pd.DataFrame(data)
output_df.to_csv("aggregated_results.csv", index=False)

print("Results saved to aggregated_results.csv")



--- ARABIC ARTICLES ---

File: ILR_CSVs/arabic_news_articles_with_ilr_7_19_24_part1.csv
                                               title  \
0  موعد مباراة إيطاليا وسويسرا في ثمن نهائي يورو ...   
1  موعد مباراة تركيا والنمسا في ثمن نهائي يورو 20...   
2  موعد مباراة إيطاليا وكرواتيا في يورو 2024 والق...   
3  موعد مباراة البرتغال ضد سلوفينيا في يورو 2024 ...   
4  شركة Corsair تكشف عن كرسي TC500 LUXE E-Sports ...   

                                             summary  \
0  نستعرض في السطور التالية موعد مباراة إيطاليا ض...   
1  تلعب تركيا مع النمسا في ثمن نهائي بطولة أمم أو...   
2  في الجولة الأخيرة من دور المجموعات لكأس أمم أو...   
3  نسلط الضوء في السطور التالية على موعد مباراة م...   
4  هذا الموضوع شركة Corsair تكشف عن كرسي TC500 LU...   

                                                text language  \
0  () ()\n2024 .\n 29 / .\n 19:00 .\n " " 2024 4 ...   Arabic   
1  () ()\n14 / .\n " " 1- 3-1 3-2 .\nlist of 2 it...   Arabic   
2  22:00 ()\n24/6/2024-| : 24/6/202403:22 

In [9]:

# French CSV files whose headers should be inspected
french_csv_files = [
    f"ILR_CSVs/french_news_articles_with_ilr10012024_part{part}.csv"
    for part in range(1, 8)
] + ["ILR_CSVs/translated_french_news7192024_part1.csv"]


def _header_or_error(csv_path):
    """Return the column names of ``csv_path``, or an ``"Error: ..."`` string if reading fails."""
    try:
        # nrows=0: read only the header for efficiency
        return pd.read_csv(csv_path, nrows=0).columns.tolist()
    except Exception as exc:
        return f"Error: {exc}"


# Map every file path to its list of column names (or to the error text)
french_columns = {csv_path: _header_or_error(csv_path) for csv_path in french_csv_files}

french_columns

{'ILR_CSVs/french_news_articles_with_ilr10012024_part1.csv': ['title',
  'summary',
  'text',
  'translated',
  'link',
  'french',
  'Unnamed: 6',
  'ilr_quantized',
  'ilr_range',
  'ilr_regressed'],
 'ILR_CSVs/french_news_articles_with_ilr10012024_part2.csv': ['title',
  'summary',
  'text',
  'translated',
  'link',
  'french',
  'Unnamed: 6',
  'ilr_quantized',
  'ilr_range',
  'ilr_regressed'],
 'ILR_CSVs/french_news_articles_with_ilr10012024_part3.csv': ['title',
  'summary',
  'text',
  'translated',
  'link',
  'french',
  'Unnamed: 6',
  'ilr_quantized',
  'ilr_range',
  'ilr_regressed'],
 'ILR_CSVs/french_news_articles_with_ilr10012024_part4.csv': ['title',
  'summary',
  'text',
  'translated',
  'link',
  'french',
  'Unnamed: 6',
  'ilr_quantized',
  'ilr_range',
  'ilr_regressed'],
 'ILR_CSVs/french_news_articles_with_ilr10012024_part5.csv': ['title',
  'summary',
  'text',
  'translated',
  'link',
  'french',
  'Unnamed: 6',
  'ilr_quantized',
  'ilr_range',
  'ilr_reg

In [11]:
import pandas as pd

# List of French CSV files
french_csv_files = [
    "ILR_CSVs/french_news_articles_with_ilr10012024_part1.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part2.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part3.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part4.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part5.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part6.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part7.csv",
    "ILR_CSVs/translated_french_news7192024_part1.csv"
]


def rename_csv_column(file_path, old="translated", new="translated_summary"):
    """Rename column ``old`` to ``new`` in the CSV at ``file_path``, in place.

    The file is rewritten only when ``old`` is present in its header, so a
    file that needs no change is left untouched on disk.

    Args:
        file_path: Path of the CSV file to update.
        old: Name of the column to rename.
        new: Replacement column name.

    Returns:
        True if the column was renamed and the file rewritten, False if
        ``old`` was not among the columns.

    Raises:
        Any exception raised by ``pandas.read_csv`` or ``DataFrame.to_csv``
        (missing file, tokenizing error, ...) propagates to the caller.
    """
    # Read every cell as text and disable NaN detection, so writing the
    # frame back does not pass the values through numeric or NaN conversion.
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

    if old not in df.columns:
        return False

    df.rename(columns={old: new}).to_csv(file_path, index=False)
    return True


# Process each file
for file_path in french_csv_files:
    try:
        # The message is printed only after the file was written successfully
        if rename_csv_column(file_path):
            print(f"Renamed 'translated' to 'translated_summary' in {file_path}")
    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"Error processing {file_path}: {e}")


Error processing ILR_CSVs/french_news_articles_with_ilr10012024_part1.csv: Error tokenizing data. C error: EOF inside string starting at row 1233
Renamed 'translated' to 'translated_summary' in ILR_CSVs/french_news_articles_with_ilr10012024_part2.csv
Renamed 'translated' to 'translated_summary' in ILR_CSVs/french_news_articles_with_ilr10012024_part3.csv
Renamed 'translated' to 'translated_summary' in ILR_CSVs/french_news_articles_with_ilr10012024_part4.csv
Renamed 'translated' to 'translated_summary' in ILR_CSVs/french_news_articles_with_ilr10012024_part5.csv
Renamed 'translated' to 'translated_summary' in ILR_CSVs/french_news_articles_with_ilr10012024_part6.csv
Renamed 'translated' to 'translated_summary' in ILR_CSVs/french_news_articles_with_ilr10012024_part7.csv


In [13]:
import pandas as pd

# List of French CSV files
french_csv_files = [
    "ILR_CSVs/french_news_articles_with_ilr10012024_part1.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part2.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part3.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part4.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part5.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part6.csv",
    "ILR_CSVs/french_news_articles_with_ilr10012024_part7.csv",
    "ILR_CSVs/translated_french_news7192024_part1.csv"
]


def rename_csv_column(file_path, old="translated", new="translated_summary"):
    """Rename column ``old`` to ``new`` in the CSV at ``file_path``, in place.

    The file is rewritten only when ``old`` is present in its header, so a
    file that needs no change is left untouched on disk.

    Args:
        file_path: Path of the CSV file to update.
        old: Name of the column to rename.
        new: Replacement column name.

    Returns:
        True if the column was renamed and the file rewritten, False if
        ``old`` was not among the columns.

    Raises:
        Any exception raised by ``pandas.read_csv`` or ``DataFrame.to_csv``
        (missing file, tokenizing error, ...) propagates to the caller.
    """
    # Read every cell as text and disable NaN detection, so writing the
    # frame back does not pass the values through numeric or NaN conversion.
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

    if old not in df.columns:
        return False

    df.rename(columns={old: new}).to_csv(file_path, index=False)
    return True


# Process each file
for file_path in french_csv_files:
    try:
        # The message is printed only after the file was written successfully
        if rename_csv_column(file_path):
            print(f"Renamed 'translated' to 'translated_summary' in {file_path}")
    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"Error processing {file_path}: {e}")


Error processing ILR_CSVs/french_news_articles_with_ilr10012024_part1.csv: Error tokenizing data. C error: EOF inside string starting at row 1233
