### Task 1.2: Use the datasets library from Hugging Face to download the arabic-generated-abstracts dataset directly into a Python environment (using Google Colab).

In [1]:
# One-time setup: uncomment the lines below on a fresh runtime where the
# packages imported by the following cells are not installed yet.
# !pip install datasets
# !pip install python-dotenv


In [2]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Pull variables from a local .env file (if one exists) into the environment.
load_dotenv()

# The access token is expected in the HF_TOKEN environment variable.
# An empty value is treated the same as an unset one.
hf_token = os.getenv("HF_TOKEN") or None
if hf_token is None:
    # Make the fallback visible: with no token, login() asks for one
    # interactively, which stalls an unattended "Run All".
    print("HF_TOKEN is not set; falling back to interactive Hugging Face login.")
login(token=hf_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

# Accessing this dataset requires being logged in to the Hugging Face Hub
# (e.g. via `huggingface-cli login`).
repo_id = "KFUPM-JRCAI/arabic-generated-abstracts"
dataset = load_dataset(repo_id)
print(dataset)


DatasetDict({
    by_polishing: Dataset({
        features: ['original_abstract', 'allam_generated_abstract', 'jais_generated_abstract', 'llama_generated_abstract', 'openai_generated_abstract'],
        num_rows: 2851
    })
    from_title: Dataset({
        features: ['original_abstract', 'allam_generated_abstract', 'jais_generated_abstract', 'llama_generated_abstract', 'openai_generated_abstract'],
        num_rows: 2963
    })
    from_title_and_content: Dataset({
        features: ['original_abstract', 'allam_generated_abstract', 'jais_generated_abstract', 'llama_generated_abstract', 'openai_generated_abstract'],
        num_rows: 2574
    })
})


### Task 1.3: Perform initial data exploration:
#### 1- Load and inspect the dataset structure (columns, data types).


In [4]:
# Look at a single split ('by_polishing') in two ways.
polishing = dataset['by_polishing']

# Column names together with their declared data types
print("\nFeatures in 'by_polishing':", polishing.features, sep="\n")

# Compact summary of the split: column list and number of rows
print("\nDataset info for 'by_polishing':", polishing, sep="\n")





Features in 'by_polishing':
{'original_abstract': Value('string'), 'allam_generated_abstract': Value('string'), 'jais_generated_abstract': Value('string'), 'llama_generated_abstract': Value('string'), 'openai_generated_abstract': Value('string')}

Dataset info for 'by_polishing':
Dataset({
    features: ['original_abstract', 'allam_generated_abstract', 'jais_generated_abstract', 'llama_generated_abstract', 'openai_generated_abstract'],
    num_rows: 2851
})


#### 2- Check the distribution of the target variable (label: human vs. AI-generated) for dataset["by_polishing"].


In [5]:
# Choose one split (e.g., by_polishing)
split1 = dataset["by_polishing"]

# Every row carries one human-written abstract plus one abstract per generator
# model, so both label counts follow from the row count and the column list.
# This avoids fetching whole text columns only to measure their length.
ai_columns = [name for name in split1.column_names
              if name.endswith("_generated_abstract")]

# Count human-written abstracts (one per row)
num_human = split1.num_rows

# Count AI-generated abstracts (one per generator column per row)
num_ai = split1.num_rows * len(ai_columns)

print("Number of human abstracts:", num_human)
print("Number of AI-generated abstracts:", num_ai)

# Distribution ratio
total = num_human + num_ai
print("Human %:", round(num_human / total * 100, 2))
print("AI %:", round(num_ai / total * 100, 2))

Number of human abstracts: 2851
Number of AI-generated abstracts: 11404
Human %: 20.0
AI %: 80.0


#### 3- Assess data quality: check for missing values, duplicates, and inconsistencies:


Missing values → any None/NaN in columns

Duplicates → same abstract appearing multiple times

Inconsistencies → e.g. empty or whitespace-only strings (" ") or otherwise unusual data

In [6]:
import pandas as pd
# Convert to pandas for easier checks. to_pandas() is the dataset library's own
# converter; it replaces building the frame by iterating over the split.
df = split1.to_pandas()

# 1. Missing values
print("Missing values per column:")
print(df.isnull().sum())
print("_________________________________________")

# 2. Duplicates
print("\nNumber of duplicate rows:", df.duplicated().sum())

# Also check duplicates in each column separately
for col in df.columns:
    print(f"Duplicates in column {col}: {df[col].duplicated().sum()}")
print("_________________________________________")


# 3. Inconsistencies: empty strings or only spaces
# Vectorized string ops instead of a per-element Python lambda.
for col in df.columns:
    empty_count = df[col].astype(str).str.strip().eq("").sum()
    print(f"Empty/blank values in column {col}: {empty_count}")


Missing values per column:
original_abstract            0
allam_generated_abstract     0
jais_generated_abstract      0
llama_generated_abstract     0
openai_generated_abstract    0
dtype: int64
_________________________________________

Number of duplicate rows: 0
Duplicates in column original_abstract: 0
Duplicates in column allam_generated_abstract: 0
Duplicates in column jais_generated_abstract: 0
Duplicates in column llama_generated_abstract: 0
Duplicates in column openai_generated_abstract: 0
_________________________________________
Empty/blank values in column original_abstract: 0
Empty/blank values in column allam_generated_abstract: 0
Empty/blank values in column jais_generated_abstract: 0
Empty/blank values in column llama_generated_abstract: 0
Empty/blank values in column openai_generated_abstract: 0


#### Check the distribution of the target variable (label: human vs. AI-generated) for dataset["from_title"].

In [7]:
split2 = dataset["from_title"]

# Every row carries one human-written abstract plus one abstract per generator
# model, so both label counts follow from the row count and the column list.
# This avoids fetching whole text columns only to measure their length.
ai_columns = [name for name in split2.column_names
              if name.endswith("_generated_abstract")]

# Count human-written abstracts (one per row)
num_human = split2.num_rows

# Count AI-generated abstracts (one per generator column per row)
num_ai = split2.num_rows * len(ai_columns)

print("Number of human abstracts:", num_human)
print("Number of AI-generated abstracts:", num_ai)

# Distribution ratio
total = num_human + num_ai
print("Human %:", round(num_human / total * 100, 2))
print("AI %:", round(num_ai / total * 100, 2))

Number of human abstracts: 2963
Number of AI-generated abstracts: 11852
Human %: 20.0
AI %: 80.0


In [8]:
import pandas as pd
# Convert to pandas for easier checks. to_pandas() is the dataset library's own
# converter; it replaces building the frame by iterating over the split.
df = split2.to_pandas()

# 1. Missing values
print("Missing values per column:")
print(df.isnull().sum())
print("_________________________________________")

# 2. Duplicates
print("\nNumber of duplicate rows:", df.duplicated().sum())

# Also check duplicates in each column separately
for col in df.columns:
    print(f"Duplicates in column {col}: {df[col].duplicated().sum()}")
print("_________________________________________")


# 3. Inconsistencies: empty strings or only spaces
# Vectorized string ops instead of a per-element Python lambda.
for col in df.columns:
    empty_count = df[col].astype(str).str.strip().eq("").sum()
    print(f"Empty/blank values in column {col}: {empty_count}")


Missing values per column:
original_abstract            0
allam_generated_abstract     0
jais_generated_abstract      0
llama_generated_abstract     0
openai_generated_abstract    0
dtype: int64
_________________________________________

Number of duplicate rows: 0
Duplicates in column original_abstract: 0
Duplicates in column allam_generated_abstract: 0
Duplicates in column jais_generated_abstract: 0
Duplicates in column llama_generated_abstract: 0
Duplicates in column openai_generated_abstract: 0
_________________________________________
Empty/blank values in column original_abstract: 0
Empty/blank values in column allam_generated_abstract: 0
Empty/blank values in column jais_generated_abstract: 0
Empty/blank values in column llama_generated_abstract: 0
Empty/blank values in column openai_generated_abstract: 0


#### Check the distribution of the target variable (label: human vs. AI-generated) for dataset["from_title_and_content"].

In [9]:
split3 = dataset["from_title_and_content"]

# Every row carries one human-written abstract plus one abstract per generator
# model, so both label counts follow from the row count and the column list.
# This avoids fetching whole text columns only to measure their length.
ai_columns = [name for name in split3.column_names
              if name.endswith("_generated_abstract")]

# Count human-written abstracts (one per row)
num_human = split3.num_rows

# Count AI-generated abstracts (one per generator column per row)
num_ai = split3.num_rows * len(ai_columns)

print("Number of human abstracts:", num_human)
print("Number of AI-generated abstracts:", num_ai)

# Distribution ratio
total = num_human + num_ai
print("Human %:", round(num_human / total * 100, 2))
print("AI %:", round(num_ai / total * 100, 2))

Number of human abstracts: 2574
Number of AI-generated abstracts: 10296
Human %: 20.0
AI %: 80.0


In [10]:
import pandas as pd
# Convert to pandas for easier checks. to_pandas() is the dataset library's own
# converter; it replaces building the frame by iterating over the split.
df = split3.to_pandas()

# 1. Missing values
print("Missing values per column:")
print(df.isnull().sum())
print("_________________________________________")

# 2. Duplicates
print("\nNumber of duplicate rows:", df.duplicated().sum())

# Also check duplicates in each column separately
for col in df.columns:
    print(f"Duplicates in column {col}: {df[col].duplicated().sum()}")
print("_________________________________________")


# 3. Inconsistencies: empty strings or only spaces
# Vectorized string ops instead of a per-element Python lambda.
for col in df.columns:
    empty_count = df[col].astype(str).str.strip().eq("").sum()
    print(f"Empty/blank values in column {col}: {empty_count}")


Missing values per column:
original_abstract            0
allam_generated_abstract     0
jais_generated_abstract      0
llama_generated_abstract     0
openai_generated_abstract    0
dtype: int64
_________________________________________

Number of duplicate rows: 0
Duplicates in column original_abstract: 0
Duplicates in column allam_generated_abstract: 0
Duplicates in column jais_generated_abstract: 3
Duplicates in column llama_generated_abstract: 0
Duplicates in column openai_generated_abstract: 0
_________________________________________
Empty/blank values in column original_abstract: 0
Empty/blank values in column allam_generated_abstract: 0
Empty/blank values in column jais_generated_abstract: 0
Empty/blank values in column llama_generated_abstract: 0
Empty/blank values in column openai_generated_abstract: 0
