In [3]:
import pandas as pd
import numpy as np
import random

In [7]:
# Set random seeds for reproducibility.
# Two independent generators are used below, so BOTH must be seeded:
#   - NumPy's global RNG drives ages, departments and salaries;
#   - the stdlib `random` module drives names and the missing-name mask.
np.random.seed(42)
random.seed(42)

# Generate 1000 rows of data
n_rows = 1000

# Name pools, defined once instead of being rebuilt on every iteration.
first_names = ['John', 'Jane', 'Bob', 'Alice', 'Charlie', 'Diana', 'Ethan', 'Fiona', 'George', 'Hannah']
last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Rodriguez', 'Martinez']

# Create lists for each column
ids = list(range(1, n_rows + 1))
names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n_rows)]
ages = np.random.randint(22, 65, n_rows)  # upper bound is exclusive: 22..64
departments = np.random.choice(['IT', 'HR', 'Finance', 'Marketing', 'Sales', 'Operations'], n_rows)
salaries = np.random.randint(30000, 120000, n_rows)  # upper bound is exclusive: 30000..119999

# Introduce some missing values. Only Name is blanked out, so Age and Salary
# contain no NaN and keep their integer dtype.
for i in range(n_rows):
    if random.random() < 0.02:  # 2% chance of missing name
        names[i] = None

In [9]:
# Create DataFrame from the column lists/arrays generated in the previous cell
df = pd.DataFrame({
    'ID': ids,
    'Name': names,
    'Age': ages,
    'Department': departments,
    'Salary': salaries
})

# Introduce some duplicates: copy 50 distinct rows and give every copy a new
# ID (n_rows + source position + 1), so copies never collide with the original
# IDs 1..n_rows or with each other.
n_duplicates = 50
duplicate_indices = np.random.choice(n_rows, n_duplicates, replace=False)
duplicate_rows = df.iloc[duplicate_indices].copy()
duplicate_rows['ID'] = n_rows + duplicate_indices + 1
# One concat instead of appending row-by-row with the private
# `DataFrame._append`: same rows in the same order, without re-copying the
# whole frame on every iteration.
df = pd.concat([df, duplicate_rows], ignore_index=True)

# Shuffle the DataFrame (uses NumPy's global RNG state, since no random_state is given)
df = df.sample(frac=1).reset_index(drop=True)

# Save to CSV
df.to_csv('large_employee_data.csv', index=False)

print("CSV file 'large_employee_data.csv' with 1000+ rows has been generated.")
print(df.head())
print("\nDataFrame Info:")
# `info()` prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line).
df.info()
print("\nMissing values:")
print(df.isnull().sum())
print(f"\nTotal number of rows: {len(df)}")

CSV file 'large_employee_data.csv' with 1000+ rows has been generated.
    ID              Name  Age Department  Salary
0  497  Charlie Martinez   45         IT   39866
1  395       Diana Smith   49    Finance   73108
2  590       Diana Davis   41    Finance   76486
3  918     Charlie Jones   58         HR   36971
4  654  George Rodriguez   26  Marketing  103445

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          1050 non-null   int64 
 1   Name        1026 non-null   object
 2   Age         1050 non-null   int32 
 3   Department  1050 non-null   object
 4   Salary      1050 non-null   int32 
dtypes: int32(2), int64(1), object(2)
memory usage: 32.9+ KB
None

Missing values:
ID             0
Name          24
Age            0
Department     0
Salary         0
dtype: int64

Total number of rows: 1050
