In [1]:
import pandas as pd

In [2]:
# Read the employee background records from disk and echo the leading rows
# as a quick sanity check that the file parsed into the expected columns.
df = pd.read_csv('employee_background.csv')
first_rows = df.head()
print("Dataset loaded. First 5 rows:\n", first_rows)

Dataset loaded. First 5 rows:
                 Name    Degree           Institution  Graduation_Year  \
0    Miss Julie Dunn       PhD                   MIT             2014   
1       Joel Jenkins  Bachelor    Harvard University             2010   
2     Jennifer Perez  Bachelor       Yale University             1996   
3  Jessica Hernandez    Master  University of Oxford             2014   
4       Lauren Smith    Master  Princeton University             1993   

          Employer   Job_Title                  Contact_Email  
0  Consulting Firm  Consultant  christophermartin@example.net  
1      Media House     Analyst             npitts@example.org  
2      Media House     Analyst    melissamitchell@example.com  
3      Finance Ltd     Manager          ypeterson@example.com  
4  Education Group  Consultant         martineric@example.org  


In [3]:
# Tally the null entries in every column and print the per-column totals.
null_counts = df.isna().sum()
print("\nMissing values:\n", null_counts)


Missing values:
 Name               0
Degree             0
Institution        0
Graduation_Year    0
Employer           0
Job_Title          0
Contact_Email      0
dtype: int64


In [4]:
# Force Graduation_Year to a numeric dtype. With errors='coerce', any value
# that cannot be parsed as a number becomes NaN instead of raising.
numeric_years = pd.to_numeric(df['Graduation_Year'], errors='coerce')
df['Graduation_Year'] = numeric_years

# Report how many entries are NaN once the conversion has been applied.
nan_year_count = df['Graduation_Year'].isnull().sum()
print("\nNaN values in Graduation_Year after conversion:", nan_year_count)


NaN values in Graduation_Year after conversion: 0


In [5]:
# Upper bound for a plausible graduation year.
# NOTE(review): hard-coded; update (or derive from the run date) when this
# notebook is re-run in a later year.
current_year = 2025

# A graduation year is invalid when it lies in the future OR when it is
# missing. The missing case must be tested explicitly: any comparison with
# NaN evaluates to False, so `Graduation_Year > current_year` alone would
# silently treat NaN years (e.g. values coerced by pd.to_numeric) as valid.
df['Invalid_Graduation_Year'] = (
    df['Graduation_Year'].isna() | (df['Graduation_Year'] > current_year)
)

# Institutions accepted as accredited; matching below is exact (case- and
# whitespace-sensitive) membership in this list.
# NOTE(review): the data preview contains institutions that are not listed
# here (e.g. 'Yale University', 'Princeton University'), so those rows will be
# marked invalid — confirm the list is complete.
accredited_institutions = ['Harvard University', 'Stanford University', 'MIT', 'University of Oxford', 'University of Cambridge']

# True where the row's Institution appears in the accredited list.
df['Institution_Valid'] = df['Institution'].isin(accredited_institutions)

In [6]:
# A row passes only when its graduation year is not flagged invalid AND its
# institution is on the accredited list; every other row needs review.
passes_checks = ~df['Invalid_Graduation_Year'] & df['Institution_Valid']
needs_review = ~passes_checks

# Default every row to 'Verified', then overwrite the rows that need review.
df['Verification_Status'] = 'Verified'
df.loc[needs_review, 'Verification_Status'] = 'Flagged'

In [7]:
# Persist the annotated frame (without the index column), then print a
# confirmation followed by the number of rows in each verification status.
output_path = 'employee_background_validated.csv'
df.to_csv(output_path, index=False)
print("\nValidated dataset saved as 'employee_background_validated.csv'")

status_counts = df['Verification_Status'].value_counts()
print("\nVerification status counts:\n", status_counts)


Validated dataset saved as 'employee_background_validated.csv'

Verification status counts:
 Verification_Status
Verified    25013
Flagged     24987
Name: count, dtype: int64
