# In [1]:
import pandas as pd
import numpy as np

# Read the train / validation / test splits from CSV files in the
# current working directory.
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('val.csv')
test_df = pd.read_csv('test.csv')

# Preview the head of every split, each under its own heading, as a
# quick sanity check that the files loaded as expected.
for heading, split_df in (
    ("Training Data:", train_df),
    ("\nValidation Data:", val_df),
    ("\nTest Data:", test_df),
):
    print(heading)
    print(split_df.head())

# Report the number of missing values per column of the training split
print("Missing values in training data:")
print(train_df.isnull().sum())

# Replace missing 'text' entries with an empty string in every split, so
# that the later len() / str.split() feature steps only ever see strings.
#
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on the result of df['text'] is a chained
# in-place operation on an intermediate object: pandas 2.x emits a
# FutureWarning for it, and under Copy-on-Write it leaves the DataFrame
# unchanged, so the NaN values would silently survive.
train_df['text'] = train_df['text'].fillna('')
val_df['text'] = val_df['text'].fillna('')
test_df['text'] = test_df['text'].fillna('')

# Add a 'text_length' column (len() of each 'text' value) to every split
for split_df in (train_df, val_df, test_df):
    split_df['text_length'] = split_df['text'].apply(len)

# Summary statistics are computed on the training split only
train_lengths = train_df['text_length']
mean_length = train_lengths.mean()
median_length = train_lengths.median()
std_length = train_lengths.std()

print(f"Mean text length: {mean_length}")
print(f"Median text length: {median_length}")
print(f"Standard deviation of text length: {std_length}")

# Select the training rows whose text is strictly longer than the mean
# training text length
long_resumes_df = train_df.loc[train_df['text_length'].gt(mean_length)]

# Preview the selected rows
print("Resumes longer than mean text length:")
print(long_resumes_df.head())

def _unique_word_count(text):
    """Return the number of distinct whitespace-separated tokens in text."""
    return len(set(text.split()))


# Add a 'unique_words' column to every split
for split_df in (train_df, val_df, test_df):
    split_df['unique_words'] = split_df['text'].apply(_unique_word_count)

# Preview the training split now that it carries the new column
print("Training Data with Unique Words Feature:")
print(train_df.head())

# Mean training text length per 'job_category'; reset_index() turns the
# group key back into an ordinary column of the resulting DataFrame.
lengths_by_category = train_df.groupby('job_category')['text_length']
category_group = lengths_by_category.mean().reset_index()

# Show the per-category means
print("Mean Text Length by Job Category:")
print(category_group)

# Write each split, including the added feature columns, to its own new
# CSV file; index=False keeps the row index out of the output.
for split_df, out_path in (
    (train_df, 'train_modified.csv'),
    (val_df, 'val_modified.csv'),
    (test_df, 'test_modified.csv'),
):
    split_df.to_csv(out_path, index=False)

print("Modified data saved to new CSV files.")
# Read the three modified CSV files back into fresh DataFrames
training_data = pd.read_csv('train_modified.csv')
testing_data = pd.read_csv('test_modified.csv')
validation_data = pd.read_csv('val_modified.csv')

# Bare expressions kept from the notebook: a notebook renders only the
# last expression of a cell, and in a plain script these lines have no
# visible effect.
training_data
testing_data
validation_data
