In [18]:
import pandas as pd
import numpy as np

# Read the three dataset splits from CSV files in the working directory
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('val.csv')
test_df = pd.read_csv('test.csv')

# Preview the top rows of every split as a quick sanity check
for heading, split_df in (
    ("Training Data:", train_df),
    ("\nValidation Data:", val_df),
    ("\nTest Data:", test_df),
):
    print(heading)
    print(split_df.head())

# Report how many missing values each column of the training split has
print("Missing values in training data:")
print(train_df.isnull().sum())

# Replace missing 'text' entries with an empty string so the later
# string operations on this column (len, split) never see NaN.
#
# The result is assigned back to the column instead of calling
# `df['text'].fillna('', inplace=True)`: the inplace form operates on the
# object returned by `df['text']` (chained assignment), which emits a
# FutureWarning on pandas 2.x and does not update the DataFrame at all once
# Copy-on-Write is enabled (the default from pandas 3.0).
train_df['text'] = train_df['text'].fillna('')
val_df['text'] = val_df['text'].fillna('')
test_df['text'] = test_df['text'].fillna('')

# Record the character count of every resume as a new 'text_length' column
for split_df in (train_df, val_df, test_df):
    split_df['text_length'] = split_df['text'].apply(len)

# Summary statistics are computed on the training split only
train_lengths = train_df['text_length']
mean_length = train_lengths.mean()
median_length = train_lengths.median()
std_length = train_lengths.std()

print(f"Mean text length: {mean_length}")
print(f"Median text length: {median_length}")
print(f"Standard deviation of text length: {std_length}")

# Keep only the training resumes whose length is strictly above the mean
is_longer_than_mean = train_df['text_length'] > mean_length
long_resumes_df = train_df[is_longer_than_mean]

# Preview the selected rows
print("Resumes longer than mean text length:")
print(long_resumes_df.head())

def _count_unique_words(text):
    """Return the number of distinct whitespace-separated tokens in ``text``."""
    return len(set(text.split()))


# Attach the distinct-word count of each resume as a 'unique_words' column
for split_df in (train_df, val_df, test_df):
    split_df['unique_words'] = split_df['text'].apply(_count_unique_words)

# Preview the training split including the new column
print("Training Data with Unique Words Feature:")
print(train_df.head())

# Average resume length per job category, with 'job_category' kept as a
# regular column (as_index=False) rather than as the index
category_group = train_df.groupby('job_category', as_index=False)['text_length'].mean()

# Show the per-category averages
print("Mean Text Length by Job Category:")
print(category_group)

# Write each augmented split to its own new CSV file, without the index column
for out_name, split_df in (
    ('train_modified.csv', train_df),
    ('val_modified.csv', val_df),
    ('test_modified.csv', test_df),
):
    split_df.to_csv(out_name, index=False)

print("Modified data saved to new CSV files.")

# Read the saved training split back; the bare expression below is the
# cell's last statement, so the notebook renders it as the cell output
training_data = pd.read_csv('train_modified.csv')
training_data
# testing_data = pd.read_csv('test_modified.csv')
# testing_data 
# validation_data = pd.read_csv('val_modified.csv')
# validation_data


Training Data:
      file_name                                               text format  \
0  23955183.pdf  finance analyst summary strategic and analytic...    pdf   
1  12669075.pdf  branch administrator objective obtain a challe...    pdf   
2  24544244.pdf  passenger services officer professional summar...    pdf   
3  18422164.pdf  hr specialist summary resultsdriven profession...    pdf   
4  16066857.pdf  senior executive chef executive profile to see...    pdf   

  job_category  
0      FINANCE  
1      APPAREL  
2     AVIATION  
3           HR  
4         CHEF  

Validation Data:
      file_name                                               text format  \
0  17555081.pdf  instructional designer summary dedicated effic...    pdf   
1  22546476.pdf  safety manager summary safety manager with 16 ...    pdf   
2  20417897.pdf  executive assistant hr summary skillful and de...    pdf   
3  29075857.pdf  adjunct instructor skill highlights quality en...    pdf   
4  26410763.pdf  

Unnamed: 0,file_name,text,format,job_category,text_length,unique_words
0,23955183.pdf,finance analyst summary strategic and analytic...,pdf,FINANCE,5330,370
1,12669075.pdf,branch administrator objective obtain a challe...,pdf,APPAREL,4618,318
2,24544244.pdf,passenger services officer professional summar...,pdf,AVIATION,5962,439
3,18422164.pdf,hr specialist summary resultsdriven profession...,pdf,HR,6086,394
4,16066857.pdf,senior executive chef executive profile to see...,pdf,CHEF,6006,321
...,...,...,...,...,...,...
1733,27330027.pdf,director finance projects summary organized re...,pdf,FINANCE,4727,291
1734,28942221.pdf,construction inspector professional summary co...,pdf,CONSTRUCTION,5022,357
1735,69764348.pdf,staff sergeant e5 pavement construction equip...,pdf,CONSTRUCTION,10478,629
1736,22496394.pdf,cad designer summary personable cad designerdr...,pdf,DESIGNER,5543,301


In [1]:
# Create a machine learning model using this dataset

In [19]:
# NOTE(review): this entire cell is commented out, so nothing below executes.
# It sketches a pipeline that embeds each resume with BERT (mean of the last
# hidden state) and fits a LogisticRegression on those embeddings.
# Things to address if it is re-enabled:
#   - `np.vstack` is used below, but numpy is not in this cell's import list;
#     it is only imported in the first cell of the notebook.
#   - `TfidfVectorizer` is imported but never referenced in this cell.
#   - `embed_text` calls the model without `torch.no_grad()` and only
#     `.detach()`es afterwards -- presumably autograd bookkeeping is still
#     done on every call; TODO confirm and wrap in `torch.no_grad()`.
#   - The data is re-split from 'train_modified.csv' with `train_test_split`
#     although 'val_modified.csv' and 'test_modified.csv' are also written by
#     the first cell -- TODO confirm which evaluation split is intended.

# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report
# from transformers import BertTokenizer, BertModel
# import torch

# # Load data
# data = pd.read_csv('train_modified.csv')
# X = data['text']
# y = data['job_category']

# # Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Tokenization and embedding using BERT
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# def embed_text(text):
#     inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
#     outputs = model(**inputs)
#     embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
#     return embeddings

# X_train_embeddings = [embed_text(text) for text in X_train]
# X_test_embeddings = [embed_text(text) for text in X_test]

# # Convert to arrays
# X_train_embeddings = np.vstack(X_train_embeddings)
# X_test_embeddings = np.vstack(X_test_embeddings)

# # Train a classifier
# clf = LogisticRegression(max_iter=1000)
# clf.fit(X_train_embeddings, y_train)

# # Predict and evaluate
# y_pred = clf.predict(X_test_embeddings)
# print(classification_report(y_test, y_pred))
