
# New Hugging Face Data Cleaning


## Setup

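Import the required libraries and load the resume dataset directly from the Hugging Face Hub. The cleaned result is written to `Resume_HF_clean.csv` at the end of the notebook.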


In [None]:
import pandas as pd
import re
from collections import Counter
from datasets import load_dataset

In [None]:
# Load the resume dataset from the Hugging Face Hub.
ds = load_dataset("InferencePrince555/Resume-Dataset")

# Use the dataset's own to_pandas() conversion for the 'train' split rather
# than handing the Dataset object to the pd.DataFrame constructor.
df = ds['train'].to_pandas()
print("Initial shape:", df.shape)
display(df.head())

Initial shape: (32481, 3)


Unnamed: 0,instruction,input,Resume_test
0,Generate a Resume for a Accountant Job,,ACCOUNTANT Professional Summary Results orient...
1,Generate a Resume for a Accountant Job,,STAFF ACCOUNTANT Summary Flexible Accountant w...
2,Generate a Resume for a Accountant Job,,STAFF ACCOUNTANT Summary Highly analytical and...
3,Generate a Resume for a Accountant Job,,SENIOR ACCOUNTANT Summary A highly competent m...
4,Generate a Resume for a Accountant Job,,SENIOR ACCOUNTANT Summary 11 years experience ...


In [None]:
# Replace missing entries in each text column with an empty string so that
# no null values remain in these columns before they are combined.
for column in ('Resume_test', 'instruction', 'input'):
    df[column] = df[column].fillna('')



In [None]:

# Build one text field per row: instruction, input and resume body,
# each cast to str and joined with single spaces (in that order).
instruction_text = df['instruction'].astype(str)
input_text = df['input'].astype(str)
resume_text = df['Resume_test'].astype(str)
df['CleanText'] = instruction_text + " " + input_text + " " + resume_text


In [None]:
# Clean the text
def clean_text(text):
    """Normalize raw text into lowercase, space-separated alphanumeric tokens.

    Steps, in order:
      1. Coerce the input to ``str`` and lowercase it.
      2. Replace HTML comments and tag-shaped spans (``<div ...>``, ``</p>``,
         ``<!doctype ...>``, ``<?xml ...?>``) with a space.
      3. Replace every character other than ``a-z``, ``0-9`` and space with a
         space. This covers punctuation, newlines and non-ASCII letters.
      4. Collapse runs of whitespace to one space and trim both ends.

    Args:
        text: Any object; it is converted with ``str()`` first, so ``None``
            becomes the literal text ``"none"``.

    Returns:
        str: The cleaned text (possibly empty).
    """
    text = str(text).lower()
    # A tag must start with a letter right after '<' (optionally preceded by
    # '!', '/' or '?'). This keeps comparisons such as "<50k and >30k" from
    # being treated as one big tag and deleted. '[^<>]*' also spans newlines,
    # so tags broken across lines are removed as a whole. Comments are matched
    # separately because their body may itself contain '>'.
    text = re.sub(r'<!--.*?-->|<[!/?]?[a-z][^<>]*>', ' ', text, flags=re.DOTALL)
    text = re.sub(r'[^a-z0-9 ]', ' ', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Run the cleaner element-wise over the combined text, overwriting the column.
df['CleanText'] = df['CleanText'].map(clean_text)


In [None]:
# Whitespace tokenization (for EDA / inspection): keep the token list and
# its length per row.
df['tokens'] = df['CleanText'].map(str.split)
df['token_len'] = df['tokens'].map(len)

# Summary statistics of the per-row token counts.
token_counts = df['token_len']
print("Average tokens per resume:", token_counts.mean())
print("Max tokens:", token_counts.max())
print("Min tokens:", token_counts.min())


Average tokens per resume: 910.6359717988978
Max tokens: 12612
Min tokens: 8


In [None]:
# Optional: top 20 words
# Count tokens one row at a time instead of joining the whole corpus into a
# single string and splitting it into one list. The resulting counts and
# their first-seen order are the same, without the two corpus-sized temporaries.
word_freq = Counter()
for cleaned in df['CleanText']:
    word_freq.update(cleaned.split())
print("Top 20 most common words:", word_freq.most_common(20))

# Save the cleaned dataset: every column of df, without the index.
# NOTE(review): if the list-valued `tokens` column exists at this point, CSV
# stores each list as its text representation. Confirm readers expect that.
df.to_csv(path_or_buf="Resume_HF_clean.csv", index=False)
print("Cleaned Hugging Face dataset saved as Resume_HF_clean.csv")


Top 20 most common words: [('and', 1670847), ('to', 786130), ('the', 657343), ('for', 511771), ('of', 497240), ('in', 487877), ('with', 311326), ('using', 269430), ('a', 266002), ('on', 207731), ('data', 182549), ('web', 170322), ('as', 146200), ('experience', 135532), ('application', 128957), ('management', 124817), ('development', 122145), ('used', 116935), ('project', 116639), ('database', 115087)]
Cleaned Hugging Face dataset saved as Resume_HF_clean.csv
