In [None]:
#  AI/ML (Python + NLP) — Week 2: Text Preprocessing

**Assignment:**  
1. Collect a sample dataset of user skills + task descriptions.  
2. Preprocess text: remove punctuation & lowercase, tokenize, remove stopwords, apply stemming/lemmatization.  
3. Generate embeddings (OpenAI or Hugging Face).

**Deliverables:**  
- `Week2_Text_Preprocessing.ipynb` (this notebook)  
- `processed_skills_single.csv`  
- `processed_skills_single.json`

**Example given in assignment:**  
Input: *"Looking for a React Native developer with Firebase skills"*  
Output: *["react", "native", "developer", "firebase", "skill"]*


In [12]:
# Step 1: the sample data -- one task description, taken from the assignment example.
dataset = [
    dict(
        id=1,
        text="Looking for a React Native developer with Firebase skills",
    ),
]

# Show every record as "<id> : <text>" so the raw input is visible before processing.
for record in dataset:
    print(record["id"], ":", record["text"])


1 : Looking for a React Native developer with Firebase skills


In [13]:
# Cleaning the text
import re  # regular expressions

def clean_text(s):
    """Normalize raw text so it can be split into tokens.

    Lowercases the input, replaces every character that is not a letter,
    digit, or whitespace with a space, then collapses whitespace runs into
    single spaces and trims both ends.

    Letters and digits are matched Unicode-aware, so accented or non-Latin
    words survive intact instead of being chopped apart. For pure-ASCII
    input the result is the same as with an ASCII-only ``[a-z0-9]`` filter.

    Args:
        s: The raw text to clean.

    Returns:
        The cleaned, lowercase, single-space-separated string (may be empty).
    """
    # Lowercase first so later stages can compare tokens case-insensitively.
    s = s.lower()
    # `\w` covers Unicode letters/digits plus "_"; the `|_` alternative turns
    # the underscore into a space too, because it is punctuation here.
    s = re.sub(r"[^\w\s]|_", " ", s)
    # Collapse the whitespace runs left behind by the replacement and trim.
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Try the cleaner on the first record and print the text before and after.
first_text = dataset[0]["text"]
cleaned = clean_text(first_text)
print("Original:", first_text)
print("Cleaned :", cleaned)


Original: Looking for a React Native developer with Firebase skills
Cleaned : looking for a react native developer with firebase skills


In [14]:
# Tokenization
def tokenize(s):
    """Split text into a list of tokens on runs of whitespace."""
    # No explicit separator: consecutive whitespace counts as one delimiter,
    # and leading/trailing whitespace never yields empty tokens.
    words = s.split(None)
    return words

# Tokenize the cleaned string and show the resulting token list.
tokens = tokenize(cleaned)
print("Tokens:", tokens)


Tokens: ['looking', 'for', 'a', 'react', 'native', 'developer', 'with', 'firebase', 'skills']


In [15]:
# Stopword removal. The default list is intentionally small; pass a larger
# collection to remove_stopwords() when more words need to be filtered.
STOPWORDS = {"for", "a", "the", "and", "in", "of", "with"}

def remove_stopwords(tokens, stopwords=None):
    """Return the tokens that are not stopwords, preserving their order.

    Args:
        tokens: Iterable of token strings. Tokens are compared exactly as
            given, so they should already be lowercased to match the
            lowercase entries of ``STOPWORDS``.
        stopwords: Optional collection of words to drop. When omitted, the
            module-level ``STOPWORDS`` set is used (looked up at call time).

    Returns:
        A new list with every stopword removed; the input is not modified.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    else:
        # Normalize to a set so membership tests stay O(1) for any input type.
        stopwords = frozenset(stopwords)
    return [t for t in tokens if t not in stopwords]

# Filter the token list through the stopword set and show what remains.
tokens_nostop = remove_stopwords(tokens)
print("Without stopwords:", tokens_nostop)


Without stopwords: ['looking', 'react', 'native', 'developer', 'firebase', 'skills']


In [16]:
# Stemming: reduce each token to its stem with NLTK's Porter stemmer
import nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Map the stemmer over the stopword-free tokens, producing one stem per token.
stems = list(map(ps.stem, tokens_nostop))
print("Stemmed tokens:", stems)


Stemmed tokens: ['look', 'react', 'nativ', 'develop', 'firebas', 'skill']


In [20]:
import spacy
# Load the small English pipeline once; lemmatize_tokens() reuses it on every call.
nlp = spacy.load("en_core_web_sm")


def lemmatize_tokens(tokens):
    """Lemmatize already-tokenized words with the loaded spaCy pipeline.

    The tokens are wrapped in a pre-tokenized ``Doc`` rather than being joined
    into a string and parsed again. Re-parsing hands the text back to spaCy's
    own tokenizer, which may split words a second time (its English rules
    break "cannot" into "can" + "not", for example), so the output could end
    up with more items than the input. Building the ``Doc`` from ``words``
    keeps exactly one lemma per input token.

    Args:
        tokens: Iterable of non-empty token strings.

    Returns:
        A list of lemma strings in the same order as ``tokens``.
    """
    # Local import keeps this cell self-contained; spaCy is already loaded.
    from spacy.tokens import Doc

    words = list(tokens)
    if not words:
        return []
    # Skip the tokenizer: the Doc is created directly from the given words.
    doc = Doc(nlp.vocab, words=words)
    # Apply each enabled pipeline component in order, as nlp(text) would do
    # after tokenizing.
    for _name, component in nlp.pipeline:
        doc = component(doc)
    return [token.lemma_ for token in doc]

# Lemmatize the stopword-free tokens and print the result.
lemmas = lemmatize_tokens(tokens_nostop)
print("Lemmatized tokens:", lemmas)


Lemmatized tokens: ['look', 'react', 'native', 'developer', 'firebase', 'skill']


In [21]:
import csv, json

# Build the export record from the values computed by the earlier cells
# instead of retyping them, so the saved files cannot drift from the pipeline.
processed = {
    "id": dataset[0]["id"],
    "original": dataset[0]["text"],
    "final_tokens": list(lemmas),
}

# Saving to CSV: one header row plus one data row. The token list is stored
# as a JSON string so it fits in a single cell.
csv_path = "processed_skills_single.csv"
# newline="" lets the csv module control line endings itself.
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "original", "final_tokens"])
    writer.writerow([
        processed["id"],
        processed["original"],
        json.dumps(processed["final_tokens"], ensure_ascii=False),
    ])

# Saving to JSON: the same record, with the token list kept as a real array.
json_path = "processed_skills_single.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(processed, f, ensure_ascii=False, indent=2)

print(f"Saved files: {csv_path}, {json_path}")


Saved files: processed_skills_single.csv, processed_skills_single.json
