In [1]:
import pandas as pd
import numpy
import spacy
import re
from collections import Counter

In [2]:
df = pd.read_csv("data job posts.csv")

In [3]:
df.head()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True


In [4]:
df.drop(columns=['AnnouncementCode', 'Audience', 'Notes', 'Attach', 'Term','Eligibility', 'IT','StartDate', 'Duration', 'Deadline', 'OpeningDate', 'ApplicationP', 'AboutC', 'Year', 'Month', 'Salary', 'date'], inplace=True)
df.drop_duplicates(inplace=True)
df = df.dropna(subset=['Title', 'JobDescription', 'RequiredQual'])
df.reset_index(drop=True, inplace=True)

In [5]:
df.head(10)
df.columns

Index(['jobpost', 'Title', 'Company', 'Location', 'JobDescription',
       'JobRequirment', 'RequiredQual'],
      dtype='object')

In [6]:
df["CleanText"] = (
    df["Title"].astype(str) + " " +
    df["JobDescription"].astype(str) + " " +
    df["JobRequirment"].astype(str) + " " +
    df["RequiredQual"].astype(str)
)

In [7]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', ' ', text)     # remove HTML tags
    text = re.sub(r'[^a-z0-9 ]', ' ', text) # remove special chars
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop])

for col in ["CleanText"]:
    df[col] = df[col].fillna('').apply(clean_text).apply(lemmatize_text)


In [9]:
df.head()

Unnamed: 0,jobpost,Title,Company,Location,JobDescription,JobRequirment,RequiredQual,CleanText
0,AMERIA Investment Consulting Company\r\nJOB TI...,Chief Financial Officer,AMERIA Investment Consulting Company,"Yerevan, Armenia",AMERIA Investment Consulting Company is seekin...,- Supervises financial management and administ...,"To perform this job successfully, an\r\nindivi...",chief financial officer ameria investment cons...
1,Caucasus Environmental NGO Network (CENN)\r\nJ...,Country Coordinator,Caucasus Environmental NGO Network (CENN),"Yerevan, Armenia",Public outreach and strengthening of a growing...,- Working with the Country Director to provide...,"- Degree in environmentally related field, or ...",country coordinator public outreach strengthen...
2,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,BCC Specialist,Manoff Group,"Manila, Philippines",The LEAD (Local Enhancement and Development fo...,- Identify gaps in knowledge and overseeing in...,"- Advanced degree in public health, social sci...",bcc specialist lead local enhancement developm...
3,"Boutique ""Appollo""\r\nJOB TITLE: Saleswoman\r...",Saleswoman,"Boutique ""Appollo""","Yerevan, Armenia\r\nWORK HOURS: 10:00 - 20:00;...",Saleswoman will sell menswear and accessories.,,"- Candidates should be female, 20-30 years old...",saleswoman saleswoman sell menswear accessory ...
4,OSI Assistance Foundation - Armenian Branch Of...,Chief Accountant/ Finance Assistant,OSI Assistance Foundation - Armenian Branch Of...,"Yerevan, Armenia",The Armenian Branch Office of the Open Society...,,- University degree in finance/ accounting; \r...,chief accountant finance assistant armenian br...


In [10]:
df.isnull().sum()

jobpost              0
Title                0
Company              0
Location            12
JobDescription       0
JobRequirment     1948
RequiredQual         0
CleanText            0
dtype: int64

In [12]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [13]:
df["word_count"] = df["CleanText"].apply(lambda x: len(x.split()))
print("Average words per posting:", df["word_count"].mean())
print("Max words in a posting:", df["word_count"].max())
print("Min words in a posting:", df["word_count"].min())

Average words per posting: 140.42578670747122
Max words in a posting: 1111
Min words in a posting: 10


In [14]:
all_words = " ".join(df["CleanText"])
word_freq = Counter(all_words.split())
print("Top 20 most common words:", word_freq.most_common(20))

Top 20 most common words: [('work', 32320), ('knowledge', 31326), ('experience', 29790), ('skill', 26507), ('ability', 20394), ('project', 19105), ('development', 17089), ('language', 16884), ('team', 14240), ('good', 14056), ('management', 13872), ('english', 13788), ('year', 12214), ('excellent', 12071), ('develop', 11686), ('communication', 11333), ('report', 11160), ('software', 10831), ('provide', 10725), ('armenian', 10488)]
