In [1]:
import pandas as pd

In [2]:
# Load the dataset
df = pd.read_csv('./archive/data_job_posts.csv')

In [3]:
df.head()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   jobpost           19001 non-null  object
 1   date              19001 non-null  object
 2   Title             18973 non-null  object
 3   Company           18994 non-null  object
 4   AnnouncementCode  1208 non-null   object
 5   Term              7676 non-null   object
 6   Eligibility       4930 non-null   object
 7   Audience          640 non-null    object
 8   StartDate         9675 non-null   object
 9   Duration          10798 non-null  object
 10  Location          18969 non-null  object
 11  JobDescription    15109 non-null  object
 12  JobRequirment     16479 non-null  object
 13  RequiredQual      18517 non-null  object
 14  Salary            9622 non-null   object
 15  ApplicationP      18941 non-null  object
 16  OpeningDate       18295 non-null  object
 17  Deadline    

In [5]:
# Filling NaN values
columns_to_fill = ['JobDescription', 'JobRequirment', 'RequiredQual', 'Title', 'jobpost']
for column in columns_to_fill:
    df[column] = df[column].fillna('')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   jobpost           19001 non-null  object
 1   date              19001 non-null  object
 2   Title             19001 non-null  object
 3   Company           18994 non-null  object
 4   AnnouncementCode  1208 non-null   object
 5   Term              7676 non-null   object
 6   Eligibility       4930 non-null   object
 7   Audience          640 non-null    object
 8   StartDate         9675 non-null   object
 9   Duration          10798 non-null  object
 10  Location          18969 non-null  object
 11  JobDescription    19001 non-null  object
 12  JobRequirment     19001 non-null  object
 13  RequiredQual      19001 non-null  object
 14  Salary            9622 non-null   object
 15  ApplicationP      18941 non-null  object
 16  OpeningDate       18295 non-null  object
 17  Deadline    

In [7]:
# Combining important columns
df['Job'] = (df['Title'] + ' ' + df['jobpost'] + ' ' + df['JobDescription'] + ' ' + df['JobRequirment'] + ' ' + df['RequiredQual'])

In [8]:
df['Job'][0]

"Chief Financial Officer AMERIA Investment Consulting Company\r\nJOB TITLE:  Chief Financial Officer\r\nPOSITION LOCATION: Yerevan, Armenia\r\nJOB DESCRIPTION:   AMERIA Investment Consulting Company is seeking a\r\nChief Financial Officer. This position manages the company's fiscal and\r\nadministrative functions, provides highly responsible and technically\r\ncomplex staff assistance to the Executive Director. The work performed\r\nrequires a high level of technical proficiency in financial management\r\nand investment management, as well as management, supervisory, and\r\nadministrative skills.\r\nJOB RESPONSIBILITIES:  \r\n- Supervises financial management and administrative staff, including\r\nassigning responsibilities, reviewing employees' work processes and\r\nproducts, counseling employees, giving performance evaluations, and\r\nrecommending disciplinary action;\r\n- Serves as member of management team participating in both strategic\r\nand operational planning for the company;

# Preprocess 

In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords from NLTK
nltk.download('stopwords') # Stop words like 'the', 'and', and 'I', because they don't provide meaniful information
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jriba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jriba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
def preprocess_text(text: str) -> str:
    """
    Preprocesses the input text by cleaning and normalizing it. This includes removing carriage returns,
    new lines, punctuation, and numbers, converting to lowercase, tokenizing, and removing stopwords and non-alphabetic
    characters.

    Args:
        text (str): A string containing the text to be processed.

    Returns:
        str: The processed text, which is cleaned, tokenized, and stripped of stopwords and non-alphabetic
        characters, concatenated back into a single string.
    """

    # remove \r and \n
    text = re.sub(r'[\r\n]+', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)

    # tokenize
    words = word_tokenize(text)

    # Remove stopwords and non-alphabetic characters
    words = [word for word in words if word.isalpha() and word not in stop_words]

    return ' '.join(words)

In [11]:
# Example outcome
preprocess_text(df['Job'][0])

'chief financial officer ameria investment consulting company job title chief financial officer position location yerevan armenia job description ameria investment consulting company seeking chief financial officer position manages companys fiscal administrative functions provides highly responsible technically complex staff assistance executive director work performed requires high level technical proficiency financial management investment management well management supervisory administrative skills job responsibilities supervises financial management administrative staff including assigning responsibilities reviewing employees work processes products counseling employees giving performance evaluations recommending disciplinary action serves member management team participating strategic operational planning company directs oversees companys financial management activities including establishing monitoring internal controls managing cash investments managing investment portfolio coll

In [12]:
df['processed_text'] = df['Job'].apply(preprocess_text)

In [13]:
df['processed_text'][100]

'political assistant steps american embassy yerevan announcement number job title political assistant steps work hours hoursweek note applicants family members usg employees officially assigned post chief mission authority must residing country required work andor residency permits eligible consideration us embassy yerevan armenia seeking individual position political assistant polecon section job description provides research reporting advisory related services broad scope sensitivity field political reporting follows issues involving armenian political events well human rights refugees national minorities women children rights obtains information prepares factual analytical reports copy complete position description listing duties responsibilities available human resources office contact number required qualifications note applicants instructed address selection criterion detailed specific comprehensive information supporting criteria university degree equivalent strong background hi

# Extract keywords

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
def extract_keywords(texts: pd.Series, num_keywords: int=10) -> list:
    """
    Extracts keywords from a pandas Series containing texts using the TF-IDF (Term Frequency - Inverse Document Frequency) method.
    This function initialized a TF-IDF vectorizer, fits it to the provided texts, and extracts the top features (keywords)
    based on their TF-IDF scores.

    Args:
        texts (pd.Series): A pandas Series of preprocessed text documents from which to extract keywords.
        num_keywords (int): The number of top keywords to extract from each document. Defaults to 10.

    Returns:
        list: A list of the top `num_keywords` extracted as the most relevant keywords from the texts.
    """

    # Initialize a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=num_keywords, stop_words='english')

    # Fit the model and transform the data
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

    # Extract the features names, which are the keywords
    feature_names = tfidf_vectorizer.get_feature_names_out()

    return feature_names.tolist()

In [16]:
# Testing out outcome
keywords = extract_keywords(df['processed_text'], 10)
print(keywords)

['ability', 'application', 'armenia', 'development', 'experience', 'job', 'knowledge', 'skills', 'website', 'work']


In [17]:
lst = []
lst.append(df['processed_text'][0])
lst

['chief financial officer ameria investment consulting company job title chief financial officer position location yerevan armenia job description ameria investment consulting company seeking chief financial officer position manages companys fiscal administrative functions provides highly responsible technically complex staff assistance executive director work performed requires high level technical proficiency financial management investment management well management supervisory administrative skills job responsibilities supervises financial management administrative staff including assigning responsibilities reviewing employees work processes products counseling employees giving performance evaluations recommending disciplinary action serves member management team participating strategic operational planning company directs oversees companys financial management activities including establishing monitoring internal controls managing cash investments managing investment portfolio col

In [18]:
keywords = extract_keywords(lst)
print(keywords)

['accounting', 'administrative', 'finance', 'financial', 'including', 'investment', 'management', 'practices', 'principles', 'skills']
