# Shortlister:
## Importing Necessary Dependencies:

In [68]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pandas as pd

## OpenAI API setup [temporary]:

In [10]:
os.environ['OPENAI_API_KEY'] = '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'

## 1. Importing Data from *.pdf*:
- using langchain document_loaders and openai integration.

In [7]:
from langchain.document_loaders import PyMuPDFLoader


In [11]:
cv_file_path = os.path.join('sample_resume_shortlister', os.listdir('sample_resume_shortlister')[0])
cv_file_path

'sample_resume_shortlister\\00.pdf'

In [12]:
loader = PyMuPDFLoader(cv_file_path)
loader

<langchain.document_loaders.pdf.PyMuPDFLoader at 0x18d2f0f1870>

In [20]:
documents = loader.load()
documents[1] # list indexing depends on the pages present in the file

Document(page_content='Skills Developed: AIDL, Machine Learning, Deep Learning, Flask,\nComputer Vision\nPosition of Responsibility\nPR Head of Campus Life at my University\nLink\n• I used to handle marketing stuffs of non-technical events at\nuniversity levels as well as to bring up the sponsors that can fund\nthe events\n• Campus Life is the community that handles every non-technical\nand cultural events at my university\nTechnical Head at Student Community\n[TechWiz]\nLink\n• I have conducted many Technical events in my university at huge\nlevel. Also have conducted mini-hackathon named as\n"PAIR-A-THON" at my University\'s Technical Fest\nTechnical member of Google Developers\nStudents Club\nLink\n• I have helped GDSC community in many technical aspects\n• Also conducted Technical Event : ASJ 2022 (Android Study Jams\n2021)\n• I have 1 year of working experience in this community\n• Also was active content writer and blog writer\nAchievements/ Extracurricular Activities\n• HackRx 4

langchain.schema.document.Document

## 2. Data Extraction from documents:

In [42]:
documents[1].page_content

'Skills Developed: AIDL, Machine Learning, Deep Learning, Flask,\nComputer Vision\nPosition of Responsibility\nPR Head of Campus Life at my University\nLink\n• I used to handle marketing stuffs of non-technical events at\nuniversity levels as well as to bring up the sponsors that can fund\nthe events\n• Campus Life is the community that handles every non-technical\nand cultural events at my university\nTechnical Head at Student Community\n[TechWiz]\nLink\n• I have conducted many Technical events in my university at huge\nlevel. Also have conducted mini-hackathon named as\n"PAIR-A-THON" at my University\'s Technical Fest\nTechnical member of Google Developers\nStudents Club\nLink\n• I have helped GDSC community in many technical aspects\n• Also conducted Technical Event : ASJ 2022 (Android Study Jams\n2021)\n• I have 1 year of working experience in this community\n• Also was active content writer and blog writer\nAchievements/ Extracurricular Activities\n• HackRx 4.0 Finalists [Hackatho

In [43]:
# Data Extraction:::
data = [doc.page_content for doc in documents]

## 3. Text Cleaning:

In [45]:
## Pipeline : Text Cleaning
def clean_txt(text):
    # url remover:
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    # email addresses remover: 
    text = re.sub(r'\S*@\S*\s?', '', text)

    # Special Character remover:
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # lowercase conversion:
    text = text.lower().strip()

    return text

In [49]:
data[1]

'Skills Developed: AIDL, Machine Learning, Deep Learning, Flask,\nComputer Vision\nPosition of Responsibility\nPR Head of Campus Life at my University\nLink\n• I used to handle marketing stuffs of non-technical events at\nuniversity levels as well as to bring up the sponsors that can fund\nthe events\n• Campus Life is the community that handles every non-technical\nand cultural events at my university\nTechnical Head at Student Community\n[TechWiz]\nLink\n• I have conducted many Technical events in my university at huge\nlevel. Also have conducted mini-hackathon named as\n"PAIR-A-THON" at my University\'s Technical Fest\nTechnical member of Google Developers\nStudents Club\nLink\n• I have helped GDSC community in many technical aspects\n• Also conducted Technical Event : ASJ 2022 (Android Study Jams\n2021)\n• I have 1 year of working experience in this community\n• Also was active content writer and blog writer\nAchievements/ Extracurricular Activities\n• HackRx 4.0 Finalists [Hackatho

In [48]:
clean_txt(data[1])

'skills developed aidl machine learning deep learning flask\ncomputer vision\nposition of responsibility\npr head of campus life at my university\nlink\n i used to handle marketing stuffs of nontechnical events at\nuniversity levels as well as to bring up the sponsors that can fund\nthe events\n campus life is the community that handles every nontechnical\nand cultural events at my university\ntechnical head at student community\ntechwiz\nlink\n i have conducted many technical events in my university at huge\nlevel also have conducted minihackathon named as\npairathon at my universitys technical fest\ntechnical member of google developers\nstudents club\nlink\n i have helped gdsc community in many technical aspects\n also conducted technical event  asj  android study jams\n\n i have  year of working experience in this community\n also was active content writer and blog writer\nachievements extracurricular activities\n hackrx  finalists hackathon by bajaj finserv\n winner of hack srm  h

In [51]:
# Cleaning data:
cleaned_data = [clean_txt(text) for text in data]
# cleaned_data

---

## 4. Tokenization:

- Generation of tokens from cleaned text data

In [53]:
tokenized_data = [nltk.word_tokenize(text) for text in cleaned_data]
# tokenized_data

---

## 5. Stopwords Removal:

In [55]:
stop_words = set(stopwords.words('english'))
# stop_words

In [57]:
filtered_data = [[word for word in tokens if word not in stop_words] for tokens in tokenized_data]
# filtered_data

In [84]:
# filtered_data

---

## 6. Lemmatization:

In [58]:
lemmatizer = WordNetLemmatizer()
lemmatized_data = [[lemmatizer.lemmatize(word) for word in tokens] for tokens in filtered_data]

In [85]:
# lemmatized_data

---
## 7. Joining Tokens back for text formation:

In [62]:
preprocessed_data = [" ".join(tokens) for tokens in lemmatized_data]
preprocessed_data[1]

'skill developed aidl machine learning deep learning flask computer vision position responsibility pr head campus life university link used handle marketing stuff nontechnical event university level well bring sponsor fund event campus life community handle every nontechnical cultural event university technical head student community techwiz link conducted many technical event university huge level also conducted minihackathon named pairathon university technical fest technical member google developer student club link helped gdsc community many technical aspect also conducted technical event asj android study jam year working experience community also active content writer blog writer achievement extracurricular activity hackrx finalist hackathon bajaj finserv winner hack srm hackathon project voteflicks award title best blockchain microsoft ai classroom series certification awarded completion complete javascript course srm axis intellect may certificate link hackrx finalist participa

---
## 8. Vectorization:

In [63]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_data)

In [92]:
print(X)

  (0, 152)	0.03676605324097573
  (0, 116)	0.03676605324097573
  (0, 138)	0.026159349690626558
  (0, 29)	0.026159349690626558
  (0, 82)	0.03676605324097573
  (0, 186)	0.03676605324097573
  (0, 183)	0.03676605324097573
  (0, 40)	0.026159349690626558
  (0, 95)	0.03676605324097573
  (0, 16)	0.03676605324097573
  (0, 58)	0.03676605324097573
  (0, 55)	0.03676605324097573
  (0, 205)	0.026159349690626558
  (0, 23)	0.03676605324097573
  (0, 143)	0.07353210648195146
  (0, 119)	0.052318699381253116
  (0, 74)	0.03676605324097573
  (0, 22)	0.03676605324097573
  (0, 171)	0.03676605324097573
  (0, 177)	0.03676605324097573
  (0, 24)	0.03676605324097573
  (0, 215)	0.052318699381253116
  (0, 53)	0.052318699381253116
  (0, 162)	0.052318699381253116
  (0, 151)	0.10463739876250623
  :	:
  (1, 215)	0.044066644158638237
  (1, 53)	0.044066644158638237
  (1, 162)	0.044066644158638237
  (1, 151)	0.2203332207931912
  (1, 44)	0.08813328831727647
  (1, 68)	0.044066644158638237
  (1, 45)	0.044066644158638237
  (1, 

---