In [156]:
import pandas as pd
from sklearn import preprocessing
import nltk
from nltk.corpus import stopwords
import string

In [157]:
data = pd.read_csv('listings_unitedstates_cleaned.csv')

In [158]:
data = data.drop(data.columns[[0]], axis = 1)

## Removing punctions

In [159]:
def remove_punctuation(text):
    # Replacing the punctuations with no space, which in effect deletes the punctuation marks 
    translator = str.maketrans('', '', string.punctuation)
    # Return the text stripped of punctuation marks
    return text.translate(translator)

In [160]:
data['Description'] = data['Description'].apply(remove_punctuation)

In [161]:
data['Description'][0]

'Our Customer’s mission is to create groundbreaking sport innovations by making their products more sustainable building a creative and diverse global team and making a positive impact in communities where we live and work Their purpose is to bring inspiration and innovation to unite the world through sport to create a healthy planet active communities and an equal playing field for all  We are seeking an experienced Data Scientist on a contract basis to join our Customer’s Supply Chain organization  What you’ll do  Uses advanced mathematical and statistical concepts and theories to analyze and collect data and construct solutions to business problems Performs complex statistical analysis on experimental or business data to validate and quantify trends or patterns identified by business analysts Constructs predictive models algorithms and probability engines to support data analysis or product functions verifies model and algorithm effectiveness based on realworld results Integrates st

## Tokenize words

In [162]:
corpus = (" ".join(data['Description'].tolist())).lower()

In [163]:
tokenized_corpus = nltk.word_tokenize(corpus)

## Stopwords

A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.


In [164]:
stop = stopwords.words('english')

In [165]:
custom_stop_words = ["analysis","data","science","complex","environment",
                     "preferred","qualifications","required","development","design",
                     "relevant","develop","tools","including","ability","business",
                     "scientist","quantitative","related",'’', 'analytics', 'new', 'year',
                    'solution', 'technology', 'working', 'strong', 'using', 'problem',
                    'role', 'us', 'company', 'modeling', 'help']

for w in custom_stop_words:
    if w not in stop:
        stop.append(w)
    

In [166]:
tokens_no_stopword = [word.lower() for word in tokenized_corpus if word.lower() not in stop]

## Lemmatization
Lemmatization tries to achieve a similar base “stem” for a word. However, what makes it different is that it finds the dictionary word instead of truncating the original word.

In [167]:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()

lemmatized_tokens = [lmtzr.lemmatize(token) for token in tokens_no_stopword]

## Word frequencies

In [168]:
fd = nltk.FreqDist(lemmatized_tokens)

# get the top words
top_words = []
for key, value in fd.items():
    top_words.append((key, value))

# sort the list by the top frequencies
top_words = sorted(top_words, key = lambda x:x[1], reverse = True)

In [169]:
top_words_20 = top_words[:20]
top_words_20 

[('experience', 4684),
 ('team', 3015),
 ('work', 2571),
 ('learning', 2244),
 ('model', 1875),
 ('machine', 1697),
 ('product', 1671),
 ('skill', 1669),
 ('year', 1308),
 ('solution', 1268),
 ('opportunity', 1258),
 ('customer', 1213),
 ('statistical', 1167),
 ('research', 1061),
 ('engineering', 1009),
 ('support', 963),
 ('knowledge', 957),
 ('project', 943),
 ('technique', 939),
 ('python', 931)]