Importing Libraries along with our Data
Expanding Contractions
Language Detection
Tokenization
Converting all Characters to Lowercase
Removing Punctuation
Removing Stopwords
Parts of Speech Tagging
Lemmatization

In [None]:
Importing the Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import fasttext
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
plt.xticks(rotation=70)
pd.options.mode.chained_assignment = None
pd.set_option('display.max_colwidth', 100)
%matplotlib inline

Importing our Data

In [None]:
# Load the scraped reviews into a DataFrame.
# The `with` block closes the file handle when it exits, so the explicit
# `f.close()` that used to follow it was redundant and has been dropped.
with open('indeed_scrape.csv') as f:
    df = pd.read_csv(f)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the scrape —
# confirm against the CSV). Re-binding instead of `inplace=True`, together with
# `errors='ignore'`, keeps this cell safe to re-run: the original raised KeyError
# on a second execution because the column was already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Report how many missing values each column contains, one "name count" pair per line.
null_counts = df.isnull().sum()
for column_name, n_missing in null_counts.items():
    print(column_name, n_missing)

In [None]:
# Work on just the rating and its text description from here on.
review_columns = ['rating', 'rating_description']
rws = df.loc[:, review_columns]

In [None]:
def _expand_contractions(text):
    """Split `text` on whitespace and run `contractions.fix` on every piece."""
    return [contractions.fix(piece) for piece in text.split()]


# One list of contraction-expanded pieces per review.
rws['no_contract'] = rws['rating_description'].apply(_expand_contractions)
rws.head()

In [None]:
# Glue each review's expanded pieces back into one space-separated string.
joined_reviews = []
for pieces in rws['no_contract']:
    joined_reviews.append(' '.join(str(piece) for piece in pieces))
rws['rating_description_str'] = joined_reviews
rws.head()

In [None]:
English Language Detection

In [None]:
# Detect the language of every review with fastText's pretrained language-ID model,
# then keep only the English reviews.
pretrained_model = "lid.176.bin"
model = fasttext.load_model(pretrained_model)

label_prefix = '__label__'
langs = []
for sent in rws['rating_description_str']:
    # predict() returns (labels, probabilities); labels look like '__label__en'.
    labels = model.predict(sent)[0]
    # Strip the prefix from the label itself instead of slicing the tuple's repr
    # at fixed offsets, which silently truncated 3-letter codes (e.g. 'ceb' -> 'ce').
    # An empty prediction maps to '' exactly as the old slicing did.
    top_label = labels[0] if len(labels) else ''
    if top_label.startswith(label_prefix):
        top_label = top_label[len(label_prefix):]
    langs.append(top_label)
rws['langs'] = langs

# The rest of the notebook assumes non-English reviews are gone, but nothing
# actually removed them. Filter here; `.copy()` gives later column assignments
# an independent frame to write to. The original row index is kept.
rws = rws.loc[rws['langs'] == 'en'].copy()

In [None]:
Now all we have to do is remove any non-English reviews, i.e. keep only the rows whose `langs` value is `en`.

Tokenization
Now that we have removed any non-English reviews, let's apply our tokenizer to split each review into individual word tokens. We will apply NLTK's `word_tokenize()` function to the "rating_description_str" column and create a new column named "tokenized".

In [None]:
# Break every review string into a list of tokens with NLTK's word tokenizer.
rws['tokenized'] = [word_tokenize(review) for review in rws['rating_description_str']]
rws.head()


In [1]:
# Converting all Characters to Lowercase

In [None]:
def _lowercase_tokens(tokens):
    """Return a new list with every token lower-cased."""
    return [token.lower() for token in tokens]


rws['lower'] = rws['tokenized'].apply(_lowercase_tokens)
rws.head()

In [None]:
# Removing Punctuation

In [None]:
# Drop tokens that consist solely of punctuation characters.
punc = string.punctuation
punc_chars = set(punc)


def _is_punctuation(token):
    """Return True when every character of `token` is in `string.punctuation`.

    The previous test, `word not in punc`, was a substring check against the
    punctuation string, so multi-character tokens such as '...', '--', '!!',
    '``' or "''" were kept. Checking character by character removes those too.
    An empty token still counts as punctuation, as it did before.
    """
    return all(char in punc_chars for char in token)


rws['no_punc'] = rws['lower'].apply(lambda x: [word for word in x if not _is_punctuation(word)])
rws.head()

In [None]:
# Removing stopwords: filter out every token found in NLTK's English stopword list.
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in `stop_words`, in their original order."""
    return [token for token in tokens if token not in stop_words]


rws['stopwords_removed'] = rws['no_punc'].apply(_without_stopwords)
rws.head()

In [None]:
# Part-of-speech tagging: pair every remaining token with the tag NLTK assigns it.
# The lemmatization step further down consumes these tags.
rws['pos_tags'] = [nltk.tag.pos_tag(tokens) for tokens in rws['stopwords_removed']]
rws.head()

In [None]:
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to `wordnet.ADJ`,
    `wordnet.VERB`, `wordnet.NOUN` and `wordnet.ADV` respectively.
    Any other tag falls back to `wordnet.NOUN`.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
def _to_wordnet_pairs(tagged_tokens):
    """Swap the POS tag of each (token, tag) pair for its WordNet equivalent."""
    return [(token, get_wordnet_pos(pos_label)) for token, pos_label in tagged_tokens]


rws['wordnet_pos'] = rws['pos_tags'].apply(_to_wordnet_pairs)
rws.head()

In [None]:
# Lemmatize every token with NLTK's WordNet lemmatizer. `lemmatize` is given two
# arguments: the word itself and its part of speech in WordNet form.
wnl = WordNetLemmatizer()
lemmatized_rows = []
for tagged_pairs in rws['wordnet_pos']:
    lemmatized_rows.append([wnl.lemmatize(token, wn_tag) for token, wn_tag in tagged_pairs])
rws['lemmatized'] = lemmatized_rows
rws.head()

In [None]:

# Lastly, save the processed reviews to a CSV file for the follow-up exploratory
# data analysis. The row index is written as well (to_csv's default).
clean_csv_path = 'indeed_scrape_clean.csv'
rws.to_csv(clean_csv_path)