In [65]:
import hashlib # for grading
import json

# Standard imports
import numpy as np
import pandas as pd
import re
import string
import math

# NLTK imports
import nltk
nltk.download('stopwords')
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# SKLearn related imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gugaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Exercise 1 - Price of cars (regex)
For the first question, you will be using regex to extract information from the `cars.txt` dataset. In this dataset, you'll find a list of cars that have been sold, as well as their brand, model and selling price.

Start by loading the data into a list. The list items are the lines of the file.

In [66]:
path = "data/cars.txt"
cars = []
with open(path, 'r', encoding='utf-8') as f:
    cars = [l.strip() for l in f.readlines()]

In [67]:
cars[:10]

['FORD -- Focus -- 19757',
 'LEXUS -- CT 200h -- 392',
 'JEEP -- Compass -- 22269',
 'CHEVROLET -- Captiva -- 7527',
 'SSANGYONG -- REXTON -- 34419',
 'TOYOTA -- Prius -- 251',
 'BMW -- 535 -- 23521',
 'BMW -- 328 -- 13485',
 'HYUNDAI -- Sonata -- 2901',
 'CHEVROLET -- Volt -- 3293']

In the first item, for example, `FORD` is the brand name, `Focus` is the model, and `19757` is the price.

### Exercise 1.1 - Find Toyota cars

First, we want to see which `TOYOTA` models have been sold. Find the items in the `cars` list which correspond to cars of the `TOYOTA` brand. Put these items into a list called `ans_1_1`.

In [68]:
# ans_1_1 = ...

# YOUR CODE HERE
ans_1_1 = re.findall(r"TOYOTA -- [\w\s]+ -- \d+", str(cars))
#First, we want to see which `TOYOTA` models have been sold. Find the items in the `cars` list which correspond to cars of the `TOYOTA` brand. Put these items into a list called `ans_1_1`.

In [69]:
assert isinstance(ans_1_1,list)
assert len(ans_1_1) == 111
assert hashlib.sha256(json.dumps(''.join(sorted(ans_1_1))).encode()).hexdigest() == \
'292f64bf78d500d2bb5dc13ca282edb571f1589b47ce0353f030de70c630608c', 'Not correct, try again.'

### Exercise 1.2 - Find models with numbers

Next, find the items in the `cars` list whose model is a set of numbers instead of characters. For example, `'BMW -- 535 -- 23521'`. Store these items in the list called `ans_1_2`.

In [70]:
# ans_1_2 = ...

# YOUR CODE HERE
ans_1_2 = []
pattern = re.compile(r'-- (\d+) --')

for car in cars:

    match = pattern.search(car)
    if match:
        ans_1_2.append(car)


In [71]:
assert isinstance(ans_1_2,list)
assert len(ans_1_2) == 73
assert hashlib.sha256(json.dumps(''.join(sorted(ans_1_2))).encode()).hexdigest() == \
'cdee14b87d092c0510a1cb9af05e8b9e96fc6a1c81489f7fd235e4a24e6b9119', 'Not correct, try again.'

### Exercise 1.3 - Selling price bellow 1000

Finally, get the car brands and models whose selling price is below 1000.

Save the results in the list `ans_1_3`. Each element in this list should be in the format `BRAND -- MODEL`.

In [72]:
# ans_1_3 = ...

# YOUR CODE HERE
ans_1_3 = []
a = re.findall(r'([A-Za-z\-]+ -- [\w\s\-]+) -- (\d+)', str(cars))
for car, price in a:
    if int(price) < 1000:
        ans_1_3.append(str(car))


In [73]:
assert len(ans_1_3) == 77
assert hashlib.sha256(json.dumps(''.join(sorted(ans_1_3))).encode()).hexdigest() == \
'4d1ef7339260bc7b24f98c0ea6031b4a6fff0f7a9e0100dfde1e17dbbe1308f9', 'Not correct, try again.'

## Exercise 2 - Job postings
The challenge of this exercise notebook is to classify job postings as 'Fake' or 'Real'. In this exercise, we'll be preprocessing the data.

Let's load the data:

In [74]:
df = pd.read_csv('data/job_postings.csv', index_col=0).convert_dtypes()

X = df['description']
y = df['fraudulent']

In [75]:
df.head()

Unnamed: 0,description,fraudulent
2395,"Marketeer  GR, I, Piraeus  nan  Social Medi...",0
512,"English Teacher Abroad US, UT, Logan  nan ...",0
14812,"UX Lead  NZ, N, Masterton  nan  nan  How a...",0
12516,"Customer Service Representative  US, IA, Cor...",0
5036,"Promotional Sales Representative  US, OH, Day...",0


Let's look at an example of a job description and its corresponding label:

In [76]:
df.iloc[10]['description']

"Inside Sales\r\n US, CA, Los Angeles\r\n 50000-55000\r\n GPL Technologies is a solutions provider focused on the design, implementation, and support of high-performance information technology systems.\xa0 Founded in Los Angeles, California in 2003, we draw on over a decade of expertise as trusted technology advisers, adding value for our customers by offering unique methods of improving IT efficiency, streamlining complex systems and environments, and reducing the costs associated with acquiring and maintaining IT systems.\xa0GPL cut its teeth serving the intense requirements of customers in the media and entertainment industry. \xa0We bring that work ethic with us to every customer: time is money, deadlines are non-negotiable, and the show must go on. \xa0Our company is comprised of creative, independent thinkers with a passion for technology. \xa0We love big data, fast networks, and solving the problems posed by today's digital media production pipelines.If serving clients who think

In [77]:
df.iloc[10]['fraudulent']

np.int64(0)

Let's check the data size and distribution of classes:

In [78]:
def get_data_stats(X, y):
    print(f"Size of dataset: {len(X)}")
    unique, counts = np.unique(y, return_counts=True)
    print(f"Distribution of classes: {dict(zip(unique, counts))}")

get_data_stats(X, y)

Size of dataset: 1913
Distribution of classes: {np.int64(0): np.int64(1047), np.int64(1): np.int64(866)}


The classes are evenly distributed. We'll use a dev and test set to be able to identify overfitting and check the performance on unseen data.

**Note**: So far you've used the `train`/`val`/`test` nomenclature for naming variables related to training, validation and test sets, respectively. `dev` is short for "development" and is just another typical identifier for the validation set, and we'll use it throughout this notebook instead of `val`.

In [79]:
# train dev test split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42, stratify=y_temp)
print(f"Train size: {len(X_train)}\nDev size: {len(X_dev)}\nTest size: {len(X_test)}")

Train size: 1530
Dev size: 268
Test size: 115


The first step in the workflow is preprocessing which we'll do in this exercise.

### Exercise 2.1 - Tokenization

Implement the function `apply_tokenizer`. The function should receive a pandas series of text data like `X_train` and an NLTK-style tokenizer. It should return the series with the tokenized text. The tokens of each item in the series should be joined into one string with spaces in between like in the example in the docstring.

In [80]:
def apply_tokenizer(data, tokenizer):
    """
    Tokenizes text data in the provided series using the provided tokenizer.
    
    E.g. for text data  "This is a test! No, it can't be"
         it returns "This is a test ! No , it can ' t be"
    
    Args:
    data - pd.Series with text data
    tokenizer - nltk tokenizer
    """
    
    # YOUR CODE HERE
    return data.apply(lambda x: ' '.join(tokenizer.tokenize(x)))

In [81]:
tokenizer = WordPunctTokenizer()
X_train_tok = apply_tokenizer(X_train, tokenizer)

assert isinstance(X_train_tok, pd.Series), 'The function should return a pandas series.'
assert len(X_train_tok) == 1530, 'The length of the series is not correct.'
assert isinstance(X_train_tok.iloc[0],str), 'The items of the series should be strings.'
assert hashlib.sha256(json.dumps(''.join([i for i in X_train_tok])).encode()).hexdigest() == \
'0e4a3d7fdab35e43079953d4d1328450b8b36454083f28966e3026e813eb3e3f', 'Not correct, try again.'

### Exercise 2.2 - Lowercasing

In the second step, implement a function that will lowercase the data.  It should take and return a pandas series.

In [82]:
def apply_lowercase(data):
    """
    Lowercases the text data in the provided pandas series.
    
    Args:
    data - pd.Series
    """

    # YOUR CODE HERE
    return data.str.lower()

In [83]:
X_train_tok_lc = apply_lowercase(X_train_tok)

assert isinstance(X_train_tok_lc, pd.Series), 'The output should be a pandas series.'
assert len(X_train_tok_lc) == 1530, 'The length of the output is not correct.'
assert isinstance(X_train_tok_lc.iloc[0],str), 'The items of the series should be strings.'
assert hashlib.sha256(json.dumps(''.join([i for i in X_train_tok_lc])).encode()).hexdigest() == \
'bd90f00143ed79470e7c4000aa5a800a0c0f68fe0e661eb5ee126fc0d2bb38ac', 'Not correct, try again.'

### Exercise 2.3 - Stopwords

Now implement a function that filters the stopwords from the text data. The function should take and return a pandas series. We will use NLTK's built-in English stopword list shown below.

In [84]:
stopword_list = stopwords.words('english')

In [85]:
def apply_filter_stopwords(data, stopword_list):
    """
    Removes stopwords from the provided pandas series with text data.
    
    Args:
    data - pd.Series
    stopword_list - list of stopwords to filter out
    """
    
    # YOUR CODE HERE
    return data.apply(lambda x: ' '.join([word for word in x.split() if word not in stopword_list]))

In [86]:
X_train_tok_lc_nosw = apply_filter_stopwords(X_train_tok_lc, stopword_list)

assert isinstance (X_train_tok_lc_nosw, pd.Series), 'The output should be a pandas series.'
assert len(X_train_tok_lc_nosw) == 1530, 'The length of the output is not correct.'
assert isinstance(X_train_tok_lc_nosw.iloc[0],str), 'The items of the series should be strings.'
assert hashlib.sha256(json.dumps(''.join([i for i in X_train_tok_lc_nosw])).encode()).hexdigest() == \
'13a80f1f4d7740b20cd3e2993ab73d37563212a8b208d3f554646a5783234e87', 'Not correct, try again.'

### Exercise 2.4 - Punctuation

After filtering the stopwords, we want to remove punctuation from the text. Consider only the punctuation characters in `string.punctuation`. Make sure to remove all punctuation and not only tokens that are single punctuation characters. 

In [106]:
def apply_filter_punct(data):
    """
    Removes punctuation from the provided pandas series with text data.
    
    Args:
    data - pandas series with text data
    """

    # YOUR CODE HERE

    remove_punct_trans = str.maketrans('', '', string.punctuation)
    
    data_filt = data.apply(lambda x: ' '.join(word.translate(remove_punct_trans) for word in x.split()))
    
    return data_filt


In [107]:
X_train_tok_lc_nosw_nopunct = apply_filter_punct(X_train_tok_lc_nosw)

**Normalize whitespaces**

Run the following function on `X_train_tok_lc_nosw_nopunct` before checking your answers to remove extra white spaces.

In [108]:
def normalize_whitespace(text):
    return re.sub(r"^\s+|\s+$|(?<=\s)\s*", "", text)

X_train_tok_lc_nosw_nopunct_norm = X_train_tok_lc_nosw_nopunct.apply(normalize_whitespace)

In [109]:
assert isinstance (X_train_tok_lc_nosw_nopunct_norm, pd.Series), 'The output should be a pandas series.'
assert len(X_train_tok_lc_nosw_nopunct_norm) == 1530, 'The length of the output is not correct.'
assert isinstance(X_train_tok_lc_nosw_nopunct_norm.iloc[0],str), 'The items of the series should be strings.'
assert hashlib.sha256(json.dumps(''.join([i for i in X_train_tok_lc_nosw_nopunct_norm])).encode()).hexdigest() == \
'a8273b9bae9dfb421af75eacc22bfd1b9c809ff36526060d756ffcbbed35dce9', 'Not correct, try again.'

### Exercise 2.5 - Stemming

The last preprocessing step that you are going to implement is stemming. Implement the function below to receive an NLTK-style stemmer and a pandas series with text data and return the series with the stemmed text.

In [110]:
def apply_stemmer(data, stemmer):
    """
    Stems the text data in the provided pandas series.
    
    Args:
    data - pd.Series
    stemmer - NLTK-style stemmer
    """

    # YOUR CODE HERE
    return data.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

In [111]:
stemmer = SnowballStemmer("english")
X_train_tok_lc_nosw_nopunct_norm_stem = apply_stemmer(X_train_tok_lc_nosw_nopunct_norm, stemmer)

assert isinstance (X_train_tok_lc_nosw_nopunct_norm_stem, pd.Series), 'The output should be a pandas series.'
assert len(X_train_tok_lc_nosw_nopunct_norm_stem) == 1530, 'The length of the output is not correct.'
assert isinstance(X_train_tok_lc_nosw_nopunct_norm_stem.iloc[0],str), 'The items of the series should be strings.'
assert hashlib.sha256(json.dumps(''.join([i for i in X_train_tok_lc_nosw_nopunct_norm_stem])).encode()).hexdigest() == \
'fb150190fd6957d636e470fb703db7a353e4d53fcdff4ef131431dac6461a895', 'Not correct, try again.'

### Exercise 2.6 - Everything together

Finally, join all the preprocessing steps from above into a transformer that applies the steps in the following order:
* tokenization
* lowercasing
* filtering stopwords
* filtering punctuation
* normalizing whitespace
* stemming.

Make use of the functions you designed above and don't forget to initialize all the necessary parameters.

In [118]:
class TextCleanerTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, tokenizer, stopwords, stemmer):
        self.tokenizer = tokenizer
        self.stopwords = stopwords
        self.stemmer = stemmer

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = apply_tokenizer(X, self.tokenizer)
        X = apply_lowercase(X)
        X = apply_filter_stopwords(X, self.stopwords)
        X = apply_filter_punct(X)
        X = X.apply(normalize_whitespace)
        X = apply_stemmer(X, self.stemmer)
        return X

In [119]:
text_cleaner = TextCleanerTransformer(
    WordPunctTokenizer(),
    stopwords=stopwords.words('english'),
    stemmer=SnowballStemmer("english"),
)

X_train_pre = text_cleaner.fit_transform(X_train)

In [120]:
assert isinstance(X_train_pre, pd.Series), 'The output should be a pandas series.'
assert len(X_train_pre) == 1530, 'The length of the output is not correct.'
assert isinstance(X_train_pre.iloc[0],str), 'The items of the series should be strings.'
assert hashlib.sha256(json.dumps(''.join([i for i in X_train_pre])).encode()).hexdigest() == \
'fb150190fd6957d636e470fb703db7a353e4d53fcdff4ef131431dac6461a895', 'Not correct, try again.'

## Exercise 3 - Text classification

We will now classify the job postings as fake or real. Let's first load the preprocessed data and check the balance of the classes.

We are loading the preprocessed csv file here. This way you won't be penalized if you did not finish exercise 2.

In [121]:
def load_dataset(file_name):
    """
    Loads a csv file and returns two pandas series, one
    the text and one containing the labels
    
    Args:
    file_name: path to input file
    """
    df = pd.read_csv(file_name, index_col = 0)

    return df['description'], df['fraudulent']

In [122]:
X_train_pre, y_train = load_dataset('data/job_postings_train_preprocessed.csv')
X_dev_pre, y_dev = load_dataset('data/job_postings_dev_preprocessed.csv')
X_test_pre, y_test = load_dataset('data/job_postings_test_preprocessed.csv')

In [123]:
get_data_stats(X_train_pre, y_train)

Size of dataset: 1530
Distribution of classes: {np.int64(0): np.int64(837), np.int64(1): np.int64(693)}


In [124]:
get_data_stats(X_dev_pre, y_dev)

Size of dataset: 268
Distribution of classes: {np.int64(0): np.int64(147), np.int64(1): np.int64(121)}


So we should be aiming for much better than 45% accuracy, which is what we would get if we naively predicted `1` (fake) for everything.

### Exercise 3.1 - Top N-grams in the BoW vectorization

First, we'll look at the top X N-grams in each category to see if anything is interesting. Implement a function that returns the most common N-grams and their count for the given label in the dataset. Use the `CountVectorizer` to create the N-grams. The function should return a list of tuples of the form `(N-gram, count)`, sorted by the `count` in descending order.

In [126]:
def top_ngrams_for_category(data, labels, filter_label, top_n=10, ngram_size=1):
    """
    Finds the top_n N-grams in the BoW for the given class label.
    
    Args:
    data: pd.Series with text data
    labels: class labels for the data
    filter_label: the label to filter the data on before getting ngrams
    top_n: top n N-grams to return
    ngram_size: the "N" in N-gram (e.g. if ngram_size=2, return only bigrams)
    
    Returns: list of tuples (N-gram, count)
    """
    # Filter the data based on the given label
    filtered_data = data[labels == filter_label]
    
    # Create the CountVectorizer with the specified ngram_size
    vectorizer = CountVectorizer(ngram_range=(ngram_size, ngram_size))
    X = vectorizer.fit_transform(filtered_data)
    
    # Sum up the counts of each n-gram
    ngram_counts = X.sum(axis=0).A1
    ngram_names = vectorizer.get_feature_names_out()
    
    # Create a list of tuples (N-gram, count) and sort it by count in descending order
    ngram_freq = list(zip(ngram_names, ngram_counts))
    ngram_freq = sorted(ngram_freq, key=lambda x: x[1], reverse=True)
    
    return ngram_freq[:top_n]

In [127]:
top_10_unigrams_real = top_ngrams_for_category(X_train_pre, y_train, 0, top_n=10, ngram_size=1)
assert top_10_unigrams_real == [('nan', 3199),
                                ('work', 2689),
                                ('experi', 1966),
                                ('manag', 1854),
                                ('team', 1830),
                                ('servic', 1773),
                                ('develop', 1706),
                                ('custom', 1624),
                                ('compani', 1430),
                                ('time', 1428)]
top_6_unigrams_fake = top_ngrams_for_category(X_train_pre, y_train, 1, top_n=6, ngram_size=1)
assert top_6_unigrams_fake == [('nan', 3318),
                               ('work', 1828),
                               ('manag', 1303),
                               ('experi', 1287),
                               ('time', 1221),
                               ('servic', 1187)]
top_5_bigrams_real = top_ngrams_for_category(X_train_pre, y_train, 0, top_n=5, ngram_size=2)
assert top_5_bigrams_real == [('nan nan', 1349),
                              ('full time', 745),
                              ('custom servic', 398),
                              ('bachelor degre', 308),
                              ('year experi', 211)]
top_10_trigrams_fake = top_ngrams_for_category(X_train_pre, y_train, 1, top_n=10, ngram_size=3)
assert top_10_trigrams_fake == [('nan nan nan', 851),
                                ('nan full time', 165),
                                ('time nan nan', 138),
                                ('high school equival', 134),
                                ('full time nan', 131),
                                ('oil gas industri', 123),
                                ('time entri level', 116),
                                ('full time entri', 104),
                                ('level high school', 92),
                                ('mid senior level', 89)]

Looking at the top ngrams for each category, it doesn't seem like a BoW model will be very interesting, but let's try anyway.

### Exercise 3.2 - Modeling pipeline on BoW
Let's streamline our pipeline in a nice function. The function should set up the pipeline, fit the pipeline with the train data and predict on the dev data. It should also calculate the accuracy of the prediction and print the classification report.

The pipeline should have two steps, vectorization with sklearn `CountVectorizer` and classification with the `LogisticRegression`. Name the pipeline steps `vect` and `clf`. The `CountVectorizer` should take the given `ngram_range` and `max_features` parameter values.

The function should return the fitted pipeline, the prediction, and the accuracy of the prediction.

In [128]:
def train_and_validate(X_train, X_dev, y_train, y_dev, ngram_range=(1,1), max_features=None):
    """
    Train a model using sklearn's pipeline and return it along with the predictions and the
    accuracy in the validation set. Print the classification report as well.
    
    Args:
    X_train - preprocessed training data
    X_dev - preprocessed dev data
    y_train - labels of training data
    y_dev - labels of dev data
    ngram_range - ngram range to use in CountVectorizer (tuple)
    max_features - max number of features to use in CountVectorizer (int)
    
    Returns:
    text_clf - fitted pipeline
    y_dev_pred - prediction on the dev data (np.array)
    acc - accuracy of the prediction (float)
    """
    
    text_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=ngram_range, max_features=max_features)),
        ('clf', LogisticRegression(max_iter=1000))
    ])
    
    text_clf.fit(X_train, y_train)
    y_dev_pred = text_clf.predict(X_dev)
    acc = accuracy_score(y_dev, y_dev_pred)
    
    print(classification_report(y_dev, y_dev_pred))
    
    return text_clf, y_dev_pred, acc

In [129]:
clf, y_dev_pred, acc = train_and_validate(X_train_pre, X_dev_pre, y_train, y_dev)
assert isinstance(clf['vect'], CountVectorizer), 'The pipeline steps are not correct.'
assert isinstance(clf['clf'], LogisticRegression), 'The pipeline steps are not correct.'
assert hashlib.sha256(json.dumps(''.join([str(i) for i in y_dev_pred])).encode()).hexdigest() == \
'36720a1e3d54d5a2d740cfe8cff6370341459fb18ecfbbddc04e5978ea0e7a7e', 'The prediction is not correct.'
np.testing.assert_almost_equal(acc, 0.906, decimal=3, err_msg="The accuracy is not correct.")

              precision    recall  f1-score   support

           0       0.95      0.88      0.91       147
           1       0.86      0.94      0.90       121

    accuracy                           0.91       268
   macro avg       0.91      0.91      0.91       268
weighted avg       0.91      0.91      0.91       268



We should look at some misclassified examples:

In [130]:
for text, pred, true in zip(X_dev_pre[:70], y_dev_pred[:70], y_dev[:70]):
    if pred != true:
        print(f"Job posting: {text}")
        print(f"Predicted: {pred}, Actual: {true}\n")

Job posting: experienc line cook us sandwich nan belfri inn amp bistro award win b amp b restaur locat sandwich villag cape cod look profession line cook prepar food exact chef specif set station menu success candid play key role contribut custom satisfact acquisit goal look creativ profici cook aspect food prepar posit entitl competit wage plus benefit commensur experi pleas send resum prior menus creat cook avail respons includ set stock station necessari suppliesprepar food servic e g chop veget butcher meat prepar sauc cook menu item cooper rest kitchen staffansw report follow execut sous chef instructionsclean station take care leftov foodstock inventori appropriatelyensur food come simultan high qualiti time fashionmaintain posit profession approach cowork custom 3 year cook experienceexcel understand various cook method ingredi equip proceduresaccuraci speed execut assign tasksfamiliar industri best practic nan full time nan nan restaur custom servic nan
Predicted: 1, Actual: 0


So just with the simplest BoW model we already get an accuracy of 0.9! But let's see if we can do even better... In the misclassified examples, the last one even contains an application url but classified as 'fake'. That's suspicious...

### Exercise 3.3 - Tune hyperparameters
Run the function from the previous exercise for different N-gram ranges and/or with different values for max_features. Try to achieve an accuracy higher or equal to 0.9.

In [131]:
# clf_tuned, y_dev_pred_tuned, acc_tuned = train_and_validate(...)
# YOUR CODE HERE

ngram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features_list = [None, 5000, 10000]

best_acc = 0
best_clf = None
best_y_dev_pred = None

for ngram_range in ngram_ranges:
    for max_features in max_features_list:
        clf, y_dev_pred, acc = train_and_validate(X_train_pre, X_dev_pre, y_train, y_dev, ngram_range=ngram_range, max_features=max_features)
        if acc > best_acc:
            best_acc = acc
            best_clf = clf
            best_y_dev_pred = y_dev_pred

clf_tuned = best_clf
y_dev_pred_tuned = best_y_dev_pred
acc_tuned = best_acc

print(f"Best accuracy: {acc_tuned}")

              precision    recall  f1-score   support

           0       0.95      0.88      0.91       147
           1       0.86      0.94      0.90       121

    accuracy                           0.91       268
   macro avg       0.91      0.91      0.91       268
weighted avg       0.91      0.91      0.91       268

              precision    recall  f1-score   support

           0       0.95      0.87      0.91       147
           1       0.86      0.94      0.90       121

    accuracy                           0.90       268
   macro avg       0.90      0.91      0.90       268
weighted avg       0.91      0.90      0.90       268

              precision    recall  f1-score   support

           0       0.95      0.87      0.91       147
           1       0.86      0.94      0.90       121

    accuracy                           0.90       268
   macro avg       0.90      0.91      0.90       268
weighted avg       0.91      0.90      0.90       268

              preci

In [132]:
assert(acc_tuned >= 0.90)
print(acc_tuned)

0.9216417910447762


Now evaluate your model on the test set!

In [133]:
y_test_pred = clf.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90        63
           1       0.86      0.92      0.89        52

    accuracy                           0.90       115
   macro avg       0.89      0.90      0.90       115
weighted avg       0.90      0.90      0.90       115



Great! We were able to improve the model a little on the dev set and the performance on the test set is pretty good!

## Exercise 4 - TF-IDF

We will now work with the TF-IDF vectorization of the preprocessed job postings data.

### Exercise 4.1 - Manual TF-IDF

First, implement a function that manually calculates the TF-IDF vectorization from a dataframe with a Bag of Words vectorization. Here is a reminder of the TF-IDF formula:

$$ tf\text{-}idf(t,d) = tf(t,d) \times idf(t) = \frac{n_{td}}{n_d} \times (log{(\frac{n+1}{df_t+1})} + 1 )$$

where $t$ is the term (word) for which we are calculating the weight, $d$ is the document containing the term, $n_{td}$ is the word count of the term $t$ in document $d$, $n_d$ is the number of words in document $d$, $n$ is the number of documents, and $df_t$ is the number of documents where the term appears.

In [134]:
# We'll start you off with the BoW representation, in pandas dataframe format
vec = CountVectorizer()
bow_train = vec.fit_transform(X_train_pre)
bow_train_df = pd.DataFrame(bow_train.todense())
vocab = vec.vocabulary_

In [135]:
def calculate_tfidf(bow_df):
    """
    Calculates the tfidf vectorization from a BoW vectorization.

    Args:
    bow_df - dataframe with document word counts (Bag of Words)

    Returns:
    tfidf - dataframe with the calculated tfidf
    """
    n_docs = bow_df.shape[0]
    df = (bow_df > 0).sum(axis=0)
    idf = np.log((n_docs + 1) / (df + 1)) + 1
    tf = bow_df.div(bow_df.sum(axis=1), axis=0)
    tf_idf = tf * idf
    return tf_idf

In [136]:
tfidf_df = calculate_tfidf(bow_train_df)
assert tfidf_df.shape==bow_train_df.shape, 'The shape of the tfidf dataframe is not correct.'
assert ((tfidf_df>0).sum()==(bow_train_df>0).sum()).sum()==bow_train_df.shape[1], ''
assert hashlib.sha256(json.dumps(''.join([str(round(i,3)) for i in tfidf_df[(tfidf_df>0)].sum().to_numpy()])).encode()).hexdigest() \
== 'cdeee674f9f9449ffb7a1cec047e75a0318af7836d0b94fe3948ce81e1ae4559', 'Not correct, try again.'

### Exercise 4.2 - Top words in the TF-IDF vectorization

Implement a function which returns the top N most important words for the given class in the TF-IDF vectorization, i.e. the words with the highest sum of weights. The function takes the dataframe with the TF-IDF weights calculated in the previous exercise and a vocabulary of words and feature indices from the CountVectorizer and returns a list of the top N most important words.

In [137]:
def top_words_for_category_tfidf(data, labels, filter_label, vocab, top_n=10):
    """
    Finds the top_n words in the TF-IDF for the given class label.
    
    Args:
    data: pd.Series with text data
    labels: class labels for the data
    filter_label: the label to filter the data on before getting ngrams
    vocab: vocabulary of words and feature indices
    top_n: top n ngrams to return
    
    Returns: list of top-n words
    """
    # Filter the data based on the given label
    filtered_data = data[labels == filter_label]
    
    # Sum the TF-IDF weights for each word
    word_sums = filtered_data.sum(axis=0)
    
    # Get the top N words with the highest sum of weights
    top_indices = word_sums.argsort()[-top_n:][::-1]
    top_words = [list(vocab.keys())[list(vocab.values()).index(i)] for i in top_indices]
    
    return top_words

In [138]:
top_15_fake = top_words_for_category_tfidf(tfidf_df, y_train, 1, vocab, top_n=15)
assert top_15_fake == ['nan',
                       'work',
                       'time',
                       'servic',
                       'entri',
                       'custom',
                       'posit',
                       'skill',
                       'amp',
                       'experi',
                       'home',
                       'manag',
                       'requir',
                       'administr',
                       'data']
top_20_real = top_words_for_category_tfidf(tfidf_df, y_train, 0, vocab, top_n=20)
assert top_20_real == ['nan',
                       'work',
                       'develop',
                       'manag',
                       'experi',
                       'custom',
                       'sale',
                       'team',
                       'servic',
                       'product',
                       'job',
                       'market',
                       'client',
                       'busi',
                       'design',
                       'compani',
                       'time',
                       'technolog',
                       'us',
                       'engin']

### Exercise 4.3 - Modeling pipeline on TF-IDF
Let's include the TF-IDF features into the pipeline. The function should set up the pipeline, fit the pipeline with the train data and predict on the dev data. It should also calculate the accuracy of the prediction and print the classification report.

The pipeline should have three steps: vectorization with sklearn `CountVectorizer`, transformation with sklearn `TfidfTransformer`, and classification with the `LogisticRegression`. Name the pipeline steps `vect`, `tfidf`, and `clf`. The `CountVectorizer` should take the given `ngram_range`, `max_features`, `max_df`, and `min_df` parameter values.

The function should return the fitted pipeline, the prediction, and the accuracy of the prediction.

In [140]:
def train_and_validate_with_tfidf(X_train, X_dev, y_train, y_dev, ngram_range=(1,1),
                                  max_features=None, max_df=1.0, min_df=1):
    """
    Train a model using sklearn's pipeline and return it along with the predictions and the
    accuracy in the validation set. Print the classification report as well.
    
    Args:
    X_train - preprocessed training data
    X_dev - preprocessed dev data
    y_train - labels of training data
    y_dev - labels of dev data
    ngram_range - ngram range to use in CountVectorizer (tuple)
    max_features - max number of features to use in CountVectorizer (int)
    max_df - minimum threshold for document frequency to use in CountVectorizer (int or float)
    min_df - maximum threshold for document frequency to use in CountVectorizer (int or float)
    
    Returns:
    text_clf - fitted pipeline
    y_dev_pred - prediction on the dev data (np.array)
    acc - accuracy of the prediction (float)
    """
    
    text_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=ngram_range, max_features=max_features, max_df=max_df, min_df=min_df)),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(max_iter=1000))
    ])
    
    text_clf.fit(X_train, y_train)
    y_dev_pred = text_clf.predict(X_dev)
    acc = accuracy_score(y_dev, y_dev_pred)
    
    print(classification_report(y_dev, y_dev_pred))
    
    return text_clf, y_dev_pred, acc

In [141]:
clf_tfidf, y_dev_pred_tfidf, acc_tfidf = train_and_validate_with_tfidf(X_train_pre, X_dev_pre, y_train, y_dev)
assert isinstance(clf_tfidf['vect'], CountVectorizer), 'The pipeline steps are not correct.'
assert isinstance(clf_tfidf['tfidf'], TfidfTransformer), 'The pipeline steps are not correct.'
assert isinstance(clf_tfidf['clf'], LogisticRegression), 'The pipeline steps are not correct.'
assert hashlib.sha256(json.dumps(''.join([str(i) for i in y_dev_pred_tfidf])).encode()).hexdigest() == \
'475ecac8a2c6e2dea9aa06f0105f19aa20ead066339a6398af8d797bfeafbd0a', 'The prediction is not correct.'
np.testing.assert_almost_equal(acc_tfidf, 0.906, decimal=3, err_msg="The accuracy is not correct.")

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       147
           1       0.92      0.87      0.89       121

    accuracy                           0.91       268
   macro avg       0.91      0.90      0.91       268
weighted avg       0.91      0.91      0.91       268



As before, we should also look at some misclassified examples:

In [142]:
for text, pred, true in zip(X_dev_pre[:70], y_dev_pred_tfidf[:70], y_dev[:70]):
    if pred != true:
        print(f"Job posting: {text}")
        print(f"Predicted: {pred}, Actual: {true}\n")

Job posting: data entri repres us tx austin 35000 40000 global connect world forc busi rethink communic mobil lifestyl flood technolog econom pressur alway ” mental creat opportun challeng organ size mitel ® nasdaq mitl tsx mnw global leader busi communic easili connect employe partner custom anywher anytim devic smallest busi largest enterpris mitel offer custom maximum choic one industri broadest portfolio best path cloud us 1 billion combin annual revenu 60 million custom worldwid 1 market share western europ mitel clear market leader busi communic respons review detail ticket ensur contain data requir custom monitor bin movement ensur activ seen match paperwork providedwatch contract level vs volum remain bin report discrep qualiti managerprovid ticket copi custom neededansw question custom may specif ticketsadvis plant manag miss paperworkcollect paperwork north south plant review bin inform includ sand pull correct bin bin movement line activ espons review detail ticket ensur con

Unfortunately, we're still not able to correctly classify those job postings. Let's keep going and see if we can do even better.

### Exercise 4.4 - Tune hyperparameters again

Use the `train_and_validate_with_tfidf` function you created before to train with different hyperparameters and get an accuracy score above 92% on the validation dataset. (This threshold is the same as what we got for the plain CountVectorizer pipeline).

In [143]:
# clf_tfidf_tuned, y_dev_pred_tfidf_tuned, acc_tfidf_tuned = train_and_validate_with_tfidf(...)
# YOUR CODE HERE
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features_list = [None, 5000, 10000]
max_df_list = [0.75, 1.0]
min_df_list = [1, 2]

best_acc_tfidf = 0
best_clf_tfidf = None
best_y_dev_pred_tfidf = None

for ngram_range in ngram_ranges:
	for max_features in max_features_list:
		for max_df in max_df_list:
			for min_df in min_df_list:
				clf_tfidf, y_dev_pred_tfidf, acc_tfidf = train_and_validate_with_tfidf(
					X_train_pre, X_dev_pre, y_train, y_dev,
					ngram_range=ngram_range, max_features=max_features,
					max_df=max_df, min_df=min_df
				)
				if acc_tfidf > best_acc_tfidf:
					best_acc_tfidf = acc_tfidf
					best_clf_tfidf = clf_tfidf
					best_y_dev_pred_tfidf = y_dev_pred_tfidf

clf_tfidf_tuned = best_clf_tfidf
y_dev_pred_tfidf_tuned = best_y_dev_pred_tfidf
acc_tfidf_tuned = best_acc_tfidf

print(f"Best accuracy with TF-IDF: {acc_tfidf_tuned}")

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       147
           1       0.94      0.86      0.90       121

    accuracy                           0.91       268
   macro avg       0.91      0.91      0.91       268
weighted avg       0.91      0.91      0.91       268

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       147
           1       0.94      0.85      0.89       121

    accuracy                           0.91       268
   macro avg       0.91      0.90      0.90       268
weighted avg       0.91      0.91      0.91       268

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       147
           1       0.92      0.87      0.89       121

    accuracy                           0.91       268
   macro avg       0.91      0.90      0.91       268
weighted avg       0.91      0.91      0.91       268

              preci

In [144]:
assert(acc_tfidf_tuned > 0.92)
print(acc_tfidf_tuned)

0.9365671641791045


Now evaluate your model on the test set!

In [145]:
y_test_pred = clf_tfidf_tuned.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.71      0.98      0.83        63
           1       0.96      0.52      0.68        52

    accuracy                           0.77       115
   macro avg       0.84      0.75      0.75       115
weighted avg       0.83      0.77      0.76       115



Did the performace on the test set improve against the pure BoW pipeline? If not, the pipeline might require more tuning.

Congratulations to mastering the first NLP unit! This was no easy task, be proud of yourself!