***
This notebook contains the exercise solutions for the Text Classification Section of the Natural Language Processing Course. 
<br>
<br>
If you have any questions, refer to the lecture **'Tutorial - How to complete the exercises'** in section 2 of the course.
<br>
<br>
**NOTE: Depending on your Python version and library versions, your code may be correct and still fail the asserts in the Validation cells. If your code matches the one in the solutions, don't worry and consider your exercise correct.**
***


# Exercise 1 - Obtaining and Preparing Data

In [11]:
# Load the spam.csv file with pandas into an object
# named spam_file
import pandas as pd
spam_file = pd.read_csv('./data/spam.csv')

# Clean the text column of the spam_file data frame:
# lower case every message, then strip all punctuation
# characters from it
import string
spam_file['text'] = spam_file['text'].str.lower().apply(
    lambda message: message.translate(
        str.maketrans('', '', string.punctuation)
    )
)

# Encode the label column as 1 and 0: the value is 1
# when label = 'spam' and 0 otherwise
import numpy as np
spam_file['label'] = np.where(spam_file['label'] == 'spam', 1, 0)

# Split the data into train and test, using 80% of the
# data to train the algorithm (so 20% is held out)

# The text column is X (features) and the label column
# is y (target)

# The resulting objects are named X_train, X_test,
# y_train, y_test respectively

# Random State = 20 keeps the split reproducible

# Hint: use sklearn train_test_split!
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    spam_file['text'],
    spam_file['label'],
    test_size=0.2,
    random_state=20,
)

# Use tf-idf transformation on the X_train to obtain features
# for our algorithm

# Use min_df = 0.02 as argument for the tf-idf

# Rewrite the X_train object into a dense (2D) format

# Name the object tfv

from sklearn.feature_extraction.text import (
    TfidfVectorizer
)

tfv = TfidfVectorizer(min_df = 0.02)
# toarray() yields a plain 2D numpy.ndarray. todense() would yield a
# numpy.matrix, which recent scikit-learn versions refuse to accept in
# fit/predict.
X_train = tfv.fit_transform(X_train).toarray()

# Validation - Exercise 1

In [12]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import (
    TfidfVectorizer
)
import string

# Fail early, with a helpful hint, when one of the objects the
# exercise asks for has not been created in the notebook namespace.
for required_name in ('spam_file', 'X_train', 'X_test', 'y_train', 'y_test'):
    if required_name not in globals():
        raise NameError(
            'Did you create the object {}?'.format(required_name)
        )

# Rebuild the reference pipeline from the raw file: lower case,
# strip punctuation, encode the label as 1/0.
assert_1 = pd.read_csv('./data/spam.csv')
assert_1['text'] = assert_1.text.str.lower()
assert_1['text'] = assert_1['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

assert_1['label'] = np.where(
    assert_1['label']=='spam',1,0
)

# Same split parameters as the exercise (20% test, random_state=20).
X_train_val, X_test_val, y_train_val, y_test_val = (
    train_test_split(assert_1.text, assert_1.label, test_size=0.2, random_state=20)
)

cv_val = TfidfVectorizer(min_df = 0.02)
# toarray() gives a plain numpy.ndarray; todense() would give a
# numpy.matrix, which recent scikit-learn versions reject when this
# reference matrix is used to fit a model later on.
X_train_val = cv_val.fit_transform(X_train_val).toarray()

assert assert_1.equals(spam_file), 'spam_file does not match the expected data frame'
assert pd.DataFrame(X_train).equals(pd.DataFrame(X_train_val)), (
    'X_train does not match the expected tf-idf features'
)
assert X_test.equals(X_test_val), 'X_test does not match the expected split'
# The targets are part of the exercise too, so compare them as well.
assert y_train.equals(y_train_val), 'y_train does not match the expected split'
assert y_test.equals(y_test_val), 'y_test does not match the expected split'

print('Your code is correct!')

Your code is correct!


# Exercise 2 - Training Model

In [13]:
# Train a logistic Regression using X_train to predict
# y_train

# Use random state 1234

import numpy as np
from sklearn.linear_model import LogisticRegression

lm = LogisticRegression(random_state=1234)
# np.asarray makes sure the model is fitted on a plain ndarray even if
# X_train was produced with .todense() (a numpy.matrix), which recent
# scikit-learn versions refuse to accept.
lm.fit(np.asarray(X_train), y_train)

# Using the trained logistic regression, predict
# the class of a sentence being spam on the
# test set 

# Remember to use transform to obtain the stored 
# tf-idf we've set up above!

# Save the predictions in a y_pred object
# Your predictions should have format 1/0 and not probability

# transform (not fit_transform) reuses the vocabulary learned on the
# training set; toarray() returns an ndarray rather than a numpy.matrix.
y_pred = lm.predict(tfv.transform(X_test).toarray())

# Compare the labels you have obtained from your model with
# the real labels and compute the accuracy

# Hint: You can compute accuracy using accuracy_score from sklearn
# or manually!

# Store the accuracy in a acc named object

acc = sum(y_pred==y_test)/len(y_pred)

# Validation - Exercise 2

In [14]:
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

# Fail early, with a helpful hint, when one of the objects the
# exercise asks for has not been created in the notebook namespace.
for required_name in ('y_pred', 'acc'):
    if required_name not in globals():
        raise NameError(
            'Did you create the object {}?'.format(required_name)
        )

# Reference model, trained once on the reference features from the
# Exercise 1 validation cell. np.asarray guards against X_train_val
# being a numpy.matrix, which recent scikit-learn versions reject.
lm_val = LogisticRegression(random_state=1234)
lm_val.fit(np.asarray(X_train_val), y_train_val)

# Use the reference vectorizer (cv_val) and the reference test split so
# this check does not depend on the objects it is validating.
y_pred_val = lm_val.predict(cv_val.transform(X_test_val).toarray())

acc_val = sum(y_pred_val==y_test_val)/len(y_pred_val)

assert pd.DataFrame(y_pred).equals(pd.DataFrame(y_pred_val)), (
    'y_pred does not match the expected predictions'
)
# Compare against the accuracy of the reference model rather than a
# hard-coded figure, which may shift with the installed library versions.
assert np.isclose(acc, acc_val), 'acc does not match the expected accuracy'

print('Your code is correct!')

Your code is correct!


# Exercise 3 - Predicting New Sentences

In [15]:
# Store a sentence named 'FREE entry, CLICK HERE to receive yr free entry wr waiting fr you and we ned u in our free amzing website were yll get free stuff' 
# in an object named sent_1

sent_1 = 'FREE entry, CLICK HERE to receive yr free entry wr waiting fr you and we ned u in our free amzing website were yll get free stuff'

# Pass the sentence 1 through the pipeline we've created
# Don't forget to lower case the text and remove punctuation 
# to match the pipeline we've used above!

# Always rewrite the sent_1 object
sent_1 = sent_1.lower()
sent_1 =  sent_1.translate(
        str.maketrans('', '', string.punctuation)
)

# Transform sent_1 into tf-idf format using
# our trained tfidf

# name the object sent_1_tf
# transform expects an iterable of documents, hence the one-item list
sent_1_tf = tfv.transform([sent_1])

# Predict the probability of our sentence
# being spam using the logistic regression
# we've trained

# name it probability_spam
# Use lm, the model trained in Exercise 2. lm_val belongs to the
# validation cells and only exists after they have been run.
probability_spam = lm.predict_proba(sent_1_tf)

# Validation - Exercise 3

In [16]:
# Fail early, with a helpful hint, when one of the objects the
# exercise asks for has not been created.
try:
    sent_1
except NameError:
    raise NameError('Did you create the object sent_1?')
    
try:
    probability_spam
except NameError:
    raise NameError('Did you create the object probability_spam?')

# Rebuild the expected cleaned sentence: lower case, punctuation removed.
sent_1_val = (
    'FREE entry, CLICK HERE to receive yr free entry wr waiting fr you and we ned u in our free amzing website were yll get free stuff'
).lower().translate(str.maketrans('', '', string.punctuation))

sent_1_tf_val = tfv.transform([sent_1_val])

# Score the reference features (sent_1_tf_val), not the sent_1_tf object
# under test, so the comparison below is an independent check.
probability_spam_val = lm_val.predict_proba(sent_1_tf_val)

assert sent_1_val == sent_1, 'sent_1 does not match the expected cleaned sentence'
assert np.allclose(probability_spam, probability_spam_val), (
    'probability_spam does not match the expected probabilities'
)

print('Your code is correct!')

Your code is correct!
