# TF-IDF (Term Frequency - Inverse Document Frequency)

TF = (number of times the word occurs in the sentence) / (total number of words in the sentence)
IDF = log_e(total number of sentences / number of sentences containing the word)

final_matrix = TF * IDF

- Advantages:
  - intuitive and simple to implement;
  - inputs have a fixed size, determined by the vocabulary size;
  - word importance is captured (words that are rare across sentences get a higher weight), so some contextual information is present;


- Disadvantages:
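  - sparsity still exists (most entries of the matrix are zero);
  - the out-of-vocabulary (OOV) problem still exists: words in the validation dataset that were not seen during fitting are ignored;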
  

In [17]:
import os
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared NLP objects, created once at module level.
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
tfidf = TfidfVectorizer(
    ngram_range=(2, 2),  # lower and upper n-gram bound are both 2
    max_features=100,
)
lem = WordNetLemmatizer()

In [18]:
# Load the spam dataset and keep only its first two columns.
data_dir = 'datasets'
dataset_name = 'spam.csv'
full_path = os.path.join(data_dir, dataset_name)

# Read the CSV as ISO-8859-1, slice off everything after the second column,
# and rename the raw 'v1'/'v2' headers in a single chained expression.
data = (
    pd.read_csv(full_path, sep=',', encoding='ISO-8859-1')
    .iloc[:, :2]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [19]:
def get_wordnet_pos(tag):
    """Map a POS tag to the matching ``wordnet`` part-of-speech constant.

    The tag is matched by its leading letter: 'J' -> ADJ, 'V' -> VERB,
    'N' -> NOUN, 'R' -> ADV. Any other tag yields ``None``.
    """
    # (tag prefix, attribute name on ``wordnet``), checked in this order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            # Look the constant up only for the prefix that matched.
            return getattr(wordnet, attr)
    return None

In [20]:
# Build the cleaned corpus: one lemmatized, stopword-free string per message.
non_alpha = re.compile(r'[^a-zA-Z]')  # compiled once, reused for every message
corpus = []

# Iterate over the message values themselves instead of looking them up by
# integer label, so the loop does not rely on `data` having a 0..n-1 index.
for message in data['message']:
    # Replace non-alphabetic characters with spaces, lower-case, and tokenize
    review = non_alpha.sub(' ', message).lower().split()

    # Perform POS tagging on the full token list (before stopword removal)
    pos_tags = nltk.pos_tag(review)

    # Lemmatize every non-stopword using its POS tag; tags that
    # get_wordnet_pos cannot map fall back to NOUN
    lemmatized_review = [
        lem.lemmatize(word, pos=get_wordnet_pos(tag) or wordnet.NOUN)
        for word, tag in pos_tags
        if word not in stop_words
    ]

    # Join processed words back into a sentence
    corpus.append(' '.join(lemmatized_review))


In [21]:
# Fit the vectorizer on the corpus and transform it in one pass, then convert
# the result to a dense array.
tfidf_matrix = tfidf.fit_transform(corpus)
X = tfidf_matrix.toarray()

In [23]:
# Configure NumPy printing so more of each row is shown: 30 items at each
# edge, very wide lines, and floats rendered with three significant digits.
def _format_float(value):
    """Render a float with three significant digits."""
    return '%.3g' % value


np.set_printoptions(
    edgeitems=30,
    linewidth=1000,
    formatter={'float': _format_float},
)

In [24]:
# Display the TF-IDF feature array (formatted by the print options set earlier)
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0