# TF IDF embedding

This notebook does the preprocessing and cleaning of text files through the Cleaner class

The TfIdfEmbedder class calculates the tfidf embeddings for a given matrix

The commented parts explain extra functionality


In [1]:
import numpy as np
import pandas as pd
import csv
# Re-create the `np.int` / `np.float` / `np.bool` attributes on the numpy module,
# pointing them at fixed-width / numpy scalar types.
# NOTE(review): presumably a compatibility shim for a dependency that still
# references these aliases on a NumPy version that no longer provides them --
# confirm which dependency needs it and drop the shim if none does.
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler

# extra
# import time
import torch
# import matplotlib.pyplot as plt

#to save and load objects
import pickle





In [2]:
# Pick the torch device once (CUDA when available, otherwise CPU) in case
# PyTorch is used later in the notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# NLTK supplies the English stop-word list; Stanza supplies the
# tokenizer/lemmatizer models used by the Cleaner class.
import nltk
from nltk.corpus import stopwords

# !pip install stanza
import stanza

# Fetch the required resources (quietly) before they are first used.
nltk.download("stopwords", quiet=True)
stanza.download('en', verbose=False)

stop_words_english = stopwords.words('english')

In [4]:
# Read the training and validation splits. Quoting is disabled so quote
# characters inside the text column are kept as literal characters.
train_df, val_df = (
    pd.read_csv(csv_path, quoting=csv.QUOTE_NONE)
    for csv_path in ("train_2024.csv", "dev_2024.csv")
)

# Raw texts and their labels for each split
Text_train, y_train = train_df["text"].values, train_df["label"].values
Text_val, y_val = val_df["text"].values, val_df["label"].values



In [5]:
"""
Cleaner class

Preprocesses the training data and stores the result in the `cleaned` attribute. Tokenizes and lemmatizes.

Works also on the query data

The cleaned training matrix can be saved and loaded

"""

class Cleaner():
    """Tokenize and lemmatize raw text with a Stanza pipeline.

    Each input text becomes one string of space-joined lemmas. Training texts
    accumulate in the ``cleaned`` list via :meth:`clean`; query texts are
    processed the same way by :meth:`query_clean` but returned instead of
    stored. The accumulated list can be written to / restored from disk.
    """

    def __init__(self,use_gpu=False):
        """Build the Stanza pipeline and start with an empty ``cleaned`` list.

        Parameters
        ----------
        use_gpu : bool, default False
            Forwarded unchanged to ``stanza.Pipeline``.
        """
        self.parser = stanza.Pipeline(lang='en', processors='tokenize, lemma',  verbose=False, use_gpu=use_gpu)
        self.cleaned = []

    def _lemmatize(self, text):
        """Return the lemmas of ``text`` joined by single spaces."""
        lemmas = []
        for token in self.parser(text).iter_tokens():
            for word in token.words:
                # Defensive: if a word has no lemma, fall back to its surface
                # text so ' '.join never receives None.
                lemmas.append(word.lemma if word.lemma is not None else word.text)
        return ' '.join(lemmas)

    #save and load the cleaned matrix
    def save_cleaned_matrix(self, name):
        """Write the accumulated cleaned texts to ``name`` in NumPy ``.npy`` format."""
        with open(name, 'wb') as f:
            np.save(f, np.array(self.cleaned))

    def load_cleaned_matrix(self, name):
        """Replace ``cleaned`` with the texts stored in the ``.npy`` file ``name``.

        The loaded array is converted back to a plain list so that subsequent
        :meth:`clean` calls can keep appending to it.
        """
        with open(name, 'rb') as f:
            self.cleaned = np.load(f).tolist()

    def clean(self,text):
        """Lemmatize one training text and append the result to ``cleaned``."""
        self.cleaned.append(self._lemmatize(text))

    def query_clean(self,texts):
        """Lemmatize every text in ``texts`` and return the results as a new list.

        ``cleaned`` is not modified.
        """
        return [self._lemmatize(text) for text in texts]
        

In [6]:
"""
Tf Idf class

Vectorizes the given cleaned training matrix and stores the tfidf matrix

The tfidf object does the transformation of the cleaned query vector

"""
class TfIdfEmbedder():
    """Thin wrapper around ``TfidfVectorizer`` that tracks whether it was fitted.

    ``build_vectorizer`` fits on the cleaned training texts and keeps the
    resulting matrix in ``tfidf_matrix``; ``vectorize_query`` transforms new
    cleaned texts with the fitted vocabulary.
    """

    def __init__(self,max_features=None,stop_words=None ):
        """Create an unfitted vectorizer.

        Parameters
        ----------
        max_features : int or None
            Passed to ``TfidfVectorizer(max_features=...)``.
        stop_words : list or None
            Passed to ``TfidfVectorizer(stop_words=...)``.
        """
        self.max_features = max_features
        self.stopwords = stop_words
        self.tfidf = TfidfVectorizer(stop_words=self.stopwords,max_features=self.max_features)
        self.builded = False
        self.tfidf_matrix = None

    def build_vectorizer(self, cleaned_text):
        """Fit the vectorizer on ``cleaned_text`` and return the resulting matrix.

        The matrix is also stored in ``tfidf_matrix``.
        """
        self.tfidf_matrix = self.tfidf.fit_transform(cleaned_text)
        # Only mark as built once fitting succeeded, so a failed fit does not
        # leave the object claiming to be usable.
        self.builded = True
        return self.tfidf_matrix

    def vectorize_query(self,cleaned_texts):
        """Transform ``cleaned_texts`` with the fitted vectorizer.

        Raises
        ------
        RuntimeError
            If ``build_vectorizer`` has not completed successfully yet.
        """
        if not self.builded:
            raise RuntimeError("Vectorizer is not builded.")
        return self.tfidf.transform(cleaned_texts)
            


In [7]:
# Lemmatize the training texts (progress is printed every 1000 documents),
# then fit TF-IDF on them and embed the validation texts with the same vocabulary.
Cl = Cleaner(use_gpu=True)
for doc_index, raw_text in enumerate(Text_train):
    if not doc_index % 1000:
        print(doc_index)
    Cl.clean(raw_text)
print("done")

TfIdf = TfIdfEmbedder(10000, stop_words_english)

# Training matrix comes from fitting; validation matrix only from transforming.
tf_idf_matrix = TfIdf.build_vectorizer(Cl.cleaned)

cleaned_query = Cl.query_clean(Text_val)
q_matrix = TfIdf.vectorize_query(cleaned_query)


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
done


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm


# NOTE: despite its name, `lr` is a support-vector classifier; the name is kept
# because the prediction cell further down calls `lr.predict`.
lr = svm.SVC(max_iter=10000)

# Fit on the training TF-IDF matrix, then score on the validation split.
lr.fit(tf_idf_matrix, y_train)
ypred = lr.predict(q_matrix)

# f1_score's signature is (y_true, y_pred). The original call had the two
# swapped; with the default binary averaging the value is the same either way,
# but the correct order stays right if the averaging mode is ever changed.
print(f1_score(y_val, ypred))





0.8042813455657493


Demonstration of saving the cleaned matrix of text to disk

In [9]:
# Cl.save_cleaned_matrix("data/cleaned_train.npy")
# with open("data/cleaned_val.npy", 'wb') as f:
#     np.save(f, np.array(cleaned_query))




Demonstration of loading the cleaned matrix of text from disk

In [10]:
# from sklearn.ensemble import RandomForestClassifier
# #create new classes
# TfIdf2 = TfIdfEmbedder(10000, stop_words_english)
# Cl2 = Cleaner()


# #load
# Cl2.load_cleaned_matrix("data/cleaned_train.npy")
# with open("data/cleaned_val.npy", 'rb') as f:
#     q_cleaned2 = np.load(f)

# #get tfidf matrixes from cleaneddata
# tf_idf_matrix2 = TfIdf2.build_vectorizer(Cl2.cleaned)
# q_matrix2 = TfIdf2.vectorize_query(q_cleaned2)

# #train new classifier
# lr2 =  RandomForestClassifier()
# lr2.fit(tf_idf_matrix2, y_train)

# #get new score
# ypred2 = lr2.predict(q_matrix2)
# print(f1_score(ypred2, y_val))




Dimensionality-reduction analysis with TruncatedSVD + UMAP (didn't produce good results)

In [12]:
# import umap
# from sklearn.decomposition import TruncatedSVD
# n_topics = 50

# # making latent topics
# tf_idf_svd=TruncatedSVD(n_components=n_topics)
# tf_idf_matrix_dense = tf_idf_svd.fit_transform(tf_idf_matrix2)
# print(tf_idf_matrix_dense.shape)
# # 2d transformation for visualization
# tf_idf_umap = umap.UMAP(n_neighbors=10, n_components=2)
# tf_idf_matrix_umap = tf_idf_umap.fit_transform(tf_idf_matrix_dense)
# print(tf_idf_matrix_umap.shape)




In [13]:
# import matplotlib.pyplot as plt

# def plot_songs(song_matrix, title):
#     """ Plots 2d vectors of songs and marks song with an artist label
    
#     Parameters
#     ----------
#     song_matrix : numpy array
#         columns are songs
#         rows a latent topics
#     title : str
#         title for the plot
#     songs_index : dict
#         dictionary of artist and their songs indices in the song_matrix
#     indices_to_remove : list
#         list of songs to not plot
#     """

#     plt.style.use('ggplot')
#     # pulp_indices = songs_index['pulp']
#     # princess_nokia_indices = songs_index['princess_nokia']
#     # at_the_drive_in_indices = songs_index['at_the_drive_in']

#     toxic_i = np.where(y_train == 1)[0]
#     non_toxic_i = np.where(y_train == 0)[0]
    
#     plt.title(title)
#     plt.xlabel("Feature 1")
#     plt.ylabel("Feature 2")

#     toxic = plt.scatter(song_matrix[toxic_i,0], song_matrix[toxic_i,1], marker="x", color="red")
#     non_toxic = plt.scatter(song_matrix[non_toxic_i,0], song_matrix[non_toxic_i,1], marker="o", color="cyan")

#     # at_the_drive_in = plt.scatter(song_matrix[0,at_the_drive_in_indices], song_matrix[1,at_the_drive_in_indices], marker="^", color="black")
    
#     plt.legend((toxic, non_toxic),('Toxic', 'Non-Toxic'))
    
#     plt.show()
    
# plot_songs(tf_idf_matrix_umap, "Songs as 2-D vectors (tf-idf)")


Saving the embedding object directly using the pickle library

In [14]:
# # Save the TfIdf object
# with open('tfidf_object.pkl', 'wb') as f:
#     pickle.dump(TfIdf, f)

In [15]:
# with open('tfidf_object.pkl', 'rb') as f:
#     TfIdf = pickle.load(f)

In [16]:
# with open("data/cleaned_val.npy", 'rb') as f:
#     q_cleaned = np.load(f)

In [18]:
# Load the test data.
# `error_bad_lines=False` is deprecated in pandas and no longer accepted by
# newer releases; `on_bad_lines="skip"` is the replacement with the same
# meaning (malformed rows are dropped instead of raising).
test_df = pd.read_csv("test_2024.csv", quoting=csv.QUOTE_NONE, on_bad_lines="skip")
Text_test = test_df["text"].to_numpy()

# Lemmatize the test texts and embed them with the already-fitted vectorizer.
# (`y_matrix` holds the test feature matrix, not labels; the name is kept
# because the prediction cell reads it.)
y_matrix = TfIdf.vectorize_query(Cl.query_clean(Text_test))





  exec(code_obj, self.user_global_ns, self.user_ns)


In [19]:
# Predict labels for the test feature matrix.
y = lr.predict(y_matrix)

# Write the submission file: a header followed by one "<row index>,<label>" line
# per prediction. The `with` block closes the file, so no explicit close() is needed.
# NOTE(review): ids are positions in the prediction array; if rows were skipped
# while reading the test CSV these will not line up with the original row
# numbers -- confirm against the expected submission format.
with open("testidf.csv", "w") as f:
    f.write("id,label\n")
    for i, l in enumerate(y):
        f.write(f"{i},{l}\n")