In [1]:
## loading original data extraction into a data frame for embedding
import pandas as pd
# One parquet extract per product category; read each and stack them into a
# single frame with a fresh 0..n-1 index.
parquet_files = ['book.parquet', 'home.parquet', 'personal_care.parquet']
dataframes = list(map(pd.read_parquet, parquet_files))
df = pd.concat(dataframes, ignore_index=True)

In [39]:
## labeling the response column with numerical value
from sklearn.preprocessing import LabelEncoder
# Learn the category -> integer mapping first, then apply it to build the
# numeric 'response' target column.
label_encoder = LabelEncoder().fit(df['main_category'])
df['response'] = label_encoder.transform(df['main_category'])


In [3]:
## removing punctuation and all numbers from the title column
import re
import string
# Translation table that deletes every ASCII punctuation character; built once
# here so it is not rebuilt on every call when the function is applied row by row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation; other characters, including
        whitespace and digits, are left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)
def remove_num(text):
    """Strip punctuation from ``text`` and then delete every run of digits."""
    without_punct = remove_punctuation(text)
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', without_punct)

#df['title']= df['title'].apply(remove_num)

In [8]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load the small English spaCy pipeline once; lemmatize_text reuses it on every call
nlp = spacy.load('en_core_web_sm')


def lemmatize_text(text):
    """Return the lemmas of the non-stop-word tokens in ``text``, joined by single spaces."""
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)



In [15]:
## converting using TF-IDF
#from sklearn.feature_extraction.text import TfidfVectorizer
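#tdif=TfidfVectorizer(stop_words='english', preprocessor=lemmatize_text)  ## The stop_words list can be customized as well. The default stop_words='english' list is very minimal, so stop words are also removed with spaCy inside lemmatize_text. NOTE: passing a custom preprocessor overrides TfidfVectorizer's built-in lowercasing, which is why the vocabulary printed below is case-sensitive.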
#X=tdif.fit_transform(df['title'])


In [27]:
## saving the vectorizer and the resulting TF-IDF matrix to a compressed file for future use.
#from scipy.sparse import save_npz, load_npz
#from sklearn.feature_extraction.text import TfidfVectorizer
#import joblib

#joblib.dump((tdif,X), 'tfidf_vectorizer.pkl.gz',compress=('gzip',3))



['tfidf_vectorizer.pkl.gz']

In [21]:
#print(tdif.get_feature_names_out()[101:150])

['AASLH' 'AASMComfort' 'AASVOG' 'AAT' 'AATBS' 'AATKRMFFV' 'AATMANA'
 'AATQRFV' 'AATwengeDesk' 'AAU' 'AAUS' 'AAUSB' 'AAUU' 'AAUW' 'AAVBO'
 'AAVIX' 'AAVMDFBS' 'AAVNI' 'AAVRANI' 'AAVandegriftUSMC' 'AAW' 'AAWD'
 'AAWESGS' 'AAXplosion' 'AAYU' 'AAZZEUSAM' 'AAZZKANG' 'AAbcalet'
 'AAkatsuki' 'AAkron' 'AAlmond' 'AAmbi' 'AAmerica' 'AAndrea' 'AAngel'
 'AAobosi' 'AArbutin' 'AArm' 'AArt' 'AAsXXX' 'AAtlonia' 'AAtter' 'AAugust'
 'AB' 'ABA' 'ABAAARP' 'ABABA' 'ABACAD' 'ABACUS']


In [28]:
## Loading the vectorizer from disk 
import joblib
from scipy.sparse import load_npz

# joblib.load unpickles the file, so only load archives you created or trust.
# The archive holds a (vectorizer, matrix) tuple; presumably the one written by the
# commented-out joblib.dump cell. If that vectorizer was built with
# preprocessor=lemmatize_text, lemmatize_text must already be defined in this
# session for unpickling and transform to work (TODO confirm).
v,X=joblib.load('tfidf_vectorizer.pkl.gz')

# Smoke test: transform unseen text with the loaded vectorizer
new_texts = ["New data to transform."]
new_tfidf_matrix = v.transform(new_texts)


In [41]:
# Report the shape of X, the matrix loaded alongside the vectorizer
print("matrix shape ", X.shape)

matrix shape  (6000000, 813248)


Training logistic regression with one variable, 'title': X is the TF-IDF matrix computed from the title column.

In [34]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    df['response'],
    train_size=0.8,
    random_state=123,
)

In [31]:
from sklearn.linear_model import LogisticRegressionCV
# Fit a 5-fold cross-validated logistic regression on the training features.
# `multi_class` is intentionally left at its default: the argument is deprecated
# since scikit-learn 1.5, and with the default lbfgs solver a target with more
# than two classes is already fitted as a multinomial model, so the result is
# the same as with multi_class='multinomial'.
logmod = LogisticRegressionCV(cv=5).fit(x_train, y_train)

In [35]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Evaluate on the training split: predict, then report accuracy followed by
# the confusion matrix.
ytrain_pred = logmod.predict(x_train)
print('train accuracy', accuracy_score(y_train, ytrain_pred))
log_train_matrix = confusion_matrix(y_train, ytrain_pred)
print(log_train_matrix)

train accuracy 0.9730125
[[1575024   13430   11380]
 [  30709  746445   22543]
 [  28863   22615 2348991]]


In [37]:
# Per-class precision, recall and F1 on the training split
print(classification_report(y_train,ytrain_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97   1599834
           1       0.95      0.93      0.94    799697
           2       0.99      0.98      0.98   2400469

    accuracy                           0.97   4800000
   macro avg       0.97      0.97      0.97   4800000
weighted avg       0.97      0.97      0.97   4800000



In [38]:
# Evaluate on the held-out split: accuracy, confusion matrix, then the
# per-class report.
ytest_pred = logmod.predict(x_test)
print("test accuracy", accuracy_score(y_test, ytest_pred))
log_test_matrix = confusion_matrix(y_test, ytest_pred)
print(log_test_matrix)
print(classification_report(y_test, ytest_pred))

test accuracy 0.972255
[[393802   3382   2982]
 [  8032 186479   5792]
 [  7296   5810 586425]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97    400166
           1       0.95      0.93      0.94    200303
           2       0.99      0.98      0.98    599531

    accuracy                           0.97   1200000
   macro avg       0.97      0.96      0.97   1200000
weighted avg       0.97      0.97      0.97   1200000

