# main.ipynb


In [94]:
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV as LogReg
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix, classification_report, precision_score
import pandas as pd

In [95]:
# Load the data

labelled_data = pd.read_csv('AllBooks_baseline_DTM_Labelled.csv')
labelled_data.rename(columns={'Unnamed: 0': 'label'}, inplace=True)

vocabulary = list(labelled_data.columns.values)

labelled_data.head()



Unnamed: 0,label,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,Buddhism_Ch1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Buddhism_Ch2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Buddhism_Ch3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Buddhism_Ch4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Buddhism_Ch5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
unlabelled_data = pd.read_csv('AllBooks_baseline_DTM_Unlabelled.csv')
labelled_data.rename(columns={'# foolishness': 'foolishness'}, inplace=True)

unlabelled_data.head()


Unnamed: 0,# foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
# # Manual pipeline from scratch, going from text corpus to the final tf-idf transformed data

# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.pipeline import Pipeline
# corpus = ["""0.1 § 1.The Buddha: "What do you think, Rahula: What is a mirror for?"The Buddha:Rahula: "For reflection, sir."Rahula:The Buddha: "In the same way, Rahula, bodily acts, verbal acts, & mental acts are to be done with repeated reflection.The Buddha:"Whenever you want to perform a bodily act, you should reflect on it: 'This bodily act I want to perform — would it lead to self-affliction, to the affliction of others, or to both? Is it an unskillful bodily act, with painful consequences, painful results?' If, on reflection, you know that it would lead to self-affliction, to the affliction of others, or to both; it would be an unskillful bodily act with painful consequences, painful results, then any bodily act of that sort is absolutely unfit for you to do. But if on reflection you know that it would not cause affliction... it would be a skillful bodily act with happy consequences, happy results, then any bodily act of that sort is fit for you to do.(Similarly with verbal acts & mental acts.)"While you are performing a bodily act, you should reflect on it: 'This bodily act I am doing — is it leading to self-affliction, to the affliction of others, or to both? Is it an unskillful bodily act, with painful consequences, painful results?' If, on reflection, you know that it is leading to self-affliction, to affliction of others, or both... you should give it up. But if on reflection you know that it is not... you may continue with it.(Similarly with verbal acts & mental acts.)"Having performed a bodily act, you should reflect on it... If, on reflection, you know that it led to self-affliction, to the affliction of others, or to both; it was an unskillful bodily act with painful consequences, painful results, then you should confess it, reveal it, lay it open to the Teacher or to a knowledgeable companion in the holy life. Having confessed it... you should exercise restraint in the future. But if on reflection you know that it did not lead to affliction... it was a skillful bodily act with happy consequences, happy results, then you should stay mentally refreshed & joyful, training day & night in skillful mental qualities.(Similarly with verbal acts.)"Having performed a mental act, you should reflect on it... If, on reflection, you know that it led to self-affliction, to the affliction of others, or to both; it was an unskillful mental act with painful consequences, painful results, then you should feel horrified, humiliated, & disgusted with it. Feeling horrified... you should exercise restraint in the future. But if on reflection you know that it did not lead to affliction... it was a skillful mental act with happy consequences, happy results, then you should stay mentally refreshed & joyful, training day & night in skillful mental qualities."Rahula, all the brahmans & contemplatives in the course of the past who purified their bodily acts, verbal acts, & mental acts, did it through repeated reflection on their bodily acts, verbal acts, & mental acts in just this way."All the brahmans & contemplatives in the course of the future... All the brahmans & contemplatives at present who purify their bodily acts, verbal acts, & mental acts, do it through repeated reflection on their bodily acts, verbal acts, & mental acts in just this way."And so, Rahula, you should train yourself: 'I will purify my bodily acts through repeated reflection. I will purify my verbal acts through repeated reflection. I will purify my mental acts through repeated reflection.' That's how you should train yourself."That is what the Blessed One said. Gratified, Ven. Rahula delighted in the Blessed One's words."""]

# # words = labelled_data
# # vocabulary = words.split()
# vocabulary = vocabulary
# pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
#                  ('tfid', TfidfTransformer())
#                 #  ,('model', MultinomialNB())
#                 ]).fit(corpus)
# pipe['count'].transform(corpus).toarray()
# # pipe['tfid'].idf_
# # pipe.transform(corpus).shape
# # # Checking output
# # l = pipe['count'].transform(corpus).toarray()
# # print(type(l))
# # print(l[0,:20])
# # pipe['model'].predict(pipe['count'].transform(corpus))
# # pipe['model'].score(pipe['count'].transform(corpus), [0])

In [98]:
# The below is for using the already vectorized BoW (bag of words) data
# and doing a TF-IDF transform on it and then feeding it into the model

from sklearn.preprocessing import LabelEncoder

# Assuming 'labelled_data' is your original dataframe
# 'labels' column is the one with text labels like "Buddhism_Ch1"

# Step 1: Separate labels from features
labels = labelled_data['label'].values  # This creates a separate array of labels
labelled_data = labelled_data.drop('label', axis=1)  # This drops the label column from the dataframe

# Step 2: Encode labels if necessary
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)  # This encodes text labels to numbers

# Finally, view output
labelled_data

Unnamed: 0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
labelled_data.dtypes

foolishness    int64
hath           int64
wholesome      int64
takest         int64
feelings       int64
               ...  
visual         int64
thoughts       int64
illumines      int64
attire         int64
explains       int64
Length: 8266, dtype: object

In [102]:
X_train = labelled_data
y_train = encoded_labels
y_train

array([112, 123, 134, 145, 153, 154, 155, 156, 157, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       146, 147, 148, 149, 150, 151, 152, 158, 169, 180, 191, 202, 213,
       224, 235, 238, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197,
       198, 199, 200, 201, 203, 204, 205, 206, 207, 208, 209, 210, 211,
       212, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 225, 226,
       227, 228, 229, 230, 231, 232, 233, 234, 236, 237, 239, 313, 324,
       335, 346, 357, 368, 379, 390, 240, 251, 262, 273, 284, 295, 306,
       310, 311, 312, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
       325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 336, 337, 338,
       339, 340, 341, 342, 343, 344, 345, 347, 348, 349, 350, 35

In [100]:
# from sklearn.model_selection import train_test_split

# # Could be a problem

# # Train test split
# X_train, X_test, y_train, y_test = train_test_split(
#     labelled_data, encoded_labels, test_size=0.0, random_state=42  # For example, 20% data as test
# )

# # encoded_labels.dtype

InvalidParameterError: The 'test_size' parameter of train_test_split must be a float in the range (0.0, 1.0), an int in the range [1, inf) or None. Got 0.0 instead.

In [None]:
X_train = labelled_data
y_train = encoded_labels
y_train

In [None]:
# encode

In [None]:
# Checking output of train test split
X_train


Unnamed: 0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
67,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
310,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
168,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
548,0,11,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
501,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
X_test


Unnamed: 0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
522,0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
284,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
514,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
331,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
194,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
493,1,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
y_train

array([172, 505, 338,  20,  78, 365, 320, 514, 525,  28, 555, 211,  13,
       179, 426,  41, 229, 576, 484, 489, 381,  91,  10, 529,  40,  50,
        14, 142, 273, 270, 457,  71, 163, 220, 551,  55, 280,  62, 579,
       318, 285,  85,   0, 144,  74, 501, 371, 477, 314,  22, 315, 423,
       521, 582, 233,  64, 204, 310, 255, 565, 205, 520, 299, 535, 485,
       232, 354, 375, 519, 263, 421, 409, 486, 414,  53, 578, 236,   1,
       398, 540, 235, 394, 324,   5, 584, 453,  77,  93, 493, 405,  45,
       546,  92, 495, 419, 221, 350, 425, 467, 455, 574, 509, 319, 500,
       248, 490, 431, 333, 512, 240, 370, 491, 276, 247, 225, 230, 482,
       577, 586, 399,  30, 463, 545, 239,  94, 305,  16, 150, 216,  66,
       450, 368,  87, 151, 339, 170, 303, 192, 258, 358,  32, 328, 116,
       141, 133, 340, 306, 252, 471,  88, 396,   6, 203, 213, 256,  60,
       550,  80, 349,  61, 148,  17,  57, 380, 407, 302, 558, 412, 271,
       392, 153, 274,  27,  33, 449, 208, 241, 469,   9,  35, 38

In [None]:
y_test


array([ 11, 304,  58, 528, 384, 197, 413, 352,  18, 251,  19, 175, 411,
       249, 442, 323, 383, 109,  82, 353, 300, 242, 134,  36, 372, 167,
       159,  99, 307, 135, 560, 246, 100, 266, 188, 516, 301, 347,  44,
        25, 265, 385, 348, 114, 447,  46, 155,  83, 549, 297, 312,  24,
       177, 474, 267, 443, 209, 193, 583, 511, 182,  49, 466, 136, 178,
       332, 562, 440, 106, 146, 427, 382,  29, 187, 556,  79, 164, 184,
       243, 293, 393, 115, 110, 189, 539, 102, 185, 470, 219, 294, 337,
         8, 418, 282, 212, 279, 334, 284,  31, 113, 316, 553, 460, 401,
       108, 410, 190, 327, 377,  52, 575, 444, 228,  23, 218, 561, 538,
       530, 346, 321, 183, 459, 272, 547, 355, 367, 357, 564, 181, 199,
       308, 173, 119,   2, 250, 195,  89, 227, 572, 408, 139, 112,  97,
       513, 533, 563, 468, 264, 127,  12, 226, 196, 277,  48, 122, 364,
       506, 311, 298, 287,   3, 548, 552,  84, 111, 403, 238,  47, 158,
       200, 544, 217, 532, 103, 487, 441, 479, 570, 402, 585, 26

In [None]:
# Trying to skip the CountVectorizer step and directly feed the data into the TfidfTransformer
# since the spreadsheet already has the word counts, so
# effectively the spreadsheet is already the output of the CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

# Ensure the data is all numeric / float
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
X_test_tfidf = tfidf_transformer.transform(X_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB
# Other classifiers for potential future use to swap in and out
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Assuming 'X' is your frequency matrix and 'y' is your target array
model = MultinomialNB()
# model = RandomForestClassifier()
# model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
# print("Model accuracy:", model.score(X_test_tfidf, y_test))  # Evaluating the model


Model accuracy: 0.0


In [None]:
# Debugging why the model is not working and accuracy is 0.0

# View some of the TF-IDF features
print(X_train_tfidf[0])



  (0, 7318)	0.09386545925999627
  (0, 7246)	0.09078355671047175
  (0, 7245)	0.14368658169465476
  (0, 7071)	0.2873731633893095
  (0, 6873)	0.14368658169465476
  (0, 6861)	0.13397208557195053
  (0, 6289)	0.14368658169465476
  (0, 6081)	0.12173326787292207
  (0, 6012)	0.07074318699727335
  (0, 5741)	0.14368658169465476
  (0, 5734)	0.14368658169465476
  (0, 5342)	0.12707954088310194
  (0, 5009)	0.18773091851999255
  (0, 4991)	0.14368658169465476
  (0, 4959)	0.14368658169465476
  (0, 4842)	0.14368658169465476
  (0, 4794)	0.11367176474308324
  (0, 4754)	0.12173326787292207
  (0, 4488)	0.1321828467032009
  (0, 4246)	0.13397208557195053
  (0, 4237)	0.07961693385837598
  (0, 4207)	0.11736504476039769
  (0, 4115)	0.17949623385186367
  (0, 3984)	0.11736504476039769
  (0, 3968)	0.11736504476039769
  (0, 3932)	0.07251873215209036
  (0, 3683)	0.095411730938665
  (0, 3556)	0.13397208557195053
  (0, 3365)	0.12173326787292207
  (0, 3149)	0.10075800394884486
  (0, 3057)	0.20151600789768973
  (0, 3023)	

In [None]:
import numpy as np
rng = np.random.RandomState(1)
X = rng.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
print(clf.predict(X[2:3]))

[3]
