# main.ipynb


In [37]:
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV as LogReg
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix, classification_report, precision_score
import pandas as pd

In [38]:
# Load the labelled document-term matrix: going by the displayed preview, each
# row is a chapter and each remaining column is a word count.

labelled_data = pd.read_csv('AllBooks_baseline_DTM_Labelled.csv')
# The chapter names are read in under the placeholder column name 'Unnamed: 0';
# give that column a meaningful name. A new frame is bound instead of using
# inplace=True so the statement reads as a plain assignment.
labelled_data = labelled_data.rename(columns={'Unnamed: 0': 'label'})

# Word columns only: 'label' holds chapter names and is not a vocabulary term,
# so it is excluded (it was previously included in this list).
vocabulary = [column for column in labelled_data.columns if column != 'label']

labelled_data.head()



Unnamed: 0,label,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,Buddhism_Ch1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Buddhism_Ch2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Buddhism_Ch3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Buddhism_Ch4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Buddhism_Ch5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# unlabelled_data = pd.read_csv('AllBooks_baseline_DTM_Unlabelled.csv')
unlabelled_data = pd.read_csv('AllBooks_baseline_DTM_Unlabelled_duplicate.csv')
# The stray '# foolishness' header is in THIS frame (labelled_data already has a
# clean 'foolishness' column), so the rename has to target unlabelled_data.
# It was previously applied to labelled_data, where it matched nothing.
unlabelled_data = unlabelled_data.rename(columns={'# foolishness': 'foolishness'})

unlabelled_data.head()


Unnamed: 0,# foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Prepare the already-vectorized bag-of-words table for modelling: pull the
# targets out of the frame and turn them into integer codes. The TF-IDF
# transform and the model fit happen in later cells.

from sklearn.preprocessing import LabelEncoder

# Targets: the text in the 'label' column (values such as "Buddhism_Ch1").
labels = labelled_data['label'].values

# Features: every column except 'label'. This rebinds `labelled_data`, so the
# cell needs the CSV reloaded before it can be run a second time.
labelled_data = labelled_data.drop(columns='label')

# Fit the encoder on the label strings, then map them to integer codes. The
# fitted encoder is kept so integer predictions can be mapped back to names.
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)

# Show the feature frame that remains
labelled_data

Unnamed: 0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Show the dtype of every remaining feature column (the 'label' column was
# dropped in the previous cell).
labelled_data.dtypes

foolishness    int64
hath           int64
wholesome      int64
takest         int64
feelings       int64
               ...  
visual         int64
thoughts       int64
illumines      int64
attire         int64
explains       int64
Length: 8266, dtype: object

In [42]:
# Use the full labelled set for training: features are the word-count frame,
# targets are the integer-encoded labels.
X_train, y_train = labelled_data, encoded_labels

# Display the encoded targets
y_train

array([112, 123, 134, 145, 153, 154, 155, 156, 157, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       146, 147, 148, 149, 150, 151, 152, 158, 169, 180, 191, 202, 213,
       224, 235, 238, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197,
       198, 199, 200, 201, 203, 204, 205, 206, 207, 208, 209, 210, 211,
       212, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 225, 226,
       227, 228, 229, 230, 231, 232, 233, 234, 236, 237, 239, 313, 324,
       335, 346, 357, 368, 379, 390, 240, 251, 262, 273, 284, 295, 306,
       310, 311, 312, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
       325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 336, 337, 338,
       339, 340, 341, 342, 343, 344, 345, 347, 348, 349, 350, 35

In [43]:
# from sklearn.model_selection import train_test_split

# # Could be a problem

# # Train test split
# X_train, X_test, y_train, y_test = train_test_split(
#     labelled_data, encoded_labels, test_size=0.5, random_state=42  # test_size=0.5 holds out 50% of the data as test
# )

# # encoded_labels.dtype

In [44]:
# With the train/test split above commented out, train on everything: the
# word-count frame is the feature matrix and the encoded labels are the target.
# (This repeats the assignment made two cells earlier.)
X_train, y_train = labelled_data, encoded_labels

# Display the encoded targets
y_train

array([112, 123, 134, 145, 153, 154, 155, 156, 157, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       146, 147, 148, 149, 150, 151, 152, 158, 169, 180, 191, 202, 213,
       224, 235, 238, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197,
       198, 199, 200, 201, 203, 204, 205, 206, 207, 208, 209, 210, 211,
       212, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 225, 226,
       227, 228, 229, 230, 231, 232, 233, 234, 236, 237, 239, 313, 324,
       335, 346, 357, 368, 379, 390, 240, 251, 262, 273, 284, 295, 306,
       310, 311, 312, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
       325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 336, 337, 338,
       339, 340, 341, 342, 343, 344, 345, 347, 348, 349, 350, 35

In [45]:
# encode

In [46]:
# Inspect the training features. The train/test split cell is commented out,
# so X_train here is the full labelled feature frame, not a split subset.
X_train


Unnamed: 0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# X_test
# X_test.dtypes

In [48]:
# Inspect the training targets: the integer codes produced by label_encoder.
y_train

array([112, 123, 134, 145, 153, 154, 155, 156, 157, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       146, 147, 148, 149, 150, 151, 152, 158, 169, 180, 191, 202, 213,
       224, 235, 238, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197,
       198, 199, 200, 201, 203, 204, 205, 206, 207, 208, 209, 210, 211,
       212, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 225, 226,
       227, 228, 229, 230, 231, 232, 233, 234, 236, 237, 239, 313, 324,
       335, 346, 357, 368, 379, 390, 240, 251, 262, 273, 284, 295, 306,
       310, 311, 312, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
       325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 336, 337, 338,
       339, 340, 341, 342, 343, 344, 345, 347, 348, 349, 350, 35

In [49]:
# y_test


In [50]:
# Skip the CountVectorizer step: the spreadsheet already holds the per-document
# word counts, i.e. it is effectively CountVectorizer's output, so the counts
# can be fed straight into TfidfTransformer.

from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix

# Ensure the data is all numeric / float
X_train = X_train.astype(np.float64)
# X_test = X_test.astype(np.float64)

# CSR copy of the count matrix for the transformer.
X_train_sparse = csr_matrix(X_train.values)

# Fit the IDF weights on the training counts and transform them in one step.
# The CSR matrix built above is what gets fitted: it was previously created
# but never used, with the DataFrame passed in instead.
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_sparse)


# X_test_tfidf = tfidf_transformer.transform(X_test)

# X_train_tfidf.dtypes

In [51]:
# X_train_tfidf.head()

# Display the float-cast count frame. Note this is X_train (the counts), not
# the TF-IDF output X_train_tfidf.
X_train

Unnamed: 0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
from sklearn.naive_bayes import MultinomialNB
# Other classifiers for potential future use to swap in and out
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Assuming 'X' is your frequency matrix and 'y' is your target array
model = MultinomialNB()
# model.fit(X_train_tfidf, y_train)
model.fit(X_train, y_train)
# print("Model accuracy:", model.score(X_test_tfidf, y_test))  # Evaluating the model


In [53]:
# Ensure unlabelled_data has correct data types (float) and correct any label issues.
unlabelled_data = unlabelled_data.astype(np.float64)

unlabelled_data_sparse = csr_matrix(unlabelled_data.values)

# Apply the same TF-IDF transformation on unlabelled data

# Not sure if True or False below in use_idf is correct
transformer = TfidfTransformer(use_idf=False)
unlabelled_data_tfidf = transformer.transform(X=unlabelled_data_sparse)

# Predict using the trained model
unlabelled_predictions = model.predict(unlabelled_data_tfidf)

# Optionally, you can transform the numerical predictions back to label names using the inverse transform of LabelEncoder
predicted_labels = label_encoder.inverse_transform(unlabelled_predictions)




In [54]:
predicted_labels

# I wonder if the below is right? Wondering why it is all in order
# NOTE(review): a likely explanation - the file loaded as unlabelled data is
# named '..._duplicate.csv', and every chapter name (e.g. "Buddhism_Ch1") is
# encoded as its own class. If that file holds the same rows as the labelled
# data in the same order, the model is mostly returning each row's own chapter,
# which would look "in order". TODO confirm against the CSVs; if the goal is to
# predict the book, the labels probably need collapsing to book level first.

array(['Buddhism_Ch1', 'Buddhism_Ch3', 'Buddhism_Ch4', 'Buddhism_Ch5',
       'Buddhism_Ch5', 'Buddhism_Ch6', 'Buddhism_Ch7', 'Buddhism_Ch8',
       'Buddhism_Ch9', 'Buddhism_Ch10', 'Buddhism_Ch11', 'Buddhism_Ch12',
       'Buddhism_Ch21', 'BookOfEccleasiasticus_Ch1', 'Buddhism_Ch15',
       'Buddhism_Ch16', 'Buddhism_Ch17', 'Buddhism_Ch18', 'Buddhism_Ch19',
       'Buddhism_Ch20', 'Buddhism_Ch21', 'Buddhism_Ch22', 'Buddhism_Ch23',
       'Buddhism_Ch24', 'Buddhism_Ch25', 'Buddhism_Ch26', 'Buddhism_Ch27',
       'Buddhism_Ch28', 'Buddhism_Ch29', 'Buddhism_Ch30', 'Buddhism_Ch31',
       'Buddhism_Ch32', 'Buddhism_Ch33', 'Buddhism_Ch34', 'Buddhism_Ch35',
       'Buddhism_Ch36', 'Buddhism_Ch37', 'Buddhism_Ch38', 'Buddhism_Ch39',
       'Buddhism_Ch40', 'Buddhism_Ch41', 'Buddhism_Ch42', 'Buddhism_Ch43',
       'Buddhism_Ch44', 'Buddhism_Ch45', 'Buddhism_Ch46',
       'TaoTeChing_Ch1', 'TaoTeChing_Ch2', 'TaoTeChing_Ch3',
       'TaoTeChing_Ch4', 'TaoTeChing_Ch5', 'TaoTeChing_Ch6',
       'T

Notes on Presentation:
- Data
- Features
- Models
- Optimize parameters of models
- Results
- Validation
- Comparison and conclusion of models
- Contributions

In [55]:
# Toy sanity check of MultinomialNB on random count data:
# six samples, 100 features, one distinct class per sample.
import numpy as np
from sklearn.naive_bayes import MultinomialNB

rng = np.random.RandomState(1)          # fixed seed so the demo is repeatable
X = rng.randint(5, size=(6, 100))       # integer "counts" in [0, 5)
y = np.arange(1, 7)                     # class ids 1..6, one per row of X
clf = MultinomialNB().fit(X, y)

# Predict the class of the third training row (kept 2-D via the slice).
print(clf.predict(X[2:3]))


# Cross validation tests

# dirichlet MultinomialNB

# multinomial regression

# clustering??

# classification problem

# skewed data, take the log of the data


[3]
