Opening

In [63]:
import os
import pandas as pd
import json

# Specify the directory containing the JSON files
folder_path = 'json_per_collection'

# Get a list of all JSON files in the directory
json_files = [pos_json for pos_json in os.listdir(folder_path) if pos_json.endswith('.json')]

# Initialize an empty dataframe
df = pd.DataFrame()

# For each JSON file
for index, js in enumerate(json_files):
    with open(os.path.join(folder_path, js)) as json_file:
        json_text = json.load(json_file)
        
        # Remove the 'embeds' attribute
        if 'embeds' in json_text:
            del json_text['embeds']

        # Convert JSON to dataframe
        json_df = pd.json_normalize(json_text)

        # Concatenate the dataframes
        if index == 0:
            df = json_df
        else:
            df = pd.concat([df, json_df], ignore_index=True)


In [64]:
# Assuming df is your initial dataframe
df = df.set_index('uuid')  # Sets 'uuid' as the index if you want to keep it but not include it in the melting process

# Melt dataframe into two columns
df_melted = df.melt(id_vars='partition_name', var_name='attribute', value_name='value')

df = df_melted 

In [65]:
# Assuming 'nan' is an actual NaN value in the DataFrame, not a string 'nan'.
# If it's a string, use pd.notna(df['value']) instead of df['value'] != 'nan'

# The corrected filtering logic:
df = df[
    (pd.notna(df['value'])) &                # Remove rows with NaN in 'value' column
    (df['value'] != ' ') &                   # Remove rows with empty strings in 'value' column
    (df['attribute'] != 'link') &            # Remove rows with 'link' in 'attribute' column
    (df['attribute'] != 'media') &           # Remove rows with 'media' in 'attribute' column
    (df['attribute'] != 'uuid') & 
    (df['attribute'] != 'embeds') & # Remove rows with 'uuid' in 'attribute' column
    (df['attribute'] != 'text_id')           # Remove rows with 'text_id' in 'attribute' column
]

df.dropna(inplace=True)  # Drop any remaining rows with NaN values in the DataFrame


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)  # Drop any remaining rows with NaN values in the DataFrame


In [67]:
import pandas as pd

# Assuming you have already filtered and processed the DataFrame as shown before.
# For demonstration purposes, let's create a sample DataFrame.

# Function to split the text into chunks of 20 words each.
def split_text_into_chunks(text, chunk_size):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Create a new DataFrame to store the results.
new_rows = []
chunk_size = 20

# Iterate over each row in the DataFrame.
for index, row in df.iterrows():
    # Split the 'value' column into chunks.
    chunks = split_text_into_chunks(row['value'], chunk_size)
    
    # Create new rows with each chunk and the corresponding 'partition_name'.
    for chunk in chunks:
        new_row = {
            'partition_name': row['partition_name'],
            'attribute': row['attribute'],
            'value': chunk
        }
        new_rows.append(new_row)

# Create a new DataFrame from the list of new rows.
df_new = pd.DataFrame(new_rows)

# Save the new DataFrame to a CSV file named "classifier.csv"
df_new.to_csv("classifier.csv", index=False)


In [71]:
display(df[df['partition_name'] == 'people_partition'])

Unnamed: 0,partition_name,attribute,value,embeds
164,people_partition,position,VICE PRESIDENT FOR ACADEMICS (VP FOR ACADEMICS),"[-0.00683437, 0.044967495, -0.035653494, 0.040..."
165,people_partition,position,VICE PRESIDENT FOR ADMINISTRATION (VP FOR ADMI...,"[0.0033901546, 0.005877425, -0.031651184, 0.02..."
166,people_partition,position,VICE PRESIDENT FOR FINANCE (VP FOR FINANCE),"[0.018101703, 0.02787736, -0.02830598, 0.02263..."
167,people_partition,position,VICE PRESIDENT FOR STUDENT WELFARE (VP FOR STU...,"[-0.0012258334, 0.042073455, -0.017293401, 0.0..."
168,people_partition,position,VICE PRESIDENT FOR RELIGIOUS AFFAIRS (VP FOR R...,"[-0.0053628767, 0.03796102, -0.014453557, 0.04..."
...,...,...,...,...
2913,people_partition,name,"Rev. Fr. Rouel M. Sia, OAR","[0.027192142, -0.071821466, -0.03791815, -0.00..."
2914,people_partition,name,"Rev. Fr. Leopoldo V. Estioko, OAR","[0.011869252, -0.048566006, -0.07479137, 0.012..."
2915,people_partition,name,"Rev. Fr. Roy Baluarte, OAR","[0.020502523, -0.050242495, -0.037205607, 0.00..."
2916,people_partition,name,"Rev. Fr. Arian Josef Ocheda, OAR","[0.00058731483, -0.044211593, -0.074408576, -0..."


In [1]:
df

NameError: name 'df' is not defined

In [70]:
import fasttext

# Path to the downloaded pre-trained model file
model_path = '/Users/garfieldgreglim/Library/Mobile Documents/com~apple~CloudDocs/Josenian-Query/Embedder/crawl-300d-2M-subword.bin'

# Load the pre-trained model
model = fasttext.load_model(model_path)

# Function to embed text using the fasttext model
def embed_text(text):
    text = str(text)  # ensures that the text is a string
    text = text.replace('\n', ' ') # replace newline characters with a space
    return model.get_sentence_vector(text)

# Convert 'value' to string, remove newline characters, then apply the embed_text function
df['value'] = df['value'].apply(lambda x: str(x).replace('\n', ' ')) # ensures no new lines exist in the entire dataframe
df['embeds'] = df['value'].apply(embed_text)




In [72]:
df

Unnamed: 0,partition_name,attribute,value,embeds
0,documents_partition,title,Timeless Existence and Principle of Creation: ...,"[0.00012078498, -0.03965601, -0.00351918, 0.00..."
1,documents_partition,title,Ratooning Response of Lowland Rice (Oryza sati...,"[-0.0075855725, -0.026818236, 0.012876515, -0...."
2,documents_partition,title,Paternal Resilience in Time of Pandemic: A Phe...,"[-0.0009952185, -0.055157054, 0.03446741, 0.02..."
3,documents_partition,title,An Inquiry into the Problems Concerning Filipi...,"[-0.00086568197, -0.004538446, 0.035147678, -0..."
4,documents_partition,title,Correlating the Psychological and Spiritual We...,"[-0.014498946, -0.059247393, 0.032215312, 0.02..."
...,...,...,...,...
2913,people_partition,name,"Rev. Fr. Rouel M. Sia, OAR","[0.027192142, -0.071821466, -0.03791815, -0.00..."
2914,people_partition,name,"Rev. Fr. Leopoldo V. Estioko, OAR","[0.011869252, -0.048566006, -0.07479137, 0.012..."
2915,people_partition,name,"Rev. Fr. Roy Baluarte, OAR","[0.020502523, -0.050242495, -0.037205607, 0.00..."
2916,people_partition,name,"Rev. Fr. Arian Josef Ocheda, OAR","[0.00058731483, -0.044211593, -0.074408576, -0..."


In [87]:
import joblib

# save models
joblib.dump(clf_attribute, 'clf_attribute.pkl')
joblib.dump(clf_partition, 'clf_partition.pkl')

# save encoders
joblib.dump(le_attribute, 'le_attribute.pkl')
joblib.dump(le_partition, 'le_partition.pkl')


['le_partition.pkl']

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

# Label encoding for the categorical variables
le_partition = LabelEncoder()
le_attribute = LabelEncoder()
df['partition_encoded'] = le_partition.fit_transform(df['partition_name'])
df['attribute_encoded'] = le_attribute.fit_transform(df['attribute'])

# Model 1: Predict 'attribute'

# split features and labels for 'attribute'
X_attribute = np.stack(df['embeds'].values)  # features
y_attribute = df['attribute_encoded'].values  # label

# split data into train and test sets
X_train_attribute, X_test_attribute, y_train_attribute, y_test_attribute = train_test_split(X_attribute, y_attribute, test_size=0.2, random_state=42)

# train the model
clf_attribute = SVC(random_state=0, probability=True).fit(X_train_attribute, y_train_attribute)

# evaluate the model
y_pred_attribute = clf_attribute.predict(X_test_attribute)
print("Classification report for attribute prediction:")
print(classification_report(y_test_attribute, y_pred_attribute))

# Model 2: Predict 'partition_name'

# split features and labels for 'partition_name'
X_partition = np.stack(df['embeds'].values)  # features
y_partition = df['partition_encoded'].values  # label

# split data into train and test sets
X_train_partition, X_test_partition, y_train_partition, y_test_partition = train_test_split(X_partition, y_partition, test_size=0.2, random_state=42)

# train the model
# train the model with probability=True

# same for the other model
clf_partition = SVC(random_state=0, probability=True).fit(X_train_partition, y_train_partition)


import joblib

# save models
joblib.dump(clf_attribute, 'clf_attribute.pkl')
joblib.dump(clf_partition, 'clf_partition.pkl')

# save encoders
joblib.dump(le_attribute, 'le_attribute.pkl')
joblib.dump(le_partition, 'le_partition.pkl')


# evaluate the model
y_pred_partition = clf_partition.predict(X_test_partition)
print("\nClassification report for partition_name prediction:")
print(classification_report(y_test_partition, y_pred_partition))


def predict_attribute(embeds):
    # transform input to the right format
    X = np.stack([embeds])

    # predict probabilities across all possible labels
    probas = clf_attribute.predict_proba(X)[0]

    # get class labels in descending order of probability
    classes = clf_attribute.classes_
    ranked_classes = [x for _, x in sorted(zip(probas, classes), reverse=True)]

    # return the names instead of the encoded labels
    return le_attribute.inverse_transform(ranked_classes)

def predict_partition(embeds):
    # transform input to the right format
    X = np.stack([embeds])

    # predict probabilities across all possible labels
    probas = clf_partition.predict_proba(X)[0]

    # get class labels in descending order of probability
    classes = clf_partition.classes_
    ranked_classes = [x for _, x in sorted(zip(probas, classes), reverse=True)]

    # return the names instead of the encoded labels
    return le_partition.inverse_transform(ranked_classes)


Classification report for attribute prediction:
              precision    recall  f1-score   support

           0       0.88      0.97      0.93        38
           1       1.00      1.00      1.00        44
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         1
           4       0.99      0.99      0.99       467
           5       0.93      0.93      0.93        30

    accuracy                           0.98       584
   macro avg       0.63      0.65      0.64       584
weighted avg       0.97      0.98      0.97       584



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification report for partition_name prediction:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       434
           1       1.00      0.29      0.45        24
           2       0.82      0.84      0.83       126

    accuracy                           0.92       584
   macro avg       0.92      0.70      0.75       584
weighted avg       0.92      0.92      0.91       584



In [82]:
def predict_attribute(embeds):
    # transform input to the right format
    X = np.stack([embeds])

    # predict probabilities across all possible labels
    probas = clf_attribute.predict_proba(X)[0]

    # get class labels in descending order of probability
    classes = clf_attribute.classes_
    ranked_classes = [x for _, x in sorted(zip(probas, classes), reverse=True)]

    # return the names instead of the encoded labels
    return le_attribute.inverse_transform(ranked_classes)

def predict_partition(embeds):
    # transform input to the right format
    X = np.stack([embeds])

    # predict probabilities across all possible labels
    probas = clf_partition.predict_proba(X)[0]

    # get class labels in descending order of probability
    classes = clf_partition.classes_
    ranked_classes = [x for _, x in sorted(zip(probas, classes), reverse=True)]

    # return the names instead of the encoded labels
    return le_partition.inverse_transform(ranked_classes)


In [83]:
predict_attribute(embed_text("emiliano"))

array(['text', 'author', 'title', 'date', 'name', 'position'],
      dtype=object)

In [86]:
predict_partition(embed_text("emiliano"))

array(['documents_partition', 'social_posts_partition',
       'people_partition'], dtype=object)

In [18]:
# Assuming you have already filtered and processed the DataFrame as shown in the previous code.

# Save the DataFrame to a CSV file named "classifier.csv"
df.to_csv("classifier.csv", index=False)
