# Model & Evaluation

## 1. Load Packages and Datasets

In [18]:
# Install extra libraries
#!pip install bs4
#!pip install matplotlib
#!pip install seaborn
#!pip install spacy
#!pip install nltk
#!pip install wordcloud
#!pip install scikit-learn

In [19]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import json
import os
import re
import spacy
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter
from wordcloud import WordCloud
from bs4 import BeautifulSoup
from IPython.display import display, HTML
#from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer



In [20]:
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [21]:
# Load pre-processed train and test set from EDA notebook
train_df = pd.read_json('./../data/preprocess_train.json')
test_df = pd.read_json('./../data/preprocess_test.json')

In [22]:
train_df.head(5)


Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,tokens_processed,tokens_count
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...","[design, thinking, innovation, reflexion, avri...",753
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...","[diego, estrada, design, thinking, assignment,...",563
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...","[reporting, process, gilberto, gamboa, challen...",729
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...","[design, thinking, innovation, sindy, samaca, ...",1071
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...","[assignment, visualization, reflection, submit...",1927


In [23]:
test_df.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,tokens_processed,tokens_count
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[design, thinking, innovation, reflexion, avri...",753
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[diego, estrada, design, thinking, assignment,...",563
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[reporting, process, gilberto, gamboa, challen...",729
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[design, thinking, innovation, sindy, samaca, ...",1071
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[assignment, visualization, reflection, submit...",1927


In [24]:
# Preprocess token labels (BIO format)
#def preprocess_labels(df):
#    df['labels'] = df['labels'].apply(lambda x: [label.split('-')[-1] for label in x])
#    return df

#train_df = preprocess_labels(train_df)

def preprocess_labels(df):
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(df['labels'])
    labels_df = pd.DataFrame(labels, columns=mlb.classes_)
    df = pd.concat([df, labels_df], axis=1)
    return df#.drop(columns=['labels'])
    
    
train_df = preprocess_labels(train_df)
train_df.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,tokens_processed,tokens_count,B-EMAIL,B-ID_NUM,B-NAME_STUDENT,B-PHONE_NUM,B-STREET_ADDRESS,B-URL_PERSONAL,B-USERNAME,I-ID_NUM,I-NAME_STUDENT,I-PHONE_NUM,I-STREET_ADDRESS,I-URL_PERSONAL,O
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...","[design, thinking, innovation, reflexion, avri...",753,0,0,1,0,0,0,0,0,1,0,0,0,1
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...","[diego, estrada, design, thinking, assignment,...",563,0,0,1,0,0,0,0,0,1,0,0,0,1
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...","[reporting, process, gilberto, gamboa, challen...",729,0,0,1,0,0,0,0,0,1,0,0,0,1
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...","[design, thinking, innovation, sindy, samaca, ...",1071,0,0,1,0,0,0,0,0,1,0,0,0,1
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...","[assignment, visualization, reflection, submit...",1927,0,0,1,0,0,0,0,0,1,0,0,0,1


In [25]:
# Extract relevant features (e.g., TF-IDF)
#def extract_features(df):
#    vectorizer = TfidfVectorizer()
#    X = vectorizer.fit_transform(df['full_text'])
#    return X

def extract_features(df):
    # Fill missing values with empty string
    df['full_text'].fillna('', inplace=True)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['full_text'])
    return X


# Convert multi-label problem to multi-class problem
def convert_to_multiclass(df):
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(df['labels'])
    df['class'] = np.argmax(labels, axis=1)  # Assign a class based on the maximum label
    return df

train_df = convert_to_multiclass(train_df)

# Separate features and target variable
X_train = extract_features(train_df)
y_train = train_df['class']  # Target variable is now a single column representing the class


#X_train = extract_features(train_df)
#y_train = train_df['labels']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['full_text'].fillna('', inplace=True)


In [26]:

# Step 5: Model Building

# Train a classifier (Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)