In [1]:

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import io
import re
import tempfile
import os
from tqdm import tqdm

import torch

from google.colab import drive
drive.mount('/content/drive')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix


Mounted at /content/drive


In [2]:
export_folder="/content/drive/My Drive/"
output_path=os.path.join(export_folder,"cleaned_without_dictionary.csv")
df=pd.read_csv(output_path, index_col=0)
# df=df[["primary_key","text","label"]]
# df=df[df["label"]!="unlabeled"] # skip unlabled data, future target files
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62643 entries, 1000 to 88842
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   primary_key  62643 non-null  int64 
 1   text         62633 non-null  object
 2   label        62643 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


# data processing

In [3]:
df['text'] = df['text'].fillna("")
# Encode the target labels (may not needed)
# label_encoder = LabelEncoder()
# df['label'] = label_encoder.fit_transform(df['label'])

# Split the data into features (X) and target (y)
X = df['text']
y = df['label']
del df # delete to release memory

# First, split the data into training (70%) and holdout test set (10%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=100,stratify=y)
# Then, split the remaining 30% into validation (20%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1/3, random_state=100,stratify=y_temp)

del X
del y

# pipeline

In [None]:
# Create a TF-IDF Vectorizer and Logistic Regression model within a pipeline
model = make_pipeline(
    TfidfVectorizer(max_features=2000),  # You can adjust the max_features
    LogisticRegression(max_iter=400)

# Perform 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

# Print cross-validation scores
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {cv_scores.mean()}")

Cross-validation scores: [0.74367161 0.73899658 0.73808438 0.73888255 0.73466363]
Mean cross-validation accuracy: 0.7388597491448119


In [None]:
model.fit(X_train, y_train)

In [None]:
# Evaluate the model on the validation set
y_val_pred = model.predict(X_val)
val_score = model.score(X_val, y_val)
print(f"Validation accuracy: {val_score}")

# Print confusion matrix for the validation set
val_cm = classification_report(y_val, y_val_pred)
print("Validation : classification report")
print(val_cm)

Validation accuracy: 0.7424169859514687
Validation :
                              precision    recall  f1-score   support

          $ Licensee Invoice       0.98      0.91      0.94       161
       $ Patent Cost Invoice       0.88      0.95      0.92       183
           $ Royalty Payment       0.97      0.95      0.96       681
      Amendment to Agreement       0.90      0.75      0.82        84
    Assessment Results Table       1.00      0.98      0.99       233
                  Assignment       0.86      0.79      0.82       510
   Confidentiality Agreement       0.98      0.93      0.96       275
                 Declaration       0.82      0.74      0.78       184
           Draft Application       0.71      0.24      0.36        41
              Draft Response       0.00      0.00      0.00        10
              Filing Receipt       0.81      0.78      0.80       802
IDS (Info. Discl. Statement)       0.88      0.85      0.87       318
                 Information       0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# tuning

In [4]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'logisticregression__C': [1,5,10],  # Regularization strength
    'logisticregression__penalty': ['l2'],  # L2 regularization (try 'l1' or 'elasticnet' if supported)
    'logisticregression__solver': ['liblinear', 'saga'],  # Solvers to use
    'logisticregression__max_iter': [200, 500],  # Max iterations to converge
}

model_pipeline = make_pipeline(
    TfidfVectorizer(max_features=3000),
    LogisticRegression()
)
grid_search = GridSearchCV(
    model_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=1, verbose=1
)
# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score from GridSearchCV
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'logisticregression__C': 10, 'logisticregression__max_iter': 500, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'saga'}
Best cross-validation score: 0.7596351197263397


In [5]:
# Get the best model from grid search
best_model = grid_search.best_estimator_

# Evaluate the best model on the validation set
y_val_pred = best_model.predict(X_val)
val_score = best_model.score(X_val, y_val)
print(f"Validation accuracy: {val_score}")

# Print classification report for validation set
# print("Validation Classification Report:")
# print(classification_report(y_val, y_val_pred))

Validation accuracy: 0.7648467432950191


In [6]:
# Evaluate the best model on the validation set
y_test_pred = best_model.predict(X_test)
test_score = best_model.score(X_test, y_test)
print(f"Validation accuracy: {test_score}")

# Print classification report for validation set
print("Validation Classification Report:")
print(classification_report(y_test, y_test_pred))

Validation accuracy: 0.7653631284916201
Validation Classification Report:
                              precision    recall  f1-score   support

          $ Licensee Invoice       0.99      0.95      0.97        80
       $ Patent Cost Invoice       0.94      0.99      0.96        91
           $ Royalty Payment       0.96      0.97      0.97       341
      Amendment to Agreement       0.90      0.83      0.86        42
    Assessment Results Table       1.00      0.99      1.00       116
                  Assignment       0.89      0.88      0.88       255
   Confidentiality Agreement       0.98      0.99      0.98       138
                 Declaration       0.86      0.80      0.83        92
           Draft Application       0.50      0.40      0.44        20
              Draft Response       0.00      0.00      0.00         5
              Filing Receipt       0.81      0.83      0.82       402
IDS (Info. Discl. Statement)       0.89      0.87      0.88       159
               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print( grid_search.best_estimator_)