In [None]:
import zipfile
import os

# Source archive and the folder its contents are unpacked into.
zip_path = "/content/archive (1).zip"   # change if needed
extract_path = "/content/legalclausedataset"

# Unpack every member of the archive under extract_path
# (read mode is ZipFile's default, so it is not spelled out).
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_path)

print("‚úÖ Files extracted to:", extract_path)


‚úÖ Files extracted to: /content/legalclausedataset


In [None]:
# List some folders/files
import os

# os.walk yields the top-level directory first; take only that entry.
# The None default keeps this silent when the folder does not exist.
top_level = next(os.walk(extract_path), None)
if top_level is not None:
    root, dirs, files = top_level
    print(f"üìÅ {root}, {len(files)} files")


üìÅ /content/legalclausedataset, 395 files


In [None]:
import pandas as pd
import glob

# Read all CSVs.
# glob returns paths in arbitrary, filesystem-dependent order. Sorting makes the
# row order -- and therefore which clause_type a duplicated text keeps after
# drop_duplicates below -- the same on every run.
csv_files = sorted(glob.glob(os.path.join(extract_path, "*.csv")))
if not csv_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {extract_path!r}")

# One frame per file; the file name without its extension is the clause label.
dataframes = [
    pd.read_csv(file).assign(
        clause_type=os.path.splitext(os.path.basename(file))[0]
    )
    for file in csv_files
]

# Combine into one DataFrame
clauses_df = pd.concat(dataframes, ignore_index=True)

# Clean text. Missing values are dropped *before* the string cast: astype(str)
# would turn NaN into the literal text "nan" and keep it as if it were a clause.
clauses_df = (
    clauses_df
    .rename(columns={'clause_text': 'text'})
    .dropna(subset=['text'])
    .assign(text=lambda frame: frame['text'].astype(str).str.strip())
    .drop_duplicates(subset=['text'])
)

print("‚úÖ Combined shape:", clauses_df.shape)
clauses_df.head()

‚úÖ Combined shape: (150545, 2)


Unnamed: 0,text,clause_type
0,Exceptions. Any other provision herein to the ...,exceptions
1,Exceptions. (a) This Clause 28 shall not apply...,exceptions
2,Exceptions. (a) Clause 14.1 (Increased costs) ...,exceptions
3,Exceptions. (a) An amendment or waiver that ha...,exceptions
4,Exceptions. Recipient will not have an obligat...,exceptions


In [None]:
# Write the combined frame to disk; index=False keeps the row index out of the file.
clauses_df.to_csv(path_or_buf="/content/all_clauses.csv", index=False)
print("üíæ Saved as all_clauses.csv")


üíæ Saved as all_clauses.csv


In [None]:
# Reload the combined clauses from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="/content/all_clauses.csv")
df.head(n=5)

Unnamed: 0,text,clause_type
0,Exceptions. Any other provision herein to the ...,exceptions
1,Exceptions. (a) This Clause 28 shall not apply...,exceptions
2,Exceptions. (a) Clause 14.1 (Increased costs) ...,exceptions
3,Exceptions. (a) An amendment or waiver that ha...,exceptions
4,Exceptions. Recipient will not have an obligat...,exceptions


In [None]:
import pandas as pd

# View structure.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df.info()
print(df.head())

# Check for missing text and count the distinct labels
print("Missing text:", df['text'].isna().sum())
print("Unique clause types:", df['clause_type'].nunique())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150545 entries, 0 to 150544
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   text         150545 non-null  object
 1   clause_type  150545 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB
None
                                                text clause_type
0  Exceptions. Any other provision herein to the ...  exceptions
1  Exceptions. (a) This Clause 28 shall not apply...  exceptions
2  Exceptions. (a) Clause 14.1 (Increased costs) ...  exceptions
3  Exceptions. (a) An amendment or waiver that ha...  exceptions
4  Exceptions. Recipient will not have an obligat...  exceptions
Missing text: 0
Unique clause types: 394


In [None]:
# Row-level cleaning as a single chain:
#   1. drop rows whose text is missing,
#   2. keep only texts longer than 10 characters once stripped,
#   3. drop exact duplicate texts and renumber the index.
df = (
    df.dropna(subset=['text'])
      .loc[lambda frame: frame['text'].str.strip().str.len() > 10]
      .drop_duplicates(subset=['text'])
      .reset_index(drop=True)
)


In [None]:
import re

def clean_clause(text):
    """Normalize typographic punctuation and whitespace in one clause.

    Args:
        text: The clause text. Any value is accepted; it is converted with
            ``str()`` first.

    Returns:
        The text with curly double quotes replaced by ``"``, curly single
        quotes/apostrophes replaced by ``'``, en dashes replaced by ``-``,
        every run of whitespace collapsed to a single space, and leading and
        trailing whitespace removed.
    """
    text = str(text)
    # The special characters are written as \u escapes rather than literal
    # glyphs so the patterns survive any re-encoding of the notebook file.
    # A mis-encoded character class matches unrelated letters one by one and
    # rewrites them, while leaving the real curly quotes untouched.
    text = re.sub("[\u201c\u201d]", '"', text)   # curly double quotes
    text = re.sub("[\u2018\u2019]", "'", text)   # curly single quotes / apostrophes
    text = re.sub("\u2013", "-", text)           # en dash
    # \s is Unicode-aware for str patterns, so this also covers the
    # non-breaking space (\xa0); no separate pass is needed for it.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Run the cleaner over every clause and keep the result in a new column.
df['clean_text'] = df['text'].map(clean_clause)


In [None]:
from langdetect import DetectorFactory, detect
from tqdm import tqdm

# langdetect's algorithm is non-deterministic unless a seed is set before the
# first detection. Fixing it makes the set of rows kept below reproducible.
DetectorFactory.seed = 0

def safe_detect(text):
    """Return the language code langdetect assigns to ``text``.

    Args:
        text: The text to classify.

    Returns:
        The detected language code, or the string ``"unknown"`` when
        detection raises an error.
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort: any detection failure maps to "unknown".
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt through, so this long loop can be stopped.
        return "unknown"

tqdm.pandas()  # registers progress_apply on pandas objects
df['lang'] = df['clean_text'].progress_apply(safe_detect)
# Keep only the clauses detected as English.
df = df[df['lang'] == 'en']


100%|██████████| 150545/150545 [06:33<00:00, 382.88it/s]


In [None]:
# Number of clauses per label, most frequent first.
label_counts = df.loc[:, 'clause_type'].value_counts()
print(label_counts)


clause_type
time-of-essence                           630
time-of-the-essence                       620
capitalized-terms                         590
definitions-and-interpretation            590
captions                                  580
                                         ... 
tax-returns                               138
trustee-may-file-proofs-of-claim          131
fees_royalties                            123
marketing                                  48
standard-terms-and-conditions-of-trust     15
Name: count, Length: 394, dtype: int64


In [None]:
# Labels with fewer than 30 clauses are removed from the data.
rare_labels = label_counts.index[label_counts < 30]
is_rare = df['clause_type'].isin(rare_labels)
df = df.loc[~is_rare]


In [None]:
print("Final shape:", df.shape)
# Seeded so the preview shows the same rows on every run.
print(df.sample(5, random_state=42))

# Check clause length distribution (length = number of whitespace-separated words).
# df is a boolean-filtered slice of an earlier frame at this point; adding the
# column by item assignment on such a slice can raise pandas'
# SettingWithCopyWarning, so assign() is used to build a new frame instead.
df = df.assign(length=df['clean_text'].apply(lambda x: len(x.split())))
print(df['length'].describe())


Final shape: (150180, 4)
                                                     text  \
15257   W I T N E S S E T H   WHEREAS the Issuers have...   
58218   Title. Such Stockholder is the sole record or ...   
27184   Employee Benefit Plans. No ERISA Event has occ...   
111333  Absence of Certain Changes. Since December 31,...   
67080   Base Salary. The Company shall pay or cause to...   

                        clause_type  \
15257   w-i-t-n-e-s-s-e-t-h-whereas   
58218                         title   
27184        employee-benefit-plans   
111333   absence-of-certain-changes   
67080                   base-salary   

                                               clean_text lang  
15257   W I T N E S S E T H WHEREAS the Issuers have h...   en  
58218   Title. Such Stockholder is the sole record or ...   en  
27184   Employee Benefit Plans. No ERISA Event has occ...   en  
111333  Absence of Certain Changes. Since December 31,...   en  
67080   Base Salary. The Company shall pay or ca

In [None]:
# Export only the two columns the model needs: cleaned text and its label.
df.loc[:, ['clean_text', 'clause_type']].to_csv(
    '/content/clean_legal_clauses.csv',
    index=False,
)
print("‚úÖ Cleaned dataset saved to /content/clean_legal_clauses.csv")


‚úÖ Cleaned dataset saved to /content/clean_legal_clauses.csv


In [None]:
import pandas as pd

# Reload the cleaned dataset and confirm its shape and columns.
df = pd.read_csv(filepath_or_buffer="/content/clean_legal_clauses.csv")
print(df.shape, df.columns, sep="\n")
df.head()


(150180, 2)
Index(['clean_text', 'clause_type'], dtype='object')


Unnamed: 0,clean_text,clause_type
0,Exceptions. Any other provision herein to the ...,exceptions
1,Exceptions. (a) This Clause 28 shall not apply...,exceptions
2,Exceptions. (a) Clause 14.1 (Increased costs) ...,exceptions
3,Exceptions. (a) An amendment or waiver that ha...,exceptions
4,Exceptions. Recipient will not have an obligat...,exceptions


In [None]:
# Install the modelling dependencies quietly (-q), one package per command.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
# NOTE(review): langdetect and tqdm are imported by an earlier cell but are not
# installed here -- confirm the runtime already provides them.
# NOTE(review): transformers and torch are not imported in any cell visible in
# this part of the notebook -- confirm they are still needed.
!pip install transformers -q
!pip install datasets -q
!pip install torch -q
!pip install scikit-learn -q


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Absolute path: this is where the cleaned dataset is written earlier in the
# notebook, so the read no longer depends on the current working directory.
df = pd.read_csv("/content/clean_legal_clauses.csv")
df = df.dropna(subset=["clean_text", "clause_type"])
X = df["clean_text"].values

# Build the categorical once so the integer codes (y) and the code -> name
# lookup (class_map) are guaranteed to come from the same category ordering.
clause_categories = df["clause_type"].astype("category")
y = clause_categories.cat.codes
class_map = dict(enumerate(clause_categories.cat.categories))

# 90/10 split, stratified on the label codes, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the clause *names*. y_train and y_val already hold integer
# category codes, so fitting on them would only learn an identity mapping and
# the encoder could never translate a prediction back to a clause name.
le = LabelEncoder()
le.fit([class_map[code] for code in sorted(class_map)])

# Encode each split by looking its codes up in class_map first. int() turns
# the NumPy integer codes into the plain-int keys class_map uses.
y_train_enc = le.transform([class_map[int(code)] for code in y_train])
y_val_enc = le.transform([class_map[int(code)] for code in y_val])

In [None]:
from datasets import Dataset
# Wrap each split as a Dataset with a "text" and a "label" column.
train_dataset, val_dataset = (
    Dataset.from_dict({"text": texts, "label": labels})
    for texts, labels in ((X_train, y_train), (X_val, y_val))
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# TF-IDF features (at most 50,000 terms) feeding a logistic-regression classifier.
pipeline = make_pipeline(
    TfidfVectorizer(max_features=50000),
    LogisticRegression(max_iter=1000, n_jobs=-1),
)

# fit() returns the fitted pipeline itself, so training and the validation
# prediction are chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_val)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict on validation set
y_pred = pipeline.predict(X_val)

# Accuracy
acc = accuracy_score(y_val, y_pred)
print(f"Accuracy: {acc:.4f}")

# Classification report (precision, recall, f1-score).
# - labels/target_names: rows show clause names instead of bare integer codes.
# - zero_division=0: a class that is never predicted has undefined precision;
#   this reports it as 0 explicitly instead of emitting a warning per metric.
class_codes = sorted(class_map)
print("Classification Report:")
print(classification_report(
    y_val,
    y_pred,
    labels=class_codes,
    target_names=[class_map[code] for code in class_codes],
    zero_division=0,
))

# Confusion matrix
cm = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(cm)


Accuracy: 0.8388
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        35
           2       0.88      0.80      0.84        35
           3       0.69      0.77      0.73        31
           4       0.72      0.67      0.69        27
           5       0.95      1.00      0.97        37
           6       0.77      0.82      0.80        40
           7       0.98      0.96      0.97        47
           8       0.92      0.92      0.92        36
           9       1.00      0.98      0.99        50
          10       0.80      0.57      0.67        49
          11       0.84      0.68      0.75        47
          12       0.67      0.78      0.72        37
          13       0.60      0.68      0.64        40
          14       0.61      0.53      0.56        38
          15       0.83      0.76      0.80        46
          16       0.77      0.61      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Display the predicted class codes for the validation set as the cell output.
y_pred

array([ 18,  33, 123, ..., 178, 296, 119], dtype=int16)

In [None]:
import joblib

# Save the entire pipeline. The absolute path is the one test_input() loads
# from, so saving and loading agree regardless of the working directory.
joblib.dump(pipeline, "/content/logistic_tfidf_pipeline.pkl")

# Later, you can load it like this:
loaded_pipeline = joblib.load("/content/logistic_tfidf_pipeline.pkl")

# Test prediction: the reloaded pipeline must reproduce the in-memory one.
y_pred_loaded = loaded_pipeline.predict(X_val)
assert (y_pred_loaded == pipeline.predict(X_val)).all(), \
    "Reloaded pipeline predicts differently from the in-memory pipeline"


In [None]:
def test_input(text):
    """Return the numeric class code the saved pipeline predicts for ``text``.

    The pipeline is read from disk on every call.
    """
    saved_pipeline = joblib.load("/content/logistic_tfidf_pipeline.pkl")
    # predict() expects a collection of documents, so the single text is
    # wrapped in a list and the only prediction is unwrapped again.
    predictions = saved_pipeline.predict([text])
    return predictions[0]

# Example test
new_text = "I really enjoy this"
predicted_code = test_input(new_text)
print(f"Prediction for '{new_text}':", predicted_code)

Prediction for 'I really enjoy this': 258


In [None]:
# Persist the label encoder; the list of written file names is the cell output.
joblib.dump(value=le, filename="label_encoder.pkl")


['label_encoder.pkl']

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_pipeline(path="/content/logistic_tfidf_pipeline.pkl"):
    """Load the pickled TF-IDF + LogisticRegression pipeline and cache it.

    Call ``_load_pipeline.cache_clear()`` after overwriting the file on disk
    so that the next call picks up the new model.
    """
    return joblib.load(path)


def test_input(text):
    """Return the clause-type name predicted for a single piece of text.

    Args:
        text: The clause text to classify.

    Returns:
        The clause-type name from ``class_map`` for the predicted class code,
        or ``"Unknown Clause"`` if the code is not in ``class_map``.
    """
    # The pipeline is loaded once and reused instead of being unpickled from
    # disk on every call.
    model = _load_pipeline()
    pred_num = model.predict([text])[0]  # numeric prediction
    # int() turns the NumPy scalar into the plain-int key type class_map uses.
    pred_label = class_map.get(int(pred_num), "Unknown Clause")  # map to readable label
    return pred_label

# Example test
new_text = "The agreement will be terminated if conditions are not met"
print(f"Prediction for '{new_text}':", test_input(new_text))

Prediction for 'The agreement will be terminated if conditions are not met': conditions


In [None]:
# A sentence that is not a contract clause, to see what label it receives.
new_text2 = "I like the product and its features"
predicted_label2 = test_input(new_text2)
print(f"Prediction for '{new_text2}':", predicted_label2)

Prediction for 'I like the product and its features': scope


In [None]:
# Hand-written sample sentences; each comment names the theme the sentence
# was written to represent.
test_texts = [
    "All payments must be completed within 30 days of receiving the invoice.",          # conditions
    "This contract covers software development, testing, and deployment services.",     # scope
    "The supplier shall provide technical support for all delivered products.",         # obligations
    "Either party may terminate this agreement with 60 days written notice.",           # termination
    "All proprietary information must be kept strictly confidential by both parties.",  # confidentiality
    "The company shall not be liable for indirect or consequential damages.",           # liabilities
]

# Classify each sample and print it next to its predicted clause type.
for text in test_texts:
    predicted_clause = test_input(text)
    print(f"Text: {text}\nPredicted clause: {predicted_clause}\n")


Text: All payments must be completed within 30 days of receiving the invoice.
Predicted clause: payments

Text: This contract covers software development, testing, and deployment services.
Predicted clause: services

Text: The supplier shall provide technical support for all delivered products.
Predicted clause: support

Text: Either party may terminate this agreement with 60 days written notice.
Predicted clause: termination-of-agreement

Text: All proprietary information must be kept strictly confidential by both parties.
Predicted clause: proprietary_rights

Text: The company shall not be liable for indirect or consequential damages.
Predicted clause: limitation-of-liability

