In [None]:
import zipfile
import os

# Where the uploaded archive lives and where its contents should land.
zip_file_path = '/content/archive (3).zip'
destination_folder = '/content/unzipped_files'

# exist_ok=True keeps a re-run from failing when the folder is already present.
os.makedirs(destination_folder, exist_ok=True)

# Extract every member of the archive into the destination folder.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=destination_folder)

message = f"File '{zip_file_path}' unzipped to '{destination_folder}'"
print(message)

File '/content/archive (3).zip' unzipped to '/content/unzipped_files'


In [None]:
import pandas as pd

# NOTE(review): the archive is extracted to '/content/unzipped_files', but the CSV
# is read from the current working directory — confirm the file really lives here.
# Rows missing either the clause text or its risk label are dropped.
df = (
    pd.read_csv("legal_contract_clauses.csv")
    .dropna(subset=["clause_text", "risk_level"])
)


In [None]:
# Show the cleaned frame; the rendering is truncated to the leading and trailing rows.
df

Unnamed: 0,clause_text,clause_type,risk_level
0,Electric City of Illinois L.L.C.,Parties,low
1,The term of this Agreement shall be ten (10)...,Effective Date,low
2,Unless earlier terminated otherwise prov...,Effective Date,high
3,If Distributor comp...,Renewal Term,low
4,This Agreement is to be construed according to...,Governing Law,low
...,...,...,...
9442,Said books and records shall be maintained for...,Post-Termination Services,high
9443,Company shall make said books available to Nor...,Audit Rights,medium
9444,"Company agrees, at its own expense, to obtain ...",Insurance,high
9445,Such insurance policy shall be maintained wit...,Insurance,medium


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the clauses for evaluation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['clause_text'],
    df['risk_level'],
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, capped at 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Learn the vocabulary from the training split only, then apply the same
# fitted vectorizer to the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features; the solver may run up to 1000 iterations.
clf = LogisticRegression(max_iter=1000)
# Left as the cell's final expression so the fitted estimator is displayed.
clf.fit(X=X_train_vec, y=y_train)


In [None]:
# Predict a risk_level label for every vectorized clause in the test split.
y_pred = clf.predict(X_test_vec)


In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the held-out labels.
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

        high       0.90      0.89      0.89       861
         low       0.93      0.80      0.86       375
      medium       0.83      0.91      0.87       654

    accuracy                           0.88      1890
   macro avg       0.89      0.87      0.87      1890
weighted avg       0.88      0.88      0.88      1890



In [None]:
import pickle

# Persist the trained classifier so it can be reloaded without retraining.
filename = 'logistic_reg_risk.pkl'

# Write through a context manager so the file handle is flushed and closed as
# soon as the dump finishes, instead of being left open for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

print(f"Model saved to {filename}")

Model saved to logistic_reg_risk.pkl


In [None]:
!pip install shap



In [None]:
import shap
import joblib

# The first 100 rows of the training matrix are handed to the explainer
# alongside the fitted classifier.
background_rows = X_train_vec[:100]
explainer = shap.Explainer(clf, background_rows)

# Serialize the explainer next to the other saved artifacts.
joblib.dump(explainer, "shap_explainer.pkl")
print("✅ SHAP explainer saved.")

✅ SHAP explainer saved.


In [None]:
# Persist the fitted TF-IDF vectorizer; `joblib` comes from the import in the
# SHAP cell above. The returned list of written filenames is the cell's output.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")



['tfidf_vectorizer.pkl']