In [1]:
from helpers.models import Models
from helpers.llm_client import LLMClient
from helpers.functions import *
from helpers.prompts import *
import pandas as pd
import re
import os

# pd.set_option('display.max_rows', None)    # Show all rows
# pd.set_option('display.max_colwidth', None)  # Show full column width

In [2]:
# If the keys are not present, add these lines to ~/.bashrc
# sudo gedit ~/.bashrc
# ! Add your own API keys here, currently all 3 give a level of free access with GRoQ being the most free
# export GROQ_API_KEY = ""
# export NVIDIA_API_KEY = ""
# export TOGETHER_API_KEY = ""

In [3]:
# Check to make sure that all API keys are present
os.environ['GROQ_API_KEY'] 
os.environ['NVIDIA_API_KEY']
os.environ['TOGETHER_API_KEY']    
'OK'
#

'OK'

### Semantic Search

In [4]:
df = pd.read_csv('extras/nomos_test_dataset.csv', index_col = 0)
df.head(5)

Unnamed: 0,file_path,license
0,extras/NomosTestfiles/AAL/AAL.txt,AAL
1,extras/NomosTestfiles/AAL/LICENSE,AAL
2,extras/NomosTestfiles/Abstyles/Abstyles.txt,Abstyles
3,extras/NomosTestfiles/ACAA/c32001a.ada,"Govt-rights,UnclassifiedLicense"
4,extras/NomosTestfiles/ACE/ACE-copying.html,ACE


In [None]:
from helpers.functions import scan, extract_comments
df2 = df.loc[0:10]
df2 = extract_comments(df2)
df2.head(5)

In [6]:
scan(df2.loc[0, 'file_comments'], pd.read_csv('extras/processedLicenses.csv'))

[{'shortname': 'AAL',
  'sim_score': 96.0,
  'sim_type': 'SemanticSearch-LVD',
  'description': ''}]

## LLMs

In [5]:
client = LLMClient()

### LLM for license matching

Single Inference

In [9]:
response = client._infer(model = Models.GEMMA_2_9b, prompt = prompt_for_one_stage_license_matching(df2.loc[0, 'file_comments']), temperature = 0)
print(response)

*   **Licenses = [Attribution Assurance License]**
*   **SPDX-IDs = []** 

While the text describes the "Attribution Assurance License", it doesn't provide a formal SPDX identifier for it. 



Dataset Inference

In [6]:
# Requires that the dataframe contains the comments in the 'text' column
df2 = df2.rename(columns={'file_comments': 'text'})

In [8]:
df2_result = client.process_dataset(
                                    df = df2,
                                    model = Models.GEMMA_2_9b,
                                    prompt_function = prompt_for_one_stage_license_matching,
                                    output_name = 'nomos_test_10_samples_license_matching',
                                    log_every = 2,
                                )

[32m2024-09-09 10:28:29.563[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m330[0m - [1mProcessing index: 0[0m
[32m2024-09-09 10:28:30.459[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m330[0m - [1mProcessing index: 2[0m
[32m2024-09-09 10:28:31.664[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m330[0m - [1mProcessing index: 4[0m
[32m2024-09-09 10:28:32.892[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m330[0m - [1mProcessing index: 6[0m
[32m2024-09-09 10:28:35.349[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m330[0m - [1mProcessing index: 8[0m
[32m2024-09-09 10:28:44.261[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m330[0m - [1mProcessing index: 10[0m


In [14]:
pd.set_option('display.max_colwidth', 100)  # Show full column width
df2_result['response'].loc[0:5]

0    *   **Licenses = [Attribution Assurance License]**\n*   **SPDX-IDs = []** \n\nWhile the text des...
1    *   **Licenses = [ATTRIBUTION ASSURANCE LICENSE]**\n*   **SPDX-IDs = []** \n\n\nWhile the text d...
2    *   **Licenses = [Permission Notice]**\n*   **SPDX-IDs = []** \n\n**Explanation:**\n\nWhile the ...
3    *   **Licenses = ["U.S. Government Unlimited Rights"]**\n*   **SPDX-IDs = ["USGovUnlimitedRights...
4    *   **Licenses = [Permissive License]**\n*   **SPDX-IDs = [MIT License]** \n\n**Evidence:**\n\nT...
5    *   **Licenses = [Adobe Systems Incorporated(r) Source Code License Agreement]**\n*   **SPDX-IDs...
Name: response, dtype: object

### License Text Identification

In [16]:
df3 = df.copy(deep=True).loc[0:10]

df3_result = client.process_dataset_for_license_text_identification(
                                    df = df3,
                                    model = Models.GEMMA_2_9b,
                                    # defaults to this if not specified
                                    prompt_function = prompt_for_license_text_identification, 
                                    output_name = 'nomos_test_10_samples_license_text_identification',
                                    log_every = 2,
                                )

[32m2024-09-09 10:33:36.511[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset_for_license_text_identification[0m:[36m248[0m - [1mProcessing index: 0[0m
[32m2024-09-09 10:33:39.481[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset_for_license_text_identification[0m:[36m248[0m - [1mProcessing index: 2[0m
[32m2024-09-09 10:33:41.283[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset_for_license_text_identification[0m:[36m248[0m - [1mProcessing index: 4[0m
[32m2024-09-09 10:33:43.475[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset_for_license_text_identification[0m:[36m248[0m - [1mProcessing index: 6[0m
[32m2024-09-09 10:33:45.216[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset_for_license_text_identification[0m:[36m248[0m - [1mProcessing index: 8[0m
[32m2024-09-09 10:33:48.391[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mpro

In [18]:
print(df3.loc[0, 'response'])

```
ATTRIBUTION ASSURANCE LICENSE (adapted from the original BSD license)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the conditions below are met.
These conditions require a modest attribution to <AUTHOR> (the
"Author"), who hopes that its promotional value may help justify the
thousands of dollars in otherwise billable time invested in writing
this and other freely available, open-source software.

1. Redistributions of source code, in whole or part and with or without
modification (the "Code"), must prominently display this GPG-signed
text in verifiable form.
2. Redistributions of the Code in binary form must be accompanied by
this GPG-signed text in any documentation and, each time the resulting
executable program or a program dependent thereon is launched, a
prominent display (e.g., splash screen or banner text) of the Author's
attribution information, which includes:
(a) Name ("AUTHOR"),
(b) Professional identificat

In [25]:
# Simple parser to extract the license text from between the (```) (```) symbols

def parse_response(response):
    try:
        pattern = r"```.*```"
        matches = re.findall(pattern, response, re.DOTALL)
        return matches[0][3:-3]
    except:
        return '' 

df3 = df3[df3['response'].notna()]

df3['extracted_response'] = df3['response'].apply(lambda x: parse_response(x))

print(df3.loc[0, 'extracted_response'])


ATTRIBUTION ASSURANCE LICENSE (adapted from the original BSD license)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the conditions below are met.
These conditions require a modest attribution to <AUTHOR> (the
"Author"), who hopes that its promotional value may help justify the
thousands of dollars in otherwise billable time invested in writing
this and other freely available, open-source software.

1. Redistributions of source code, in whole or part and with or without
modification (the "Code"), must prominently display this GPG-signed
text in verifiable form.
2. Redistributions of the Code in binary form must be accompanied by
this GPG-signed text in any documentation and, each time the resulting
executable program or a program dependent thereon is launched, a
prominent display (e.g., splash screen or banner text) of the Author's
attribution information, which includes:
(a) Name ("AUTHOR"),
(b) Professional identification

### Obligation Clause Verification

In [2]:
obligation_dataset = pd.read_csv('extras/obligation_clause_verification_dataset.csv', index_col = 0)
obligation_dataset.head(5) 

Unnamed: 0,License Name,License ID,License Text,Obligations,response
0,Universal Permissive License v1.0,UPL-1.0,Copyright (c) [year] [copyright holders]\n\nTh...,USE CASE Source code delivery OR Binary delive...,Here is the evaluation of each clause against ...
1,Common Development and Distribution License 1.0,CDDL-1.0,COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (C...,USE CASE Source code delivery\n\tYOU MUST Prov...,Here is the evaluation of each clause against ...
2,Zope Public License 2.0,ZPL-2.0,Zope Public License (ZPL) Version 2.0\n\nThis ...,USE CASE Source code delivery\n\tYOU MUST Forw...,Here is the evaluation of each clause within t...
3,W3C Software Notice and Document License (2015...,W3C-20150513,This work is being provided by the copyright h...,USE CASE Source code delivery OR Binary delive...,Here is the evaluation of each clause against ...
4,IBM Public License v1.0,IPL-1.0,IBM Public License Version 1.0\n\nTHE ACCOMPAN...,USE CASE Source code delivery\n\tYOU MUST Prov...,Here is the evaluation of each clause against ...


In [21]:
# Already contains responses from a previous run, let's drop them
obligation_dataset.drop(columns=['response'], inplace=True)
obligation_dataset.head(5)

Unnamed: 0,License Name,License ID,License Text,Obligations
0,Universal Permissive License v1.0,UPL-1.0,"Copyright (c) [year] [copyright holders]\n\nThe Universal Permissive License (UPL), Version 1.0\...",USE CASE Source code delivery OR Binary delivery\n\tYOU MUST Provide Copyright notices\n\tEITHER...
1,Common Development and Distribution License 1.0,CDDL-1.0,COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL)\nVersion 1.0\n\n1. Definitions.\n\n1.1. “Cont...,USE CASE Source code delivery\n\tYOU MUST Provide License text\n\tYOU MUST NOT Modify Copyright ...
2,Zope Public License 2.0,ZPL-2.0,Zope Public License (ZPL) Version 2.0\n\nThis software is Copyright (c) Zope Corporation (tm) an...,USE CASE Source code delivery\n\tYOU MUST Forward Copyright notices\n\tYOU MUST Forward License ...
3,W3C Software Notice and Document License (2015-05-13),W3C-20150513,This work is being provided by the copyright holders under the following license.\n\nLicense\nBy...,USE CASE Source code delivery OR Binary delivery\n\tYOU MUST Provide License text\n\t\tATTRIBUTE...
4,IBM Public License v1.0,IPL-1.0,IBM Public License Version 1.0\n\nTHE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS I...,"USE CASE Source code delivery\n\tYOU MUST Provide License text\n\tYOU MUST Credit Verbatim ""Copy..."


In [23]:
obligation_dataset_result = client.process_dataset_for_obligation_clause_verification(
                                    df = obligation_dataset.loc[0:10],
                                    model = Models.GEMMA_2_9b,
                                    # defaults to this if not specified
                                    prompt_function = prompt_for_obligation_clause_verification, 
                                    output_name = 'obligation_dataset_test_sample',
                                    log_every = 1,
                                )

[32m2024-09-09 10:38:01.318[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset_for_obligation_clause_verification[0m:[36m415[0m - [1mProcessing index: 0[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[index, 'response'] = self._infer(model, prompt, temperature)
[32m2024-09-09 10:38:02.245[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset_for_obligation_clause_verification[0m:[36m415[0m - [1mProcessing index: 1[0m
[32m2024-09-09 10:38:09.922[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset_for_obligation_clause_verification[0m:[36m415[0m - [1mProcessing index: 2[0m
[32m2024-09-09 10:38:11.256[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset_for_oblig

In [24]:
print(obligation_dataset_result.loc[0, 'response'])

Clause: YOU MUST Provide Copyright notices
Result: valid
Explanation: The license text states: "The above copyright notice and either this complete permission notice or at a minimum a reference to the UPL must be included in all copies or substantial portions of the Software."

Clause: YOU MUST Provide License text
Result: partially valid
Explanation: The license text requires either the complete permission notice or a reference to the UPL. Providing the full license text is one way to fulfill this requirement.

Clause: YOU MUST Reference License text
Result: partially valid
Explanation: The license text requires either the complete permission notice or a reference to the UPL. Referencing the UPL is another way to fulfill this requirement. 





### License Compatibility through Obligations

In [3]:
# Create a smaller dataset for analysis
obligation_dataset_2 = obligation_dataset.copy(deep=True).loc[0:9]

df_even = obligation_dataset_2.iloc[::2].copy()  
df_odd = obligation_dataset_2.iloc[1::2].copy()  
df_even.reset_index(drop=True, inplace=True)
df_odd.reset_index(drop=True, inplace=True)
df_even = df_even.add_suffix('_a')
df_odd = df_odd.add_suffix('_b')
df_combined = pd.concat([df_even, df_odd], axis=1)
df_combined.fillna('', inplace=True)
df_combined.head(5)

Unnamed: 0,License Name_a,License ID_a,License Text_a,Obligations_a,response_a,License Name_b,License ID_b,License Text_b,Obligations_b,response_b
0,Universal Permissive License v1.0,UPL-1.0,Copyright (c) [year] [copyright holders]\n\nTh...,USE CASE Source code delivery OR Binary delive...,Here is the evaluation of each clause against ...,Common Development and Distribution License 1.0,CDDL-1.0,COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (C...,USE CASE Source code delivery\n\tYOU MUST Prov...,Here is the evaluation of each clause against ...
1,Zope Public License 2.0,ZPL-2.0,Zope Public License (ZPL) Version 2.0\n\nThis ...,USE CASE Source code delivery\n\tYOU MUST Forw...,Here is the evaluation of each clause within t...,W3C Software Notice and Document License (2015...,W3C-20150513,This work is being provided by the copyright h...,USE CASE Source code delivery OR Binary delive...,Here is the evaluation of each clause against ...
2,IBM Public License v1.0,IPL-1.0,IBM Public License Version 1.0\n\nTHE ACCOMPAN...,USE CASE Source code delivery\n\tYOU MUST Prov...,Here is the evaluation of each clause against ...,Open Software License 3.0,OSL-3.0,Open Software License v. 3.0 (OSL-3.0)\n\nThis...,USE CASE Source code delivery\n\tYOU MUST Refe...,Here is the evaluation of each clause within t...
3,GNU General Public License v1.0 or later,GPL-1.0-or-later,"GNU GENERAL PUBLIC LICENSE\nVersion 1, Februar...",USE CASE Source code delivery\n\tYOU MUST Prov...,Here is the evaluation of each clause against ...,X11 License,X11,X11 License\n\nCopyright (C) 1996 X Consortium...,USE CASE Source code delivery OR Binary delive...,Here is the evaluation of each clause against ...
4,Common Public License 1.0,CPL-1.0,Common Public License Version 1.0\n\nTHE ACCOM...,USE CASE Source code delivery\n\tYOU MUST Prov...,Here is the evaluation of each clause against ...,FSF Unlimited License (With License Retention ...,FSFULLRWD,"Copyright (C) 1994, 1995, 1996, 1997, 1998, 19...",USE CASE Source code delivery OR Binary delive...,Here is the analysis of the obligations agains...


In [None]:
response = client._infer(
                model = Models.GEMMA_2_9b,
                prompt = prompt_for_license_compatibility_through_obligations(
                                                                    df_combined.loc[0, 'Obligations_a'],
                                                                    df_combined.loc[0, 'Obligations_b']
                                                                ),
                                                            ),

In [12]:
print(response[0])

## License Obligation Analysis:

Here's a breakdown of the obligations imposed by each license, followed by an overall verdict:

**License Obligations A:**

* **Attribution:** Requires providing copyright notices and either the full license text or a reference to it. No specific format restrictions are mentioned.
* **Copyleft/ShareAlike:**  No copyleft or ShareAlike provisions are present.
* **Modification and Distribution:** Allows modification of the software. No specific conditions or restrictions on distribution are mentioned. Source code disclosure for modifications is not required.
* **Commercial Use:**  No explicit restrictions on commercial use.
* **Patent Grants:**  No patent grants or licenses are mentioned.
* **Liability and Warranty Disclaimers:**  No disclaimers of liability or warranty are mentioned.
* **Additional Considerations:**  The license is very permissive, focusing primarily on attribution.

**License Obligations B:**

* **Attribution:** Requires providing the or