In [11]:
import os
import json
import pandas as pd
import ast

class PREPARE_DATASET():
    """Reshape a seed-task JSONL file into instruction/context/response/category records.

    Two independent paths are offered:

    * ``load_data`` / ``process_data`` keep the records as a list of dicts.
    * ``load_data_in_df`` / ``process_data_in_df`` / ``write_data_in_json``
      work on a pandas DataFrame and write the result to ``self.out_file``.

    Both paths store their result in ``self.data``, so its type depends on
    which loader ran last.
    """

    def __init__(
        self,
        data_file_path: str
    ) -> None:
        """Remember the input path and derive the output path.

        Args:
            data_file_path: Path of the JSON Lines file to read.
        """
        self.data_file_path = data_file_path
        # Output always goes to the current working directory.
        self.out_file = os.path.join(os.getcwd(), 'processed_seed_tasks.jsonl')

        self.data = []
        # Not populated by any method of this class.
        self.processed_data = []

    def load_data(
        self
    ) -> None:
        """Read the JSONL file into ``self.data`` as a list of parsed records.

        Blank lines (for example a trailing empty line) are skipped instead of
        being passed to ``json.loads``. Prints the number of records loaded.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        # `with` guarantees the handle is closed even when a line fails to parse.
        with open(self.data_file_path, encoding='utf-8') as f:
            self.data = [json.loads(line) for line in f if line.strip()]
        print(f'Total samples: {len(self.data)}')

    def process_data(
        self,
        limit: int = 2
    ) -> None:
        """Print the first ``limit`` items of ``self.data`` for inspection.

        Nothing is stored; ``self.processed_data`` is left untouched.

        Args:
            limit: Number of leading items to print (default 2).
        """
        # NOTE(review): intended for the list produced by load_data(); if
        # self.data is a DataFrame, iterating the slice yields column labels.
        for line in self.data[:limit]:
            print(line)

    def load_data_in_df(
        self
    ) -> None:
        """Read the JSONL file into ``self.data`` as a DataFrame and print its shape."""
        self.data = pd.read_json(self.data_file_path, lines=True)
        print(f'Total samples: {self.data.shape}')

    @staticmethod
    def _first_instance(instances):
        """Return the first element of one row's ``instances`` cell."""
        # Cells that are still a string repr of a list are parsed first;
        # already-parsed lists are used directly so no str()/eval round trip
        # is needed.
        if isinstance(instances, str):
            instances = ast.literal_eval(instances)
        return instances[0]

    def process_data_in_df(
        self
    ) -> None:
        """Replace ``self.data`` with the four-column training frame.

        Reads the ``instruction``, ``instances`` and ``is_classification``
        columns of the current frame and builds a new frame with columns
        ``instruction``, ``context``, ``response``, ``category`` (in that
        order, same index):

        * ``context`` / ``response`` are the ``'input'`` / ``'output'`` values
          of the first entry of ``instances``; later entries are ignored.
        * ``category`` is ``'classification'`` where ``is_classification``
          equals ``True`` and ``'other'`` everywhere else.

        Raises:
            KeyError: If a required column or instance key is missing.
            IndexError: If a row's ``instances`` is empty.
        """
        source = self.data
        # Extract the first instance once per row and reuse it for both columns.
        first_instances = source['instances'].map(self._first_instance)
        category = source['is_classification'].eq(True).map(
            {True: 'classification', False: 'other'}
        )

        # Build a fresh frame rather than adding columns to a column-sliced
        # selection of the loaded one.
        self.data = pd.DataFrame({
            'instruction': source['instruction'],
            'context': first_instances.map(lambda instance: instance['input']),
            'response': first_instances.map(lambda instance: instance['output']),
            'category': category,
        })

    def write_data_in_json(
        self
    ) -> None:
        """Write ``self.data`` (a DataFrame) to ``self.out_file``, one JSON record per line."""
        self.data.to_json(self.out_file, orient='records', lines=True)

In [12]:
# Run the DataFrame pipeline end to end: load the seed tasks, reshape them,
# and write the result to processed_seed_tasks.jsonl in the working directory.
seed_tasks_path = './seed_tasks.jsonl'
prepareDatasetObj = PREPARE_DATASET(data_file_path=seed_tasks_path)
prepareDatasetObj.load_data_in_df()
prepareDatasetObj.process_data_in_df()
prepareDatasetObj.write_data_in_json()

Total samples: (175, 5)


In [13]:
# process_data_in_df() replaced prepareDatasetObj.data with the reduced
# four-column frame, so re-read the file to inspect the original columns.
prepareDatasetObj.load_data_in_df()

Total samples: (175, 5)


In [14]:
# Preview the first rows of the freshly reloaded seed-task frame.
prepareDatasetObj.data.head()

Unnamed: 0,id,name,instruction,instances,is_classification
0,seed_task_0,breakfast_suggestion,Is there anything I can eat for a breakfast th...,"[{'input': '', 'output': 'Yes, you can have 1 ...",False
1,seed_task_1,antonym_relation,What is the relation between the given pairs?,"[{'input': 'Night : Day :: Right : Left', 'out...",False
2,seed_task_2,one_sentence_description,Generate a one-sentence description for each o...,[{'input': '- Brack Obama - Elon Musk - Taylor...,False
3,seed_task_3,harmful_stereotype_example,Describe a situation in which the given stereo...,"[{'input': '""All Asians are smart!""', 'output'...",False
4,seed_task_4,email_subject_generation,Generate an appropriate subjective title for t...,"[{'input': 'Hi [person name], I'm writing to ...",False


In [16]:
# Show only the seed tasks whose is_classification flag is False.
seed_tasks = prepareDatasetObj.data
seed_tasks[seed_tasks.is_classification.eq(False)]

Unnamed: 0,id,name,instruction,instances,is_classification
0,seed_task_0,breakfast_suggestion,Is there anything I can eat for a breakfast th...,"[{'input': '', 'output': 'Yes, you can have 1 ...",False
1,seed_task_1,antonym_relation,What is the relation between the given pairs?,"[{'input': 'Night : Day :: Right : Left', 'out...",False
2,seed_task_2,one_sentence_description,Generate a one-sentence description for each o...,[{'input': '- Brack Obama - Elon Musk - Taylor...,False
3,seed_task_3,harmful_stereotype_example,Describe a situation in which the given stereo...,"[{'input': '""All Asians are smart!""', 'output'...",False
4,seed_task_4,email_subject_generation,Generate an appropriate subjective title for t...,"[{'input': 'Hi [person name], I'm writing to ...",False
...,...,...,...,...,...
144,seed_task_144,solve_equation_system,Solve the following equation system. Give me t...,"[{'input': '3x - 4y = 1, 2x + 3y = 12', 'outpu...",False
145,seed_task_145,plan_syllabus,Plan a syllabus for the the class.,"[{'input': 'Class: NLP for PhD students.', 'ou...",False
146,seed_task_146,university_ranking,Rank the following universities.,"[{'input': 'Princeton, Stanford, UW, Cornell',...",False
147,seed_task_147,rank_countries_by_population,Rank these countries by their population.,"[{'input': 'Brazil, China, US, Japan, Canada, ...",False


In [19]:
# Load the Dolly 15k reference dataset. The path is resolved against the
# current user's home directory so the cell is not tied to one machine's
# absolute /Users/<name>/ location.
df = pd.read_json(os.path.expanduser('~/Downloads/databricks-dolly-15k.jsonl'), lines=True)

In [20]:
# Preview the first rows of the Dolly frame to compare its columns with the
# instruction/context/response/category layout written by PREPARE_DATASET.
df.head()

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification
2,Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa


In [21]:
# List the distinct category labels present in the Dolly frame.
df['category'].unique()

array(['closed_qa', 'classification', 'open_qa', 'information_extraction',
       'brainstorming', 'general_qa', 'summarization', 'creative_writing'],
      dtype=object)

In [28]:
# Count the non-null entries of every other column within each category.
df.groupby(by=['category']).count()

Unnamed: 0_level_0,instruction,context,response
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
brainstorming,1766,1766,1766
classification,2136,2136,2136
closed_qa,1773,1773,1773
creative_writing,709,709,709
general_qa,2191,2191,2191
information_extraction,1506,1506,1506
open_qa,3742,3742,3742
summarization,1188,1188,1188


In [47]:
# Preview the first Dolly rows labelled as classification tasks.
is_classification_row = df['category'] == 'classification'
df[is_classification_row].head()

Unnamed: 0,instruction,context,response,category
1,Which is a species of fish? Tope or Rope,,Tope,classification
19,Identify which instrument is string or percuss...,,"Gudok is string, Cantaro is percussion.",classification
