# **Dataset Preparation** 


# Zephyr-7b-beta


## Instruction Prompt Template - Dataset from meta_data folder

In [None]:
import pandas as pd # type: ignore

# Location of the raw CSV that the prompt dataset is built from.
dataset_path = "./data/meta_data/data.csv"

# Chat-style prompt delimited by <|system|> / <|user|> / <|assistant|> markers,
# with </s> closing the system and user turns. {Question} is the only
# placeholder; it is filled per row with str.format when prompts are rendered.
# NOTE: this is a regular (non-raw) string, so each "\n" escape becomes a real
# newline in addition to the literal line breaks typed in the text below.
prompt_template = """<|system|>\n You are a safe, ethical, helpful, knowledgeable AI assistant and customer support expert specialising in cyber security, cloud computing and IT technical Support domains. Your primary job is to deliver detailed responses to customer questions in these domains. Drawing on your extensive expertise, adhere to the following guidelines:\n

1. Provide concise, accurate, and helpful answers to these questions, typically ranging from 450 - 500 words, depending on the complexity of the question.\n

2. Enhance readability by using appropriate formatting such as bullet points, short paragraphs, or numbered lists when applicable.\n

3. Prioritize customer satisfaction while maintaining an empathetic, human and professional tone throughout interactions.\n

4. Provide troubleshooting steps in a clear, organised and logical order when applicable.\n

5. Avoid providing information that could be harmful, biased, misused or leading to security risks or data loss.\n 

</s>\n<|user|>\n Question: {Question}\n

Answer: </s>\n<|assistant|>\n

"""
# Name of the CSV column whose values are copied into the "completion" column
# when the prompt dataset is assembled.
output_column_name = "Answer"

# Load the raw dataset into memory.
df = pd.read_csv(dataset_path)


In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)


## meta_data/data - 80% train and 20% Evaluation

### Utility function for mapping an existing split column (values 0, 1, 2) to the new "train" and "evaluation" labels.
### 0 is mapped to "train"; 1 and 2 are mapped to "evaluation".

In [None]:
import pandas as pd # type: ignore
import numpy as np # type: ignore

def map_split(value):
    """Map a raw split marker to the two-way 'train' / 'evaluation' label.

    Handles both encodings this notebook works with: the numeric codes of an
    existing split column (0 = train, 1 and 2 = evaluation) and labels that
    are already strings. Re-applying the function to its own output is a
    no-op, so it is safe to call on an already-mapped column.

    Args:
        value: Split marker for a single row, e.g. 0, 1, 2, 'train' or
            'evaluation'.

    Returns:
        'train' if ``value`` is 0 or 'train'; 'evaluation' for anything else.
    """
    # Comparing against 0 with == also matches NumPy integers and 0.0, so the
    # check works whether the column was read as int or float.
    if value == 'train' or value == 0:
        return 'train'
    return 'evaluation'

# Works on the `df` already loaded earlier in the notebook.

# Derive the 'split' label: honour a pre-existing split column when there is
# one, otherwise draw a label for every row at random.
has_existing_split = 'existing_split' in df.columns
if not has_existing_split:
    # Each row is drawn independently with P(train)=0.8 and P(evaluation)=0.2,
    # so the realised proportions are only approximately 80/20 and, with no
    # seed set, differ from run to run.
    split_labels = ['train', 'evaluation']
    split_probabilities = [0.8, 0.2]
    df['split'] = np.random.choice(split_labels, size=len(df), p=split_probabilities)
else:
    df['split'] = df['existing_split'].apply(map_split)

# Persist the labelled frame, leaving out the index column.
df.to_csv('./data/Zephyr/data.csv', index=False)
print("Dataset created successfully")


In [None]:
# Tally rows per split, both as absolute counts and as fractions of the total.
split_series = df['split']
counts = split_series.value_counts()
percentages = split_series.value_counts(normalize=True)

# Assemble a single summary table with one row per split label.
result = pd.DataFrame({'Count': counts, 'Percentage': percentages})

# Render each fraction as a percent string with three decimal places.
result['Percentage'] = result['Percentage'].map(
    lambda fraction: format(fraction, ".3%")
)

# Show the summary.
print(result)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for both charts.
sns.set_style("whitegrid")

# Re-load the split-labelled dataset from disk; this rebinds `df`.
df = pd.read_csv('./data/Zephyr/data.csv')

# One row, two panels: pie chart on the left, bar chart on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# First two colours of the colour-blind-friendly palette, one per split label.
color_palette = sns.color_palette("colorblind")[:2]

# Number of rows per split label, most frequent first.
split_counts = df['split'].value_counts()

# 1. Pie chart
wedges, texts, autotexts = ax1.pie(split_counts, labels=split_counts.index, autopct='%1.1f%%',
                                   startangle=90, colors=color_palette)
ax1.set_title('Distribution of Train and Evaluation Sets')

# Prefix each percentage label with its absolute count. Wedges are created in
# the order of `split_counts`, so the two sequences are paired positionally.
# Iterating the values avoids `split_counts[i]`, an integer lookup on a
# string-labelled Series that relies on pandas' deprecated positional fallback.
for autotext, count in zip(autotexts, split_counts):
    autotext.set_text(f'{count} ({autotext.get_text()})')

# 2. Bar plot, coloured by label name so each split keeps its pie-chart colour.
sns.countplot(x='split', data=df, ax=ax2, palette=dict(zip(split_counts.index, color_palette)))
ax2.set_title('Count of Samples in Train and Evaluation Sets')
ax2.set_ylabel('Number of Samples')

# Write the count just above each bar. Bar heights are floats, so format them
# without decimals to show "123" rather than "123.0".
for p in ax2.patches:
    height = p.get_height()
    ax2.text(p.get_x() + p.get_width()/2., height + 0.1,
             f'{height:.0f}',
             ha="center", va="bottom")

# Adjust layout and display the plot
plt.tight_layout()
plt.show()


In [None]:
# Render one prompt per row. Every column of the row is offered to the
# template as a named field; the template itself consumes {Question}.
prompts = [
    prompt_template.format(**record.to_dict())
    for _, record in df.iterrows()
]

# Assemble the output frame: rendered prompt plus the reference answer column.
new_df = pd.DataFrame()
new_df["prompt"] = prompts
new_df["completion"] = df[output_column_name]

# Carry the split label over when the source frame has one, passing it
# through map_split again.
if "split" in df.columns:
    new_df['split'] = df['split'].map(map_split)

print(new_df.head())


## Dataset saved

In [None]:
# Write the prompt/completion dataset to disk without the index column.
# NOTE(review): this is the same path the split-labelled raw data was written
# to earlier in the notebook, so that intermediate file is overwritten here.
final_dataset_path = "./data/Zephyr/data.csv"
new_df.to_csv(final_dataset_path, index=False)
