Two main types
*   *instruction fine-tuning*
*   *classification fine-tuning*




**Preparing dataset**

In [1]:
import urllib.request
import zipfile
import os
from pathlib import Path
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
def download_and_unzip_spam_data(
 url, zip_path, extracted_path, data_file_path):
 if data_file_path.exists():
  print(f"{data_file_path} already exists. Skipping download "
  "and extraction."
  )
  return

with urllib.request.urlopen(url) as response: #Download File
 with open(zip_path, "wb") as out_file:
  out_file.write(response.read())

 with zipfile.ZipFile(zip_path, "r") as zip_ref: # Unzips the file
  zip_ref.extractall(extracted_path)

 original_file_path = Path(extracted_path) / "SMSSpamCollection"
 os.rename(original_file_path, data_file_path)  #Adds a .tsv extension
 print(f"File downloaded and saved as {data_file_path}")

download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv
sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


Loading dataset into pandas dataframe

In [2]:
import pandas as pd
df = pd.read_csv(
 data_file_path, sep="\t", header=None, names=["Label", "Text"]
)

print(df)

     Label                                               Text
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


**Class Label distribution**

In [3]:
print(df["Label"].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


Creating a balanced dataset

In [4]:
def create_balanced_dataset(df):
 num_spam = df[df["Label"] == "spam"].shape[0] # Counts spam instances
 ham_subset = df[df["Label"] == "ham"].sample(
 num_spam, random_state=123
 )

 balanced_df = pd.concat([
 ham_subset, df[df["Label"] == "spam"]
 ])
 return balanced_df

balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


Converting string class labels into integer class labels (0 & 1)



In [5]:
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

Splitting dataset into Training, validation and testing

In [6]:
def random_split(df, train_frac, validation_frac):

 df = df.sample(
 frac=1, random_state=123
 ).reset_index(drop=True) #Shuffles the entire dataframe
 train_end = int(len(df) * train_frac) #Calculate split indices
 validation_end = train_end + int(len(df) * validation_frac)

 #Split dataframe
 train_df = df[:train_end]
 validation_df = df[train_end:validation_end]
 test_df = df[validation_end:]
 return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(
 balanced_df, 0.7, 0.1)

Save dataset as csv file

In [7]:
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)