# Assignment 2

In [11]:
# Import necessary libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Download NLTK resources (only needed once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saisa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saisa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
def load_raw_data(file_path):
    """Load raw SMS data from file"""
    print(f"Loading raw data from {file_path}")
    df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'])
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
    print(f"Loaded {len(df)} records")
    return df

def save_raw_data(df, path='raw_data.csv'):
    """Save raw data to CSV file and track with DVC"""
    print(f"Saving raw data to {path}")
    df.to_csv(path, index=False)
    print(f"Raw data saved {path}")

# Load and save raw data
filepath = 'sms_spam_collection/SMSSpamCollection'
sms_data = load_raw_data(filepath)
save_raw_data(sms_data)

# Track the raw data file with DVC
datapath = 'raw_data.csv'
!dvc init --subdir
!dvc add {datapath}

Loading raw data from sms_spam_collection/SMSSpamCollection
Loaded 5572 records
Saving raw data to raw_data.csv
Raw data saved raw_data.csv
Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>

To track the changes with git, run:

	git add raw_data.csv.dvc .gitignore

To enable auto staging, 

⠋ Checking graph



In [14]:
df = pd.read_csv('raw_data.csv')
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
def process_and_split_data(df, random_state=0):
    """Process and split data into train/validation/test sets"""
    print(f"Splitting data with random_state={random_state}")
    
    # Create directories if they don't exist
    os.makedirs('datasets_processed', exist_ok=True)
    
    # Split the data into train (70%), validation (15%), and test (15%) sets
    train_data, temp_data = train_test_split(
        df, 
        train_size=0.7, 
        random_state=random_state
    )
    
    val_data, test_data = train_test_split(
        temp_data, 
        test_size=0.5, 
        random_state=random_state
    )
    
    # Define paths for split files
    train_path = 'datasets_processed/train.csv'
    val_path = 'datasets_processed/validation.csv'
    test_path = 'datasets_processed/test.csv'
    
    # Save splits to CSV files
    train_data.to_csv(train_path, index=False)
    val_data.to_csv(val_path, index=False)
    test_data.to_csv(test_path, index=False)
    
    print(f"Train data shape: {train_data.shape}")
    print(f"Validation data shape: {val_data.shape}")
    print(f"Test data shape: {test_data.shape}")
    
    return train_path, val_path, test_path

train_path, val_path, test_path = process_and_split_data(df)

Splitting data with random_state=0
Train data shape: (3900, 2)
Validation data shape: (836, 2)
Test data shape: (836, 2)


In [21]:
# Track split files with DVC
!dvc add {train_path} {val_path} {test_path}


To track the changes with git, run:

	git add 'datasets_processed\test.csv.dvc' 'datasets_processed\.gitignore' 'datasets_processed\validation.csv.dvc' 'datasets_processed\train.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



In [22]:
def print_target_distribution(file_path):
    """Print distribution of target variable in a dataset"""
    df = pd.read_csv(file_path)
    counts = df['label'].value_counts()
    
    print(f"Dataset: {file_path}")
    print(f"  Ham (0): {counts.get(0, 0)}")
    print(f"  Spam (1): {counts.get(1, 0)}")
    print(f"  Total: {len(df)}")

print("\nDistribution of target variable in first version:")
print_target_distribution(train_path)
print_target_distribution(val_path)
print_target_distribution(test_path)


Distribution of target variable in first version:
Dataset: datasets_processed/train.csv
  Ham (0): 3374
  Spam (1): 526
  Total: 3900
Dataset: datasets_processed/validation.csv
  Ham (0): 725
  Spam (1): 111
  Total: 836
Dataset: datasets_processed/test.csv
  Ham (0): 726
  Spam (1): 110
  Total: 836


In [25]:
!git add .



In [26]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	new file:   .dvc/.gitignore
	new file:   .dvc/config
	new file:   .dvcignore
	new file:   .gitignore
	new file:   datasets_processed/.gitignore
	new file:   datasets_processed/test.csv.dvc
	new file:   datasets_processed/train.csv.dvc
	new file:   datasets_processed/validation.csv.dvc
	new file:   prepare.ipynb
	new file:   raw_data.csv.dvc
	new file:   sms_spam_collection/SMSSpamCollection
	new file:   sms_spam_collection/readme

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	deleted:    ../../.dvc/.gitignore
	deleted:    ../../.dvc/config
	deleted:    ../../.dvcignore
	deleted:    ../.gitignore
	modified:   ../assignment_1/prepare.ipynb
	modified:   prepare.ipynb
	deleted:    ../datasets_processed/.gitignore



In [27]:
!git commit -m "Added dataset with DVC"

[main 34160df] Added dataset with DVC
 12 files changed, 6257 insertions(+)
 create mode 100644 assignments/assignment_2/.dvc/.gitignore
 create mode 100644 assignments/assignment_2/.dvc/config
 create mode 100644 assignments/assignment_2/.dvcignore
 create mode 100644 assignments/assignment_2/.gitignore
 create mode 100644 assignments/assignment_2/datasets_processed/.gitignore
 create mode 100644 assignments/assignment_2/datasets_processed/test.csv.dvc
 create mode 100644 assignments/assignment_2/datasets_processed/train.csv.dvc
 create mode 100644 assignments/assignment_2/datasets_processed/validation.csv.dvc
 create mode 100644 assignments/assignment_2/prepare.ipynb
 create mode 100644 assignments/assignment_2/raw_data.csv.dvc
 create mode 100644 assignments/assignment_2/sms_spam_collection/SMSSpamCollection
 create mode 100644 assignments/assignment_2/sms_spam_collection/readme
