In [1]:
!pip install pandas numpy scikit-learn dvc[gs] mlflow

Collecting dvc[gs]
  Downloading dvc-3.59.1-py3-none-any.whl (457 kB)
     ------------------------------------ 457.7/457.7 kB 251.2 kB/s eta 0:00:00
Collecting mlflow
  Downloading mlflow-2.20.3-py3-none-any.whl (28.4 MB)
     -------------------------------------- 28.4/28.4 MB 433.5 kB/s eta 0:00:00
Collecting gto<2,>=1.6.0
  Downloading gto-1.7.2-py3-none-any.whl (45 kB)
     -------------------------------------- 45.2/45.2 kB 249.9 kB/s eta 0:00:00
Collecting zc.lockfile>=1.2.1
  Downloading zc.lockfile-3.0.post1-py3-none-any.whl (9.8 kB)
Collecting celery
  Downloading celery-5.4.0-py3-none-any.whl (425 kB)
     ------------------------------------ 426.0/426.0 kB 542.7 kB/s eta 0:00:00
Collecting dpath<3,>=2.1.0
  Downloading dpath-2.2.0-py3-none-any.whl (17 kB)
Collecting configobj>=5.0.9
  Downloading configobj-5.0.9-py2.py3-none-any.whl (35 kB)
Collecting dvc-objects
  Downloading dvc_objects-5.1.0-py3-none-any.whl (33 kB)
Collecting funcy>=1.14
  Downloading funcy-2.0-py2.py3-

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-cloud-auth 0.1.4 requires pydantic<2.0, but you have pydantic 2.7.1 which is incompatible.
anaconda-cloud-auth 0.1.4 requires semver<3, but you have semver 3.0.2 which is incompatible.


In [4]:
# Initialise a DVC repository in the current working directory.
# NOTE(review): --no-scm sets DVC up without Git integration, yet later cells run
# git add / git commit / git checkout — confirm which mode is actually intended.
!dvc init --no-scm

Initialized DVC repository.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


In [5]:
# Register the Google Drive folder as a DVC remote named "myremote" and make it the
# default one (-d), then switch on that remote's gdrive_use_service_account option.
!dvc remote add -d myremote gdrive://1xxxd-5UG6OEYvQZriN7-qTKCUo2s-AXI
!dvc remote modify myremote gdrive_use_service_account true

Setting 'myremote' as a default remote.


In [1]:
import csv
import os

import dvc.api
import mlflow
import mlflow.sklearn
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split

In [10]:
# Ensure the NLTK resources used by the text preprocessing are available locally:
# tokenizer models for word_tokenize, the English stop-word list, and WordNet for
# the lemmatizer. Each call is a no-op when the package is already up to date.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' rather than 'punkt',
# so both are fetched to keep word_tokenize working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Nirjhar
[nltk_data]     Nath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Nirjhar
[nltk_data]     Nath\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\Nirjhar
[nltk_data]     Nath\AppData\Roaming\nltk_data...


True

In [2]:
# Working directory for the CSV files this notebook reads and writes.
# It is created on first run; an already existing directory is left untouched.
data_dir = "./data"
os.makedirs(name=data_dir, exist_ok=True)

In [3]:
# Load data
def load_data(filepath):
    """Read a headerless, tab-separated file into a two-column DataFrame.

    Args:
        filepath: Location of the tab-separated file to read.

    Returns:
        pandas.DataFrame: One row per line of the file, with the columns
        'label' and 'message'. Quote characters are kept as literal text
        because quoting is disabled.
    """
    column_names = ['label', 'message']
    frame = pd.read_csv(
        filepath,
        sep='\t',
        quoting=csv.QUOTE_NONE,
        names=column_names,
    )
    return frame

In [12]:
# Stop-word set and lemmatizer shared by every preprocess_text call; filled on first use.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the (stop_words, lemmatizer) pair, building it only on the first call."""
    if not _TEXT_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Store both together so a failure above never leaves a half-filled cache.
        _TEXT_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalise a message: tokenize, lowercase, drop English stop words, lemmatize.

    Args:
        text: The message to clean. Anything that is not a ``str`` is treated
            as invalid input.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces, or an
        empty string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""  # Return empty string for invalid input

    # Reading the stop-word list on every call is wasteful when this function is
    # applied to each row of a DataFrame, so the resources are built once and reused.
    stop_words, lemmatizer = _get_text_resources()

    tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered not in stop_words:
            tokens.append(lemmatizer.lemmatize(lowered))
    return " ".join(tokens)

In [5]:
# Encode labels
def encode_label(text):
    """Map a message label to its binary class: 1 for spam, 0 for everything else.

    Args:
        text: The label to encode. Normally the string 'spam' or 'ham'; a value
            that is already numerically encoded (0 or 1) is also accepted.

    Returns:
        int: 1 when ``text`` is the string 'spam' or already equals 1, else 0.
    """
    if isinstance(text, str):
        return 1 if text == 'spam' else 0
    # Already-encoded input: keep an existing 1 instead of collapsing it to 0, so
    # encoding a column that was encoded before (e.g. after reloading a saved,
    # encoded CSV) does not silently erase the positive class.
    return 1 if text == 1 else 0

In [6]:
# Split data
def split_data(df, seed, train_size=0.7, val_size=0.15, test_size=0.15):
    """Randomly split ``df`` into train, validation and test partitions.

    The data is first divided into a training part and a hold-out part; the
    hold-out part is then divided into validation and test. Both steps use the
    same ``seed`` as ``random_state``.

    Args:
        df: The dataset to split (anything ``train_test_split`` accepts).
        seed: Random state passed to both ``train_test_split`` calls.
        train_size: Fraction of ``df`` that goes to the training partition.
        val_size: Fraction of ``df`` that goes to the validation partition.
        test_size: Fraction of ``df`` that goes to the test partition.

    Returns:
        tuple: ``(train_df, val_df, test_df)``.

    Raises:
        ValueError: If any fraction is not positive, or if the three fractions
            do not add up to 1.
    """
    fractions = (train_size, val_size, test_size)
    if any(fraction <= 0 for fraction in fractions):
        raise ValueError(
            f"train_size, val_size and test_size must all be positive, got {fractions}"
        )
    # The validation share is whatever remains after the train and test shares, so
    # val_size can only be honoured when the three fractions are consistent. Reject
    # inconsistent requests instead of silently producing a different split.
    if abs(sum(fractions) - 1.0) > 1e-9:
        raise ValueError(
            f"train_size + val_size + test_size must equal 1, got {sum(fractions)}"
        )

    # The arithmetic below is deliberately kept as-is so that splits produced
    # earlier with the same seed are reproduced exactly.
    train_df, val_test_df = train_test_split(df, test_size=1-train_size, random_state=seed)
    val_df, test_df = train_test_split(val_test_df, test_size=test_size/(1-train_size), random_state=seed)
    return train_df, val_df, test_df

In [7]:
# Store CSV
def store_as_csv(df, name):
    """Write ``df`` to the CSV file ``name`` without the index column.

    Args:
        df: The DataFrame to save.
        name: Destination path of the CSV file.
    """
    df.to_csv(path_or_buf=name, index=False)

In [8]:
# Print label distribution
def print_variable_distribution(df, name):
    """Print how many rows of ``df`` carry label 0 and how many carry label 1.

    Args:
        df: DataFrame with a numeric ``label`` column.
        name: Text printed in front of the counts to identify the dataset.
    """
    positives = df.label.sum()
    negatives = len(df) - positives
    print(f"{name} - 0s: {negatives}, 1s: {positives}")

In [9]:
# Initialize DVC
# Runs `dvc init --subdir -f` through the system shell. The value shown as the cell
# output is what os.system returned for that command.
# NOTE(review): the recorded output is 1 (non-zero), which suggests the command
# failed — confirm, and check whether this cell is still needed given that an
# earlier cell already ran `dvc init --no-scm`.
os.system("dvc init --subdir -f")

1

In [10]:
# Track with DVC and Google Drive
# Three shell commands are issued in turn: add the Drive folder as the default remote
# "myremote", enable that remote's gdrive_acknowledge_abuse option, and force-stage
# .dvc/config in Git. Only the last call's return value is displayed as the cell output.
# NOTE(review): an earlier cell already adds "myremote" with the same URL, and the
# recorded output here is 1 (non-zero) — confirm which of these commands succeed.
os.system("dvc remote add --default myremote gdrive://1xxxd-5UG6OEYvQZriN7-qTKCUo2s-AXI")
os.system("dvc remote modify myremote gdrive_acknowledge_abuse true")
os.system("git add .dvc/config -f")

1

In [27]:
# Read the comma-separated raw dataset from the data directory and preview the first rows.
messages = pd.read_csv("data/raw_data.csv", sep=",")
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
# Replace each entry of the 'label' column with the result of encode_label.
# The column is overwritten in place, so the previous label values are no longer
# available in `messages` once this cell has run.
messages['label'] = messages['label'].apply(encode_label)

In [36]:
# Preview the first rows of `messages` in its current state.
messages.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
# Save the current `messages` frame to data/raw_data.csv.
# NOTE(review): this is the same path `messages` was read from, so the file on disk
# is overwritten with the frame's current contents — confirm that is intended.
store_as_csv(messages, "./data/raw_data.csv")

In [31]:
# Add to DVC
# `dvc add` moves the file's contents under DVC control and writes a small pointer
# file (data/raw_data.csv.dvc) next to it.
os.system("dvc add data/raw_data.csv")
# The pointer file has to be staged, otherwise the commit below has nothing to record
# and this version of the data cannot be checked out again later.
os.system("git add data/raw_data.csv.dvc")
# Double quotes keep the commit message a single argument on both Windows (cmd.exe)
# and POSIX shells; single quotes are not treated as quoting by cmd.exe.
os.system('git commit -m "Added raw_data.csv"')

1

In [33]:
# Train-validation-test split (SEED1)
# Split the messages with the first seed and write each partition to the data directory.
SEED1 = 2032
train_df, val_df, test_df = split_data(messages, seed=SEED1)
store_as_csv(train_df, "./data/train.csv")
store_as_csv(val_df, "./data/validation.csv")
store_as_csv(test_df, "./data/test.csv")

# Put the three files under DVC control, then stage the .dvc pointer files that
# `dvc add` writes, so the commit below actually records this version of the split.
os.system("dvc add data/train.csv data/validation.csv data/test.csv")
os.system("git add data/train.csv.dvc data/validation.csv.dvc data/test.csv.dvc")
# Double quotes keep the commit message a single argument on both Windows (cmd.exe)
# and POSIX shells; single quotes are not treated as quoting by cmd.exe.
os.system('git commit -m "Added train, validation, test data for SEED1"')

1

In [34]:
# Train-validation-test split (SEED2)
# Split the messages again with a second seed; the three CSV files written for the
# previous split are overwritten with the new partitions.
SEED2 = 2001
train_df, val_df, test_df = split_data(messages, seed=SEED2)
store_as_csv(train_df, "./data/train.csv")
store_as_csv(val_df, "./data/validation.csv")
store_as_csv(test_df, "./data/test.csv")

# Re-run `dvc add` so the pointer files describe the new contents, then stage them,
# so the commit below actually records this second version of the split.
os.system("dvc add data/train.csv data/validation.csv data/test.csv")
os.system("git add data/train.csv.dvc data/validation.csv.dvc data/test.csv.dvc")
# Double quotes keep the commit message a single argument on both Windows (cmd.exe)
# and POSIX shells; single quotes are not treated as quoting by cmd.exe.
os.system('git commit -m "Added train, validation, test data for SEED2"')

1

In [35]:
# Checkout first version and print distributions
# Restore the three .dvc pointer files as they were one commit before HEAD, then run
# `dvc pull` so the data files are brought in line with those pointers.
# NOTE(review): the recorded return value is 251 (non-zero), which suggests `dvc pull`
# did not succeed — confirm before relying on the files in data/.
# NOTE(review): nothing is printed in this cell; presumably the distributions are
# printed by a later cell — verify.
os.system("git checkout HEAD~1 data/train.csv.dvc data/validation.csv.dvc data/test.csv.dvc")
os.system("dvc pull")

251