In [None]:
!pip install transformers datasets
!pip install pandas


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [None]:
import pandas as pd
import csv

# Canonical column names for the dataset, in file order. The header row found
# in the CSV file itself is discarded in favour of these names.
correct_headers = [
    "size",
    "page_id",
    "domain",
    "category",
    "title",
    "description",
    "keywords"
]


def clean_csv(file_path):
    """Read a CSV file record by record and keep only well-formed rows.

    The first record of the file is treated as a header and skipped. Every
    following record with at least ``len(correct_headers)`` fields is kept,
    truncated to exactly that many fields; shorter records (including blank
    lines) are set aside instead of being loaded.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A ``(cleaned_df, problematic_rows)`` tuple. ``cleaned_df`` is a
        DataFrame of string values whose columns are ``correct_headers``.
        ``problematic_rows`` is a list of ``(record_number, row)`` tuples for
        the discarded records, where numbering starts at 2 for the first
        record after the header.
    """
    cleaned_data = []
    problematic_rows = []
    expected_columns = len(correct_headers)

    # newline='' hands raw line endings to the csv module, as its documentation
    # requires; otherwise "\r\n" or "\r" inside quoted fields is silently
    # rewritten to "\n" by universal-newline translation.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        # Skip the file's own header. The default value keeps a completely
        # empty file from raising StopIteration.
        next(reader, None)
        for i, row in enumerate(reader, start=2):  # 2 = first record after the header
            if len(row) >= expected_columns:
                cleaned_data.append(row[:expected_columns])
            else:
                problematic_rows.append((i, row))

    # Convert cleaned data to DataFrame with correct headers
    cleaned_df = pd.DataFrame(cleaned_data, columns=correct_headers)
    return cleaned_df, problematic_rows

# Input and output locations (Colab filesystem paths).
data_table = '/content/train/training_data_en.csv'
cleaned_file_path = '/content/train/cleaned_training_data_en.csv'

# Parse the raw file; rows that are too short come back separately in
# problematic_rows instead of ending up in the DataFrame.
cleaned_df, problematic_rows = clean_csv(data_table)

# Persist the cleaned rows without the pandas index column.
cleaned_df.to_csv(cleaned_file_path, index=False)

# Preview the cleaned dataset (last expression, so the notebook renders it).
cleaned_df.head()


# Display problematic rows count
#print(f"Problematic rows: {len(problematic_rows)} rows discarded")

# Output the path to the cleaned file
#print(f"Cleaned CSV file saved to: {cleaned_file_path}")





In [None]:
# Show every column name followed by a five-value preview of its contents.
for header_name, column_values in cleaned_df.items():
    preview = column_values.head(5).to_list()
    print(f"Header: {header_name}")
    print(preview)
    print("\n")


Header: size
['886', '\\N', '0', '0', '0']


Header: page_id
['634043', '1265895', '6597113', '8429739', '7030283']


Header: domain
['www.tween2teenbooks.com', 'plannedspontaneityhiking.com', 'soccerplayer.net', 'www.utvdirect.com', 'www.debrovys.com']


Header: category
['/Books & Literature', '/Hobbies & Leisure/Outdoors/Hiking & Camping', '/Sports/Team Sports/Soccer', '/Autos & Vehicles/Motor Vehicles (By Type)/Off-Road Vehicles', '/Business & Industrial/Construction & Maintenance/Building Materials & Supplies']


Header: title
['tween 2 teen book reviews', 'planned spontaneity – a blog about hiking around the midwest and beyond', ' - soccerplayer.net', 'utv accessories & parts | find the best side by side accessories & parts at the best prices - utv direct', 'tarps, large canvas tarp, & canvas tarp manufacturers in the united states']


Header: description
['', 'a blog about hiking around the midwest and beyond', '', 'find the right side by side accessories for your utv at utv dir

In [None]:
# Minimum number of characters for a text field to count as "present".
MIN_TEXT_LENGTH = 4

# Filter out rows with short descriptions and save to a new CSV file
filtered_df = cleaned_df[cleaned_df['description'].apply(lambda x: len(str(x)) >= MIN_TEXT_LENGTH)]
filtered_file_path = '/content/train/cleaned_training_data_en_filtered.csv'
filtered_df.to_csv(filtered_file_path, index=False)

# Print the path to the filtered file
print(f"Filtered CSV file saved to: {filtered_file_path}")

# Count rows with missing or short 'description' and 'keywords' in filtered dataset.
# The description count is 0 by construction (the filter above uses the same
# threshold), so it acts as a sanity check; the keywords count is informative.
short_description = filtered_df['description'].apply(lambda x: len(str(x)) < MIN_TEXT_LENGTH).sum()
short_keywords = filtered_df['keywords'].apply(lambda x: len(str(x)) < MIN_TEXT_LENGTH).sum()

# Count rows with valid 'description' and 'keywords' in filtered dataset
valid_description = len(filtered_df) - short_description
valid_keywords = len(filtered_df) - short_keywords

# Print the number of rows with short or missing 'description' and 'keywords'
print(f"Number of rows with short or missing 'description' in filtered dataset: {short_description}")
print(f"Number of rows with short or missing 'keywords' in filtered dataset: {short_keywords}")

# Print the number of rows with valid 'description' and 'keywords'
print(f"Number of rows with valid 'description' in filtered dataset: {valid_description}")
print(f"Number of rows with valid 'keywords' in filtered dataset: {valid_keywords}")

# Display the first few rows of the filtered dataset. This must be the last
# expression of the cell: a bare expression in the middle of a cell is
# evaluated and thrown away without being rendered.
filtered_df.head()


Filtered CSV file saved to: /content/train/cleaned_training_data_en_filtered.csv
Number of rows with short or missing 'description' in filtered dataset: 0
Number of rows with short or missing 'keywords' in filtered dataset: 17557
Number of rows with valid 'description' in filtered dataset: 26765
Number of rows with valid 'keywords' in filtered dataset: 9208


In [None]:
!pip install transformers pandas scikit-learn torch


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Load the cleaned and filtered data
cleaned_file_path = '/content/train/cleaned_training_data_en_filtered.csv'
# Read every column as text and switch off pandas' default NA-token parsing.
# With the defaults, a description that is literally "null", "None", "NULL",
# "#N/A", ... is converted to a float NaN, which the tokenizer cannot accept
# in its list of input strings.
df = pd.read_csv(cleaned_file_path, dtype=str, keep_default_na=False)

# Split the data into training and validation sets (10% validation; the fixed
# random_state makes the split reproducible).
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Load the tokenizer
model_name = "distilbert-base-uncased"  # Use DistilBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the data
def tokenize_data(data):
    """Tokenize the 'description' column of ``data`` with the notebook's tokenizer.

    Args:
        data: DataFrame that contains a 'description' column.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of
        descriptions when called with padding, truncation and PyTorch tensor
        output enabled.
    """
    descriptions = data['description'].tolist()
    tokenizer_options = dict(padding=True, truncation=True, return_tensors='pt')
    return tokenizer(descriptions, **tokenizer_options)

# Tokenize both splits.
train_encodings = tokenize_data(train_df)
val_encodings = tokenize_data(val_df)

# Map every distinct category string to an integer id. The ids follow the
# order in which categories first appear in the full dataset, so both splits
# share one mapping.
labels = df['category'].unique().tolist()
label_to_id = dict(zip(labels, range(len(labels))))
train_labels = [label_to_id[category] for category in train_df['category']]
val_labels = [label_to_id[category] for category in val_df['category']]



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to an indexable collection with
            one entry per sample. Entries may be tensors or plain Python
            sequences.
        labels: Indexable collection holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor that does not alias the source.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning
        on every call, so existing tensors are copied with
        ``clone().detach()`` and only non-tensor values go through
        ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, with the label under 'labels'."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

# Create custom datasets
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# Quantization was not applied --> it brings the runtime down.
# (Translated from the original Turkish note; presumably it crashed the Colab
# runtime -- confirm with the author.)

# Load the pre-trained model with a classification head sized to the number of
# categories. The id <-> category-name mappings are stored in the model config
# so that a checkpoint written with save_pretrained() carries its own label
# names instead of relying on the in-memory `labels` list.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=dict(enumerate(labels)),
    label2id={label: i for i, label in enumerate(labels)},
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()



In [None]:
# Evaluate the model
# Runs the Trainer's evaluation on the eval_dataset it was constructed with
# and prints the returned metrics dict.
results = trainer.evaluate()
print(results)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 2.45242977142334, 'eval_runtime': 42.6567, 'eval_samples_per_second': 62.757, 'eval_steps_per_second': 3.938, 'epoch': 3.0}


In [None]:
# Persist the model and the tokenizer side by side in the local
# 'fine-tuned-model' directory so both can be reloaded from that one path.
model.save_pretrained('fine-tuned-model')
tokenizer.save_pretrained('fine-tuned-model')


('fine-tuned-model/tokenizer_config.json',
 'fine-tuned-model/special_tokens_map.json',
 'fine-tuned-model/vocab.txt',
 'fine-tuned-model/added_tokens.json',
 'fine-tuned-model/tokenizer.json')

In [None]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reload the tokenizer and the fine-tuned weights from the saved directory.
# Note that both names rebind the objects created in the training cells.
tokenizer = DistilBertTokenizer.from_pretrained('fine-tuned-model')
model = DistilBertForSequenceClassification.from_pretrained('fine-tuned-model')

# Place the model on the chosen device and switch it to evaluation mode.
model.to(device)
model.eval()

# Function to classify new descriptions
def classify_description(description):
    """Predict the category name for a single description.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``labels`` objects.

    Args:
        description: Text to classify.

    Returns:
        The entry of ``labels`` at the index of the highest logit.
    """
    encoded = tokenizer(description, return_tensors="pt", truncation=True, padding=True)
    # The inputs have to live on the same device as the model.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(**encoded).logits
    best_index = torch.argmax(logits, dim=1).item()
    return labels[best_index]

# Example usage
# Classify one hand-written description and print the predicted category.
new_description = "I teach english "
predicted_category = classify_description(new_description)
print(f"Predicted category: {predicted_category}")



Predicted category: /Jobs & Education/Education
