In [48]:
import os
from sklearn.model_selection import train_test_split

texts = []
labels = []

base_dir = "./dataset"
categories = ["productive", "unproductive"]

for idx, category in enumerate(categories):
    folder = os.path.join(base_dir, category)
    for fname in os.listdir(folder):
        path = os.path.join(folder, fname)
        with open(path, "r", encoding="utf-8") as f:
            text = f.read().strip()
            if len(text.split()) >= 50:  # filter tiny articles
                texts.append(text)
                labels.append(idx)  # 0 = productive, 1 = unproductive

# Combine texts and labels
data = list(zip(texts, labels))

# Remove exact duplicates
data = list(dict.fromkeys(data))  # preserves order, removes duplicates

print(f"Total samples after removing duplicates: {len(data)}")


# Split
import random
random.seed(42)
random.shuffle(data)

split_idx = int(0.8 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

X_train, y_train = zip(*train_data)
X_val, y_val = zip(*val_data)

# Check again for overlap
overlap = set(X_train) & set(X_val)
print(f"Overlapping samples: {len(overlap)}")
assert len(overlap) == 0, "Duplicates detected between train and validation!"



Total samples after removing duplicates: 3075
Overlapping samples: 0


In [49]:
from collections import Counter
print("Train distribution:", Counter(y_train))
print("Validation distribution:", Counter(y_val))


Train distribution: Counter({1: 1279, 0: 1181})
Validation distribution: Counter({0: 310, 1: 305})


In [50]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=512)


In [51]:
import torch

class ProductivityDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = ProductivityDataset(train_encodings, y_train)
val_dataset = ProductivityDataset(val_encodings, y_val)


In [52]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2  # productive / unproductive
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)


In [54]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [55]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.028236,0.993496,0.993443,0.993443,0.993443
2,No log,0.025039,0.995122,0.993464,0.996721,0.99509
3,No log,0.021549,0.996748,0.996721,0.996721,0.996721


RuntimeError: [enforce fail at inline_container.cc:664] . unexpected pos 28800 vs 28692

In [56]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch

# Path to your trained weights
model_path = r"C:\my_notebook\eda\results\checkpoint-462"

tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.eval()  # set to evaluation mode


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [73]:
text = """

Learn Exploratory Data Analysis (EDA) from Scratch | EDA in 5 hours | Satyajit Pattnaik

Satyajit Pattnaik
106k subscribers

Subscribe

6k


Share

Ask

Download

215,099 views  Premiered on 17 Sept 2023
Become a Data Analyst now!!

Explore: https://www.udemy.com/course/data-ana...

Are you ready to unleash the power of data analysis and gain valuable insights in just 5 hours? Look no further! In this comprehensive video course, we'll guide you through the fundamentals of Exploratory Data Analysis (EDA) from scratch, helping you unlock the potential of your data like never before.

üìä Dive into the world of EDA: You'll learn how to navigate and understand your data, identify patterns, and reveal hidden trends. From data visualization techniques to statistical analysis, this course covers it all!

üìà Gain practical knowledge: Our hands-on approach ensures that you not only grasp the theoretical concepts but also apply them to real-world scenarios. Through interactive exercises and examples, you'll develop the skills to clean, preprocess, and analyze data with confidence.

üîç Uncover valuable insights: EDA is all about uncovering the story behind the data. We'll teach you how to ask the right questions, interpret your findings, and communicate your insights effectively. Whether you're a beginner or an experienced professional, this course will take your data analysis skills to the next level.

ùêìùê¢ùê¶ùêûùêãùê¢ùêßùêû: 

‚úÖ 2:35 Agenda
‚úÖ 5:20 DA/DS Process
‚úÖ 11:58 What is EDA?
‚úÖ 15:16 What is Visualization?
‚úÖ 20:17 Steps in EDA
                   ‚úÖ 20:20 Data Sourcing
                   ‚úÖ 24:50 Data Cleaning
                   ‚úÖ 47:23 Feature Scaling
                   ‚úÖ 1:19:25 Outlier Treatment
                   ‚úÖ 1:42:42 Invalid Data
‚úÖ 1:47:43 Types of Data
‚úÖ 1:50:36 Types of Analysis
                   ‚úÖ 1:51:00 Univariate Analysis
                   ‚úÖ 2:02:26 Bivariate Analysis
                   ‚úÖ 2:07:47 Multivariate Analysis
‚úÖ 2:43:38 Derived Metrics
                   ‚úÖ 2:48:19 Feature Binning
                   ‚úÖ 3:06:03 Feature Encoding
"""  # your full input

inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512
)


In [74]:
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    scores = torch.softmax(logits, dim=1).squeeze().tolist()

print(f"Predicted class: {predicted_class}")  # 0 = productive, 1 = unproductive (your label mapping)
print(f"Class probabilities: {scores}")


Predicted class: 0
Class probabilities: [0.9997738003730774, 0.00022625051497016102]
