# Import Libraries

In [12]:
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import nltk
import re

# Data Preprocessing

In [5]:
 # Load Dataset

data = pd.read_csv("/content/drive/MyDrive/university_text_classification_220k.csv")
data.head()

Unnamed: 0,ticket_text,category
0,acil muhasebe internet bağlantısı,Bilgisayar & Bakım
1,"Sayın Yetkili, mali sistem'de bulunan sistemde...",Web ve Yazılım
2,"Kolay gelsin, toplantı salonu - kürsü yapımı p...",Mobilya & Marangozluk
3,2. kat binasında duvar yıkım sorunu yaşıyoruz.,İnşaat
4,"Değerli ekip, kafeterya'de tesisat bakımı konu...",Tesisat


In [6]:
# Check the class distribution and missing values

print(data['category'].value_counts())
print("\n\n\n Missing Values: ",data.isnull().sum())

category
Bilgisayar & Bakım                            10000
Web ve Yazılım                                10000
Mobilya & Marangozluk                         10000
İnşaat                                        10000
Tesisat                                       10000
Şehir içi Taşıt Talepleri                     10000
Diploma Sistemi                               10000
Uzaktan Eğitim                                10000
Taşıma Hizmetleri                             10000
İşçi Onaylı Bordro ve Gelir Belgesi Talebi    10000
EBYS                                          10000
Kampüs İçi Taşıt Talepleri                    10000
Asansör                                       10000
Teknik Destek                                 10000
Mobil Uygulama                                10000
Tahakkuk Birimi                               10000
Öğrenci İşleri Daire Başkanlığı               10000
Telefon                                       10000
Temizlik Hizmetleri                           10000
Isı

In [7]:
nltk.download("stopwords")
stop_words = set(stopwords.words('turkish'))



def clean_text(text):
    """
    Clean the input text by performing the following steps:
    - Convert to lowercase
    - Remove punctuation
    - Remove numbers
    - Remove stopwords using NLTK's Turkish stopwords
    - Remove special characters/emojis
    """

    text = str(text)
    text = text.lower()

    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Remove stopwords using NLTK's Turkish stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)

    # Remove special characters/emojis (keeps Turkish characters: ç, ğ, ı, ö, ş, ü)
    text = re.sub(r'[^\u00E7\u011F\u0131\u00F6\u015F\u00FC\s\w]', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text




data["cleaned_text"] = data["ticket_text"].apply(clean_text)
print(data["cleaned_text"])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0                         acil muhasebe internet bağlantısı
1         sayın yetkili mali sistemde bulunan sistemde h...
2         kolay gelsin toplantı salonu kürsü yapımı prob...
3                kat binasında duvar yıkım sorunu yaşıyoruz
4         değerli ekip kafeteryade tesisat bakımı konusu...
                                ...                        
219995                 öğrenci yurdu elektrik panosu sorunu
219996    bölüm başkanlığı binasında telefon bakımı soru...
219997    merhaba bölüm ders programıdeki ders programı ...
219998    merhabalar sekreterlik lokasyonunda yaşanan do...
219999    öğrenci portalı binasında kurs erişimi sorunu ...
Name: cleaned_text, Length: 220000, dtype: object


In [8]:
def split_data(data, text_column, label_column):
    """
    Split the dataset into training (80%), validation (10%), and test (10%) sets.
    """
    X = data[text_column]  # Use cleaned_text column from previous step
    y = data[label_column]

    # Split into train (80%) and temp (20%)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

    # Split temp into validation (50%) and test (50%) to get 10% each
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    print("\nDataset Split Sizes:")
    print(f"Training set: {len(X_train)} samples")
    print(f"Validation set: {len(X_val)} samples")
    print(f"Test set: {len(X_test)} samples")

    return X_train, X_val, X_test, y_train, y_val, y_test


# Run the split
X_train, X_val, X_test, y_train, y_val, y_test = split_data(data, 'cleaned_text', 'category')


Dataset Split Sizes:
Training set: 176000 samples
Validation set: 22000 samples
Test set: 22000 samples


In [9]:
# Import necessary libraries


def vectorize_bert(X_train, X_val, X_test, model_name="dbmdz/bert-base-turkish-cased", batch_size=32):
    """
    Convert text data to BERT embeddings using a pre-trained Turkish BERT model.
    - Generates [CLS] token embeddings for each text
    - Processes texts in batches to handle large datasets
    """
    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    def get_sentence_embedding(texts):
        """Generate BERT embeddings for a batch of texts."""
        # Tokenize texts
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
        # Move inputs to device
        inputs = {key: val.to(device) for key, val in inputs.items()}
        # Get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract [CLS] token embeddings (first token)
        return outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Process texts in batches
    def process_in_batches(texts):
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            batch_embeddings = get_sentence_embedding(batch_texts)
            embeddings.append(batch_embeddings)
        return np.vstack(embeddings)

    # Convert train, validation, and test texts to BERT embeddings
    X_train_bert = process_in_batches(X_train.tolist())
    X_val_bert = process_in_batches(X_val.tolist())
    X_test_bert = process_in_batches(X_test.tolist())

    print("\nBERT Embedding Shapes:")
    print(f"Training: {X_train_bert.shape}")
    print(f"Validation: {X_val_bert.shape}")
    print(f"Test: {X_test_bert.shape}")

    return X_train_bert, X_val_bert, X_test_bert, tokenizer, model


In [13]:
    X_train, X_temp, y_train, y_temp = train_test_split(data['cleaned_text'], data['category'], test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Run BERT vectorization
    X_train_bert, X_val_bert, X_test_bert, tokenizer, model = vectorize_bert(X_train, X_val, X_test)

KeyboardInterrupt: 