In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [None]:
!pip install nltk



In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK "stopwords" package. Per the recorded output it is stored under
# /root/nltk_data and the call reports "already up-to-date" when it is present.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Turkish stop-word list; passed to TfidfVectorizer(stop_words=...) further down.
turkish_stopwords = stopwords.words('turkish')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under
# /content/drive/MyDrive. Only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dataset locations on the mounted Drive: the label CSV and the gzipped JSONL dump.
train_classification_path = "/content/drive/MyDrive/yeni_released_dataset/train-classification.csv"
train_data_path = "/content/drive/MyDrive/yeni_released_dataset/training-dataset.jsonl.gz"

# Read the labels and give the two columns descriptive names in one chained step.
train_classification_df = (
    pd.read_csv(train_classification_path)
    .rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})
)

# Category names are compared case-insensitively downstream, so lower-case them once here.
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)

# user_id -> category lookup; membership in this dict is what marks an account as labelled.
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

In [None]:
import gzip
import json

# Split the raw dump into labelled (train) and unlabelled (test) accounts.
# Each dict is keyed by username; *posts* holds the account's list of posts,
# *profile* holds the account's profile record.
username2posts_train = {}
username2profile_train = {}
username2posts_test = {}
username2profile_test = {}

# The file is JSON Lines: one JSON object per line with "profile" and "posts" keys.
# Decode explicitly as UTF-8 so Turkish characters are read identically regardless
# of the runtime's locale (text-mode open otherwise falls back to the locale encoding).
with gzip.open(train_data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            continue

        sample = json.loads(line)
        profile = sample["profile"]
        username = profile["username"]

        if username in username2_category:  # Labelled account -> training data
            username2posts_train[username] = sample["posts"]
            username2profile_train[username] = profile
        else:  # No label available -> test data
            username2posts_test[username] = sample["posts"]
            username2profile_test[username] = profile

In [None]:
# Post-level training table: one row per post, carrying the owner's username and
# category alongside the caption text.
train_data = [
    {
        'username': username,
        'category': username2_category[username],
        # `.get('caption', '')` only covers a *missing* key; if the key is present
        # with a null value it would still return None. `or ''` maps both cases to
        # an empty string, which is what the column is meant to contain.
        'post': post.get('caption') or '',
    }
    for username, posts in username2posts_train.items()
    for post in posts
]

df_train = pd.DataFrame(train_data)


In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/586.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import re
import emoji

def preprocess_text(text: str):
    """Normalise a raw caption before it is vectorised.

    Steps, in order:
      1. emojis are replaced by their textual names (space-delimited),
      2. the text is case-folded,
      3. URLs (``http...``, ``https...``, ``www...``) are dropped,
      4. every character other than a-z, the Turkish letters ç ğ ı ö ş ü,
         digits, whitespace, ``#`` and ``@`` is removed,
      5. digit runs are removed,
      6. whitespace runs collapse to a single space and the ends are trimmed.

    Returns the cleaned string (possibly empty).
    """
    # Emoji -> words first, so the names survive the character filter below.
    cleaned = emoji.demojize(text, delimiters=(" ", " ")).casefold()

    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'[^a-zçğıöşü0-9\s#@]', '', 0),                 # special characters (keeps # and @)
        (r'\d+', '', 0),                                 # numbers
        (r'\s+', ' ', 0),                                # extra whitespace
    )
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()



In [None]:
# Corpus preparation: one document per user, built from that user's cleaned captions.
def _join_clean_captions(posts):
    """Return a user's captions, cleaned with preprocess_text and joined by newlines.

    Posts whose caption is missing or empty are skipped. The result is an empty
    string when the user has no caption at all.
    """
    return "\n".join(
        preprocess_text(post["caption"]) for post in posts if post.get("caption")
    )


# --- Training documents -------------------------------------------------------
corpus = []
train_usernames = []

for username, posts in username2posts_train.items():
    user_post_captions = _join_clean_captions(posts)

    # Keep only labelled users that actually have caption text.
    if user_post_captions and username in username2_category:
        train_usernames.append(username)
        corpus.append(user_post_captions)

# Labels aligned one-to-one with `corpus` / `train_usernames`.
y_train = [username2_category[uname] for uname in train_usernames]
assert len(corpus) == len(y_train)

# TF-IDF with bi-grams and sublinear term frequency
vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=5000, ngram_range=(1, 2), sublinear_tf=True)
x_post_train = vectorizer.fit_transform(corpus)

# Dimensionality reduction with TruncatedSVD. The default solver is randomized, so
# seed it (same seed as the split and the classifier) to make re-runs reproducible.
svd = TruncatedSVD(n_components=300, random_state=42)
x_post_train = svd.fit_transform(x_post_train)


# --- Test documents -----------------------------------------------------------
# Every test user is kept, even with no captions (they get an empty document), so
# that each one still receives a prediction.
test_usernames = list(username2posts_test)
test_corpus = [_join_clean_captions(posts) for posts in username2posts_test.values()]

# Transform the test set with the vectorizer and SVD fitted on the training data only.
x_post_test = vectorizer.transform(test_corpus)
x_post_test = svd.transform(x_post_test)
assert x_post_test.shape[0] == len(test_usernames)

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild the full, user-level label list on every run instead of reading `y_train`.
# `y_train` is also an *output* of this cell, so feeding it back in made the cell
# non-idempotent: a second run paired the full feature matrix with the already
# reduced label list and failed with an inconsistent-sample-count error.
y_post_train = [username2_category[uname] for uname in train_usernames]
assert x_post_train.shape[0] == len(y_post_train), "features and labels are out of sync"

x_train, x_val, y_train, y_val = train_test_split(
    x_post_train,
    y_post_train,
    test_size=0.2,
    stratify=y_post_train,
    random_state=42,
)

In [None]:
# Train SGDClassifier
from sklearn.linear_model import SGDClassifier

# Fit and evaluate on the arrays produced by train_test_split (x_train / x_val /
# y_train / y_val). The capitalised X_train / X_test / y_test used previously are
# never defined in this notebook, so the cell either raised NameError on a fresh
# kernel or silently picked up stale variables of a different length.
sgd_model = SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)
sgd_model.fit(x_train, y_train)
y_pred_sgd = sgd_model.predict(x_val)

print(f"Accuracy: {accuracy_score(y_val, y_pred_sgd):.4f}")

ValueError: Found input variables with inconsistent numbers of samples: [75859, 2188]