In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

In [9]:
import pandas as pd
import json
import gzip

# Step 1: Define the file path
file_path = 'training-dataset.jsonl.gz'

# Step 2: Read and extract posts with profile data
all_posts = []

with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for line_number, line in enumerate(f, 1):
        try:
            data = json.loads(line)

            # Extract profile information
            profile = data.get('profile', {})
            username = profile.get('username')
            follower_count = profile.get('follower_count')
            is_verified = profile.get('is_verified')

            # Extract posts
            posts = data.get('posts', [])
            for post in posts:
                # Associate profile data with each post
                post['username'] = username
                post['follower_count'] = follower_count
                post['is_verified'] = is_verified

                # Append the enriched post to the list
                all_posts.append(post)

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON on line {line_number}: {e}")

# Step 3: Convert to DataFrame and set 'id' as index
df = pd.DataFrame(all_posts)

if 'id' in df.columns:
    df = df.set_index('id')
else:
    raise KeyError("The 'id' column is missing from the posts data.")

# Step 3.2: Replace 'null' strings in 'caption' with empty strings**
if 'caption' in df.columns:
    df['caption'] = df['caption'].fillna('')
else:
    print("Warning: Column 'caption' not found in DataFrame.")

# Step 4: Clean and Convert Numeric Columns
numeric_columns = ['comments_count', 'like_count', 'follower_count']
for col in numeric_columns:
    if col in df.columns:
        # Replace 'null' strings with '0'
        df[col] = df[col].replace('null', '0')
        # Convert to numeric, coercing errors to NaN
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Fill NaN values with 0
        df[col] = df[col].fillna(0)
    else:
        print(f"Warning: Column '{col}' not found in DataFrame.")

# Step 5: Map 'media_type' to numeric values
if 'media_type' in df.columns:
    df['media_type'] = df['media_type'].map({'VIDEO': 2, 'IMAGE': 1, 'CAROUSEL_ALBUM': 0})
    # Convert to numeric, coercing errors to NaN
    df['media_type'] = pd.to_numeric(df['media_type'], errors='coerce')
    # Fill NaN values with a default value (e.g., -1) for unmapped types
    df['media_type'] = df['media_type'].fillna(-1)
else:
    print("Warning: Column 'media_type' not found in DataFrame.")

# Step 6: Convert 'is_verified' to numeric
if 'is_verified' in df.columns:
    df['is_verified'] = df['is_verified'].map({True: 1, False: 0})
    # Convert to numeric, coercing errors to NaN
    df['is_verified'] = pd.to_numeric(df['is_verified'], errors='coerce')
    # Fill NaN values with 0 (assuming unverified if missing)
    df['is_verified'] = df['is_verified'].fillna(0)
else:
    print("Warning: Column 'is_verified' not found in DataFrame.")

# Step 7: Handle 'username' Column
# Example: Encode 'username' as categorical codes
if 'username' in df.columns:
    df['username_encoded'] = df['username'].astype('category').cat.codes
else:
    print("Warning: Column 'username' not found in DataFrame.")

# Alternatively, uncomment the following lines for one-hot encoding
# if 'username' in df.columns:
#     df = pd.get_dummies(df, columns=['username'], prefix='user')
# else:
#     print("Warning: Column 'username' not found in DataFrame.")

# Step 8: Drop 'media_url' column
if 'media_url' in df.columns:
    df = df.drop(columns=['media_url'])
else:
    print("Warning: Column 'media_url' not found in DataFrame.")

# Step 9: Convert 'timestamp' to datetime
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
else:
    print("Warning: Column 'timestamp' not found in DataFrame.")

# Step 11: Feature Engineering (Optional)
# Example: Extract hour and day of week from timestamp
if 'timestamp' in df.columns:
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month

# Example: Create 'likes_per_comment' feature
df['likes_per_comment'] = df['like_count'] / (df['comments_count'] + 1)

# Step 12: Data Validation
# Check for duplicates
if df.index.is_unique:
    print("All IDs are unique.")
else:
    print(f"Found {df.index.duplicated().sum()} duplicate IDs.")
    df = df[~df.index.duplicated(keep='first')]

All IDs are unique.


In [10]:
df

Unnamed: 0_level_0,caption,comments_count,like_count,media_type,timestamp,username,follower_count,is_verified,username_encoded,hour,day_of_week,month,likes_per_comment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
17990918969458720,Cumhuriyetimizin 100.yılı kutlu olsun♾️🇹🇷,0,6.0,1,2023-10-29 09:12:30,deparmedya,1167,0,1303,9,6,10,6.000000
18219250732221045,Oriflame Duologi Lansmanı #isveçtengelengüzell...,1,22.0,2,2023-08-08 19:11:13,deparmedya,1167,0,1303,19,1,8,11.000000
18311380465102328,#oriflameilesaçbakımdevrimi ✌️,0,19.0,2,2023-08-07 21:40:54,deparmedya,1167,0,1303,21,0,8,19.000000
18089518138361507,✌️#oriflameilesaçbakımdevrimi 07Agustos’23 ori...,1,19.0,2,2023-08-07 21:27:48,deparmedya,1167,0,1303,21,0,8,9.500000
18012743929758497,07 Agustos’23 #oriflameturkiye #duoloji,0,21.0,2,2023-08-07 21:12:06,deparmedya,1167,0,1303,21,0,8,21.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17970896356402818,Alışılmışın Dışına Çık!\n\n📍İşçi Blokları mh. ...,1,34.0,0,2021-06-07 07:49:51,tetrancaffeine,1672,0,4780,7,0,6,17.000000
17930502130524900,Bayramınız Kutlu olsun🥳🤩,0,36.0,1,2021-05-13 12:21:14,tetrancaffeine,1672,0,4780,12,3,5,36.000000
17889308990150405,Ben Bir Kahve Aşığıyım!\n\nİddialı Kahvelerin ...,2,79.0,1,2021-05-08 16:02:10,tetrancaffeine,1672,0,4780,16,5,5,26.333333
17869632995380257,"Sonuçlar 332plus sayfasından,11.05.2021 tarihi...",380,199.0,1,2021-05-07 15:31:15,tetrancaffeine,1672,0,4780,15,4,5,0.522310


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 187302 entries, 17990918969458720 to 17918493004647925
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   caption            187302 non-null  object        
 1   comments_count     187302 non-null  int64         
 2   like_count         187302 non-null  float64       
 3   media_type         187302 non-null  int64         
 4   timestamp          187302 non-null  datetime64[ns]
 5   username           187302 non-null  object        
 6   follower_count     187302 non-null  int64         
 7   is_verified        187302 non-null  int64         
 8   username_encoded   187302 non-null  int16         
 9   hour               187302 non-null  int32         
 10  day_of_week        187302 non-null  int32         
 11  month              187302 non-null  int32         
 12  likes_per_comment  187302 non-null  float64       
dtypes: datetime64[ns](1), 

In [12]:
# Display first few rows
print(df.head())

                                                             caption  \
id                                                                     
17990918969458720          Cumhuriyetimizin 100.yılı kutlu olsun♾️🇹🇷   
18219250732221045  Oriflame Duologi Lansmanı #isveçtengelengüzell...   
18311380465102328                     #oriflameilesaçbakımdevrimi ✌️   
18089518138361507  ✌️#oriflameilesaçbakımdevrimi 07Agustos’23 ori...   
18012743929758497            07 Agustos’23 #oriflameturkiye #duoloji   

                   comments_count  like_count  media_type           timestamp  \
id                                                                              
17990918969458720               0         6.0           1 2023-10-29 09:12:30   
18219250732221045               1        22.0           2 2023-08-08 19:11:13   
18311380465102328               0        19.0           2 2023-08-07 21:40:54   
18089518138361507               1        19.0           2 2023-08-07 21:27:48   
180127439

In [13]:
# Check for missing values
print(df.isnull().sum())

caption              0
comments_count       0
like_count           0
media_type           0
timestamp            0
username             0
follower_count       0
is_verified          0
username_encoded     0
hour                 0
day_of_week          0
month                0
likes_per_comment    0
dtype: int64


In [14]:
# Load the tokenizer and model
model_name = 'dbmdz/bert-base-turkish-128k-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

# If you have a GPU, you can move the model to GPU for faster processing
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/386 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/740M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(128000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [15]:
def get_bert_embedding(text, tokenizer, model, device, max_length=128):
    """
    Generates BERT embedding for a given text using the [CLS] token.
    """
    with torch.no_grad():
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Extract the [CLS] token's embedding
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

    return cls_embedding

In [16]:
# Function to generate embeddings for the entire dataset
def generate_embeddings(df, text_column, tokenizer, model, device, max_length=128, batch_size=512):
    embeddings = []
    texts = df[text_column].astype(str).tolist()
    num_texts = len(texts)

    for i in tqdm(range(0, num_texts, batch_size), desc="Generating BERT Embeddings"):
        batch_texts = texts[i:i+batch_size]
        with torch.no_grad():
            inputs = tokenizer.batch_encode_plus(
                batch_texts,
                add_special_tokens=True,
                max_length=max_length,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            )

            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Extract the [CLS] token's embeddings
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embeddings)

    # Concatenate all batch embeddings
    embeddings = np.vstack(embeddings)
    return embeddings

# Generate embeddings
bert_embeddings = generate_embeddings(df, 'caption', tokenizer, model, device)

Generating BERT Embeddings: 100%|██████████| 366/366 [24:46<00:00,  4.06s/it]


In [17]:
# Convert embeddings to a DataFrame
embedding_df = pd.DataFrame(bert_embeddings,
                            columns=[f'bert_{i}' for i in range(bert_embeddings.shape[1])])

# Concatenate with the original DataFrame
df = pd.concat([df.reset_index(drop=True), embedding_df.reset_index(drop=True)], axis=1)

In [18]:
# Select BERT embedding columns
bert_cols = [col for col in df.columns if col.startswith('bert_')]

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the embeddings
scaled_embeddings = scaler.fit_transform(df[bert_cols])

In [19]:
# Initialize PCA
pca = PCA(n_components=0.95, random_state=42)  # Retain 95% variance

# Fit and transform
pca_embeddings = pca.fit_transform(scaled_embeddings)

print(f'Original BERT dimensions: {scaled_embeddings.shape[1]}')
print(f'Reduced dimensions after PCA: {pca_embeddings.shape[1]}')

Original BERT dimensions: 768
Reduced dimensions after PCA: 239


In [20]:
# Convert PCA embeddings to a DataFrame
pca_df = pd.DataFrame(pca_embeddings,
                      columns=[f'pca_{i}' for i in range(pca_embeddings.shape[1])])

# Concatenate with the original DataFrame
df = pd.concat([df.reset_index(drop=True), pca_df.reset_index(drop=True)], axis=1)

# Drop the original BERT embedding columns to save memory
df.drop(bert_cols, axis=1, inplace=True)

In [21]:
# Identify all feature columns except the target variable (like_count)
# Assuming 'like_count' is your target variable

feature_cols = [
    'comments_count', 'media_type', 'follower_count', 'is_verified',
    'username_encoded', 'hour', 'day_of_week', 'month', 'likes_per_comment'
]

# Add PCA features
pca_feature_cols = [col for col in df.columns if col.startswith('pca_')]
feature_cols += pca_feature_cols

# Define target variable
target = 'like_count'

In [22]:
# Check for missing values
print(df[feature_cols + [target]].isnull().sum())

# If there are missing values, decide on a strategy (e.g., imputation)
# For simplicity, we'll drop rows with missing values
df.dropna(subset=feature_cols + [target], inplace=True)

comments_count      0
media_type          0
follower_count      0
is_verified         0
username_encoded    0
                   ..
pca_235             0
pca_236             0
pca_237             0
pca_238             0
like_count          0
Length: 249, dtype: int64


In [23]:
X = df[feature_cols]
y = df[target]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)

In [25]:
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    return rmse

In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb

# Function to evaluate model performance
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse

# Scale features for Linear Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train Linear Regression model
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

# Evaluate Linear Regression model
lr_rmse = evaluate_model(lr, X_test_scaled, y_test)

# XGBoost: Specify objective and hyperparameters
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100
)

early_stopping = xgb_reg.fit(
    X_train, y_train)

# Evaluate XGBoost model
xgb_rmse = evaluate_model(xgb_reg, X_test, y_test)

print(f'Linear Regression RMSE: {lr_rmse:.4f}')
print(f'XGBoost Regressor RMSE: {xgb_rmse:.4f}')

Linear Regression RMSE: 41636.1485
XGBoost Regressor RMSE: 10185.9147


In [57]:
rmse_values = {
    'Linear Regression': lr_rmse,
    'XGBoost': xgb_rmse
}

# Display RMSE values
for model_name, rmse in rmse_values.items():
    print(f'{model_name} RMSE: {rmse:.4f}')

# Find the model with the lowest RMSE
best_model_name = min(rmse_values, key=rmse_values.get)
best_rmse = rmse_values[best_model_name]
print(f'\nBest Model: {best_model_name} with RMSE: {best_rmse:.4f}')

Linear Regression RMSE: 41636.1485
XGBoost RMSE: 10185.9147

Best Model: XGBoost with RMSE: 10185.9147
