# Import Dependencies

In [2]:
import numpy as np
import pandas as pd
import gzip
import json

from pprint import pprint

In [3]:
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Influencer Category Classification



1.   Read Data
2.   Preprocess Data
3.   Prepare Model
4.   Predict Test Data
4.   Save outputs



In [4]:
train_classification_df = pd.read_csv("train-classification.csv",)
train_classification_df = train_classification_df.rename(columns={'username': 'user_id', 'label': 'category'})



In [5]:
train_classification_df.head()

Unnamed: 0,user_id,category
0,taskirancemal,Mom and Children
1,tam_kararinda,Food
2,spart4nn,Food
3,sosyalyiyiciler,Food
4,sonaydizdarahad,Mom and Children


In [6]:
# Unifying labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

In [7]:
# stats about the labels
train_classification_df.groupby("category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,191
entertainment,323
fashion,299
food,511
gaming,13
health and lifestyle,503
mom and children,149
sports,113
tech,346
travel,294


In [8]:
username2_category = {key.strip(): value for key, value in username2_category.items()}

# Print the cleaned dictionary
print(username2_category)

{'taskirancemal': 'mom and children', 'tam_kararinda': 'food', 'spart4nn': 'food', 'sosyalyiyiciler': 'food', 'sonaydizdarahad': 'mom and children', 'somersivrioglu': 'food', 'sinankoc': 'travel', 'simulasyonturk': 'gaming', 'savas_karakas_sudaki_izler': 'travel', 'sakinenurunannesi': 'mom and children', 'ruyabuyuktetik': 'fashion', 'raykakumru': 'health and lifestyle', 'pintipanda': 'gaming', 'pinarindepresyonu': 'health and lifestyle', 'pinarhotic': 'health and lifestyle', 'pinardonmez_': 'mom and children', 'ozgeninoltasi': 'food', 'nayaozgun': 'fashion', 'nataliyarcan': 'health and lifestyle', 'muthispsikoloji': 'health and lifestyle', 'murattekecicom': 'tech', 'mrsisbeceren': 'mom and children', 'mosyosokola': 'food', 'mirandaorlayn': 'entertainment', 'mezarci_pubg': 'gaming', 'mervedemireltaskiran': 'travel', 'lifewithapineapple': 'travel', 'lalsefkatli': 'travel', 'kutupanne': 'mom and children', 'kosifcihan': 'health and lifestyle', 'kisikatescom': 'food', 'kaankural2': 'sports

In [9]:
username2_category["sosyalyiyiciler"]

'food'

In [10]:
train_data_path = "training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()


with gzip.open(train_data_path, "rt") as fh:
  for line in fh:
    sample = json.loads(line)

    profile = sample["profile"]
    username = profile["username"]
    if username in username2_category:
      # train data info
      username2posts_train[username] = sample["posts"]
      username2profile_train[username] = profile


    else:
      # it is test data info
      username2posts_test[username] = sample["posts"]
      username2profile_test[username] = profile


In [11]:
# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

train_profile_df.head(2)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,Local business,,1167,192,True,False,...,,,LOCAL,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,kafesfirin,266439571,KAFES FIRIN,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,Brand,,11997,17,True,False,...,,,BRAND,False,False,https://instagram.fada1-13.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [12]:
test_profile_df.head(2)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,Personal blog,,1265,665,True,False,...,,,PERSONAL_BLOG,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,totalenergies_istasyonlari,7066643793,TotalEnergies İstasyonları,TotalEnergies İstasyonları resmi Instagram hes...,Energy Company,,28025,4,True,False,...,,,ENERGY_COMPANY,False,False,https://instagram.fsaw2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


Here our corpus is all of our post data
so we should concat aggregate all of them and provide as a corpus to the Tf-IDF vectorizer of the scikit-learn.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def preprocess_text(text: str):
    # lower casing Turkish Text, Don't use str.lower :)
    text = text.casefold()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and punctuation
    # HERE THE EMOJIS stuff are being removed, you may want to keep them :D
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


corpus = []

# to keep the label order
train_usernames = []

for username, posts in username2posts_train.items():
  train_usernames.append(username)

  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)


  # joining the posts of each user with a \n
  user_post_captions = "\n".join(cleaned_captions)
  corpus.append(user_post_captions)


vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=3000)

# fit the vectorizer
vectorizer.fit(corpus)


# transform the data into vectors
x_post_train = vectorizer.transform(corpus)
y_train = [username2_category.get(uname, "NA") for uname in train_usernames]


test_usernames = []
test_corpus = []
for username, posts in username2posts_test.items():
  test_usernames.append(username)
  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)

  user_post_captions = "\n".join(cleaned_captions)
  test_corpus.append(user_post_captions)


# Just transforming! No Fitting!!!!!
x_post_test = vectorizer.transform(test_corpus)

In [14]:
# Making sure everything is fine
assert y_train.count("NA") == 0

In [15]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['about', 'acil', 'acı', ..., 'şık', 'şıklık', 'şıklığı'],
      dtype=object)

In [16]:
df_tfidf = pd.DataFrame(x_post_train.toarray(), columns=feature_names)
df_tfidf.head(2)

Unnamed: 0,about,acil,acı,ada,adam,adana,aday,add,adet,adlı,...,şimdi,şimdiden,şirket,şubat,şube,şubemiz,şöyle,şık,şıklık,şıklığı
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068381,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057254,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.029673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
print(df_tfidf.shape)
print(len(y_train))

(2741, 3000)
2741


In [18]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(df_tfidf, y_train, test_size=0.2, stratify=y_train)

In [19]:
# Print the sizes to verify
print("Training set size:", len(x_train), len(y_train))
print("Validation set size:", len(x_val), len(y_val) )
#print("Test set size:", len(x_test), len(y_test) )

Training set size: 2192 2192
Validation set size: 549 549


In [20]:
#x_train.to_csv("x_train.csv", index=False)
#x_val.to_csv("x_val.csv", index=False)
#x_test.to_csv("x_test.csv", index=False)
#
#pd.DataFrame(y_train, columns=["label"]).to_csv("y_train.csv", index=False)
#pd.DataFrame(y_val, columns=["label"]).to_csv("y_val.csv", index=False)
#pd.DataFrame(y_test, columns=["label"]).to_csv("y_test.csv", index=False)

In [20]:
import numpy as np
import pandas as pd
import json
from pprint import pprint

# Naive Base Classifier

### Now we can pass the numerical values to a classifier, Let's try Naive Base!


In [21]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


model = MultinomialNB()
model.fit(x_train, y_train)

In [22]:
#@title Train Data
y_train_pred = model.predict(x_train)

print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, zero_division=0))

Accuracy: 0.6496350364963503

Classification Report:
                      precision    recall  f1-score   support

                 art       0.93      0.08      0.16       153
       entertainment       0.60      0.59      0.60       258
             fashion       0.73      0.74      0.74       239
                food       0.80      0.88      0.84       409
              gaming       0.00      0.00      0.00        10
health and lifestyle       0.52      0.81      0.63       402
    mom and children       0.88      0.06      0.11       119
              sports       1.00      0.10      0.18        90
                tech       0.73      0.81      0.77       277
              travel       0.55      0.66      0.60       235

            accuracy                           0.65      2192
           macro avg       0.67      0.47      0.46      2192
        weighted avg       0.70      0.65      0.61      2192



In [23]:
#@title Validation Data
y_val_pred = model.predict(x_val)

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))

Accuracy: 0.5719489981785064

Classification Report:
                      precision    recall  f1-score   support

                 art       0.50      0.03      0.05        38
       entertainment       0.39      0.42      0.40        65
             fashion       0.65      0.52      0.57        60
                food       0.76      0.82      0.79       102
              gaming       0.00      0.00      0.00         3
health and lifestyle       0.46      0.79      0.58       100
    mom and children       1.00      0.10      0.18        30
              sports       1.00      0.04      0.08        23
                tech       0.71      0.70      0.70        69
              travel       0.53      0.68      0.59        59

            accuracy                           0.57       549
           macro avg       0.60      0.41      0.40       549
        weighted avg       0.62      0.57      0.53       549



In [25]:
'''
y_test_pred = model.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, zero_division=0)) '''

' \ny_test_pred = model.predict(x_test)\n\nprint("Accuracy:", accuracy_score(y_test, y_test_pred))\nprint("\nClassification Report:")\nprint(classification_report(y_test, y_test_pred, zero_division=0)) '

In [24]:
#@title Test Data


# let's take a look at the first 5 lines of the file
test_data_path = "test-classification-round3.dat"

test_unames = []
with open(test_data_path, "rt") as fh:
  for line in fh:
    test_unames.append(line.strip())

print(test_unames[:5])

['livapastanesi', 'barisgross', 'tusasshop', 'etolyadigital', 'tugrulonur']


In [25]:
x_test = []

for uname in test_unames:
  try:
    index = test_usernames.index(uname)
    x_test.append(x_post_test[index].toarray()[0])
  except Exception as e:
    try:
      index = train_usernames.index(uname)
      x_test.append(x_post_train[index].toarray()[0])
    except Exception as e:
      print(uname)


#test_unames.remove("screenname")

In [26]:
df_test = pd.DataFrame(np.array(x_test), columns=feature_names)
df_test.head(5)

Unnamed: 0,about,acil,acı,ada,adam,adana,aday,add,adet,adlı,...,şimdi,şimdiden,şirket,şubat,şube,şubemiz,şöyle,şık,şıklık,şıklığı
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068703,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.042433,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.086066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012273,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
test_pred = model.predict(df_test)

output = dict()
for index, uname in enumerate(test_unames):
  output[uname] = test_pred[index]

In [28]:
print(output)

{'livapastanesi': 'food', 'barisgross': 'food', 'tusasshop': 'tech', 'etolyadigital': 'tech', 'tugrulonur': 'entertainment', 'tulugozlu': 'entertainment', 'gokidy': 'tech', 'cengizgumus_official': 'fashion', 'krossbisiklet': 'travel', 'haribochamallows': 'food', 'ozatashipyard': 'travel', 'yenisafak': 'health and lifestyle', 'iamsiddeshjadhav': 'travel', 'burcinterzioglu': 'health and lifestyle', 'steakhousegunaydin': 'food', 'baselifeclub': 'health and lifestyle', 'benismailyildirimm': 'health and lifestyle', 'imuneksfarma': 'health and lifestyle', 'dogakoyucatalca': 'food', 'sena.sener': 'entertainment', 'kandilliborsarestaurant': 'food', 'selamiersoyy': 'food', 'deutz_fahr_turkey': 'food', 'cevaheer': 'tech', 'tezatsanat': 'entertainment', 'filtresizcom': 'health and lifestyle', 'palomamarina_suites': 'travel', 'westchocolatemarina': 'food', 'sebnemcapa': 'health and lifestyle', 'rozetsepeti': 'tech', 'ececesmioglu': 'entertainment', 'ustapidecitr': 'health and lifestyle', 'gocaagon