<a href="https://colab.research.google.com/github/cihankaradogan/Twitter-Personality-Prediction/blob/main/prediction_gui.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment setup & function definitions


## Environment Setup

In [None]:
# Install the notebook's dependencies.
# transformers, turkish-twitter-preprocess and torch are pinned to exact
# versions; nltk, snscrape and emoji are unpinned, so they resolve to whatever
# release is current at install time.
!pip install transformers==3.5.1
!pip install turkish-twitter-preprocess==0.0.7
!pip install nltk
!pip install snscrape
!pip install emoji
!pip install torch==1.7.1



In [None]:
import torch
from google.colab import drive
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from transformers import BertTokenizer
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

In [None]:
# Mount Google Drive at /content/drive; the model checkpoint paths used in this
# notebook all live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Select the CUDA device used for inference.
# The check goes through PyTorch itself rather than TensorFlow's device list:
# the batches are moved to `device` with torch, so a GPU that only TensorFlow
# can see would pass a TensorFlow-based check and then fail on the torch calls.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU:', torch.cuda.get_device_name(0))
else:
    # Exception type kept as in the original cell so any surrounding handling
    # keeps working.
    raise SystemError('GPU device not found')

GPU: Tesla K80


## Loading models
### There are currently 10 models
#### 5 criteria classifiers and 5 Big Five personality-trait classifiers

In [None]:
# Checkpoint paths and loaded models: five criteria classifiers followed by
# five personality-trait classifiers. The `#@param` markers expose each path as
# an editable Colab form field, so they must stay on the assignment lines.
#
# map_location=device places every model on the device that predict() moves
# its batches to, regardless of the device the checkpoint was saved from.
#
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code. Only point these paths at checkpoints you trust.

# MUHIKT => opposition vs. ruling party
# DUYGUDURUM => emotional orientation (satisfaction with life, pessimism)
# GELIR => income level, evaluated in 4 separate categories
# ALKOL => attitude towards alcohol
# DIN => attitude towards religion
PATH_MUHIKT = '/content/drive/MyDrive/modeller/iktidar_muhalefet.pt' #@param {type:"string"}
muhikt_model = torch.load(PATH_MUHIKT, map_location=device)

PATH_DUYGUDURUM = '/content/drive/MyDrive/modeller/duygudurum.pt' #@param {type:"string"}
duygudurum_model = torch.load(PATH_DUYGUDURUM, map_location=device)

PATH_GELIR = '/content/drive/MyDrive/modeller/gelir.pt' #@param {type:"string"}
gelir_model = torch.load(PATH_GELIR, map_location=device)

PATH_ALKOL = '/content/drive/MyDrive/modeller/alkol.pt' #@param {type:"string"}
alkol_model = torch.load(PATH_ALKOL, map_location=device)

PATH_DIN = '/content/drive/MyDrive/modeller/din.pt' #@param {type:"string"}
din_model = torch.load(PATH_DIN, map_location=device)

# AGREE, CONS, OPEN, NEVRO, EXTRA
PATH_AGREE = '/content/drive/MyDrive/modeller/Agreeableness_28k_9epoch.pt' #@param {type:"string"}
agree_model = torch.load(PATH_AGREE, map_location=device)

PATH_CONS = '/content/drive/MyDrive/modeller/conscientiousness.pt' #@param {type:"string"}
cons_model = torch.load(PATH_CONS, map_location=device)

PATH_OPEN = '/content/drive/MyDrive/modeller/Openness_28k.pt' #@param {type:"string"}
open_model = torch.load(PATH_OPEN, map_location=device)

PATH_NEVRO = '/content/drive/MyDrive/modeller/nevro.pt' #@param {type:"string"}
nevro_model = torch.load(PATH_NEVRO, map_location=device)

PATH_EXTRA = '/content/drive/MyDrive/modeller/extraversion.pt' #@param {type:"string"}
extra_model = torch.load(PATH_EXTRA, map_location=device)

## Function definitions

In [None]:
import sys
import ttp
import nltk
import json
import itertools
import snscrape.modules.twitter as sntwitter

In [None]:
def save_tweets_with_query(query, n_tweets, stopwords, filename, save_to_file):
    """Scrape up to ``n_tweets`` tweets matching ``query`` and preprocess them.

    Each tweet's content is passed through ``ttp.preprocess_sentence`` together
    with ``stopwords``. A running counter is written to stdout while scraping.

    Args:
        query: Search string handed to ``sntwitter.TwitterSearchScraper``.
        n_tweets: Maximum number of items taken from the scraper.
        stopwords: Forwarded as the second argument of
            ``ttp.preprocess_sentence``.
        filename: Path of the JSON file written when ``save_to_file`` is truthy.
        save_to_file: When truthy, dump the collected records to ``filename``.

    Returns:
        A list of ``{"sentence": ..., "date": ...}`` dicts, one per scraped
        tweet, in the order the scraper yielded them.
    """
    progress = sys.stdout
    scraped_items = sntwitter.TwitterSearchScraper(query).get_items()
    records = []
    for position, tweet in enumerate(itertools.islice(scraped_items, n_tweets), start=1):
        records.append({
            "sentence": ttp.preprocess_sentence(tweet.content, stopwords),
            "date": str(tweet.date),
        })
        # The trailing carriage return keeps the counter on one console line.
        progress.write('Scraped tweets: %s\r' % position)
        progress.flush()

    if save_to_file:
        with open(filename, 'w', encoding='utf-8') as outfile:
            json.dump(records, outfile, ensure_ascii=False)

    print('')
    print('Done!')
    return records

In [None]:
from tqdm import tqdm
from tabulate import tabulate

def print_tabulate(table_values):
  """Print ``table_values`` as a table with name / percentage / count headers.

  Args:
    table_values: Rows handed to ``tabulate``; each row is expected to line up
      with the three headers below.
  """
  print(tabulate(table_values, headers=['name', 'perc(%)', 'count']))

def _count_matches(df, column_name, target_val):
  """Return ``(matches, total)`` for rows whose ``column_name`` equals ``target_val``."""
  matches = int((df[column_name] == target_val).sum())
  return matches, len(df)


def _format_percentage(count, total):
  """Format ``count / total`` as a percentage with one decimal, e.g. ``'74.7%'``.

  An empty population (``total == 0``) is reported as ``'0.0%'`` instead of
  raising ``ZeroDivisionError``.
  """
  share = 100.0 * count / total if total else 0.0
  return f"{share:.1f}%"


def print_percentage(df, first_criteria, second_criteria, first_criteria_val, column_name):
  """Print a two-way split of ``df[column_name]`` as a table.

  Rows whose value equals ``first_criteria_val`` are counted under
  ``first_criteria``; every other row is counted under ``second_criteria``.

  Percentages are rounded to one decimal place. The previous implementation
  sliced the string form of the float, which truncated instead of rounding and
  produced malformed values such as ``'100.%'`` or ``'5.%'``.

  Args:
    df: DataFrame holding the predictions.
    first_criteria: Display name for rows matching ``first_criteria_val``.
    second_criteria: Display name for all remaining rows.
    first_criteria_val: Value in ``column_name`` that selects the first group.
    column_name: Column of ``df`` to inspect.
  """
  first_count, total_count = _count_matches(df, column_name, first_criteria_val)
  second_count = total_count - first_count
  # Blank line keeps consecutive tables visually separated.
  print()
  print_tabulate([
      [first_criteria, _format_percentage(first_count, total_count), first_count],
      [second_criteria, _format_percentage(second_count, total_count), second_count],
      ['total', '100%', total_count],
  ])


def return_percentage_bigfive(df, first_criteria, first_criteria_val, column_name):
  """Build one table row with the share of rows equal to ``first_criteria_val``.

  Args:
    df: DataFrame holding the predictions.
    first_criteria: Display name placed in the first cell of the row.
    first_criteria_val: Value in ``column_name`` that is counted.
    column_name: Column of ``df`` to inspect.

  Returns:
    ``[first_criteria, '<percentage>%', count]`` with the percentage rounded to
    one decimal place (``'0.0%'`` for an empty frame).
  """
  first_count, total_count = _count_matches(df, column_name, first_criteria_val)
  return [first_criteria, _format_percentage(first_count, total_count), first_count]

In [None]:
from operator import itemgetter
def add_count_to_map(map, col_name):
  """Increment the tally for ``col_name`` and the running ``'total'`` in ``map``.

  Missing keys, including ``'total'``, start from zero, so the caller no longer
  has to pre-seed the dictionary to avoid a ``KeyError``.

  Args:
    map: Dictionary of tallies; it is updated in place.
    col_name: Key whose count is incremented.

  Returns:
    The same ``map`` object, for convenience.
  """
  map[col_name] = map.get(col_name, 0) + 1
  map['total'] = map.get('total', 0) + 1
  return map

def calc_perc(map, key):
  """Return ``map[key]`` as a percentage of ``map['total']``, e.g. ``'%15.6'``.

  The value is rounded to one decimal place. The previous implementation kept
  the first four characters of the float's string form, which turned 100.0
  into ``'%100.'`` and truncated rather than rounded. An empty tally
  (``map['total'] == 0``) yields ``'%0.0'`` instead of raising
  ``ZeroDivisionError``.

  Args:
    map: Dictionary of tallies that contains a ``'total'`` entry.
    key: Entry whose share is computed.

  Returns:
    The percentage as a string with a leading ``%`` sign.
  """
  total_count = map['total']
  share = 100.0 * map[key] / total_count if total_count else 0.0
  return f"%{share:.1f}"

def print_percentage_multi(df, map, column_name):
  """Print a multi-category breakdown of ``df[column_name]`` as a table.

  Every row's value is translated to a display name through ``map`` and
  tallied. The table lists each name, and the running total, with its
  percentage and count, sorted by ascending count.

  Args:
    df: DataFrame holding the predictions.
    map: Dictionary from raw column values to display names.
    column_name: Column of ``df`` to inspect.
  """
  tallies = {'total': 0}
  for _, record in tqdm(df.iterrows()):
    label = map[record[column_name]]
    add_count_to_map(tallies, label)
  print()

  # sorted() is stable, so rows with equal counts keep their insertion order.
  rows = sorted(
      ([name, calc_perc(tallies, name), tallies[name]] for name in tallies),
      key=itemgetter(2),
  )
  print(tabulate(rows, headers=['name', 'perc(%)', 'count']))

In [None]:
def predict(model, prediction_dataloader):
  """Run ``model`` over every batch and collect the first output element.

  The model is switched to eval mode and called without gradient tracking.
  Each batch must hold exactly two tensors (input ids, attention mask); both
  are moved to the module-level ``device`` before the forward pass.

  Args:
    model: Callable model accepting ``(input_ids, token_type_ids=...,
      attention_mask=...)`` and returning an indexable output.
    prediction_dataloader: Iterable of ``(input_ids, attention_mask)`` batches.

  Returns:
    A list with one NumPy array per batch, taken from ``outputs[0]``.
  """
  print('Prediction started on test data')
  model.eval()

  collected = []
  for raw_batch in prediction_dataloader:
    input_ids, input_mask = (tensor.to(device) for tensor in raw_batch)

    with torch.no_grad():
      model_output = model(input_ids,
                           token_type_ids=None,
                           attention_mask=input_mask)

    # Detach and copy to host memory so the result can be used as a NumPy array.
    collected.append(model_output[0].detach().cpu().numpy())

  print('Prediction completed')
  return collected

In [None]:
def map_predictions_to_df(df, predictions, column_str):
  """Store the arg-max label of every prediction row in ``df[column_str]``.

  Args:
    df: DataFrame that receives the new column; it is modified in place.
    predictions: Sequence of 2-D score arrays, one per batch. The label of a
      row is the index of its largest score along axis 1.
    column_str: Name of the column to create or overwrite.

  Returns:
    The same ``df`` object with the label column assigned.
  """
  df[column_str] = [
      label
      for batch_scores in predictions
      for label in np.argmax(batch_scores, axis=1).flatten()
  ]
  return df

In [None]:
def _encode_sentences(sentences, max_len=250):
  """Tokenize ``sentences`` into stacked input-id and attention-mask tensors.

  Every sentence is padded or truncated to ``max_len`` tokens with the
  module-level ``tokenizer``. ``padding='max_length'`` replaces the deprecated
  ``pad_to_max_length=True`` flag.
  """
  input_ids = []
  attention_masks = []
  for text in sentences:
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])
  return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


def predict_query(query, n_tweets):
  """Scrape tweets for ``query``, classify them with every model, print summaries.

  Steps: fetch and preprocess up to ``n_tweets`` tweets, drop duplicate
  sentences, tokenize, run the ten loaded models, store each model's label in
  its own column, then print the percentage tables.

  Args:
    query: Search string forwarded to ``save_tweets_with_query``.
    n_tweets: Maximum number of tweets to fetch.

  Returns:
    A DataFrame with ``sentence``, ``date`` and one label column per model
    (``muhikt``, ``gelir``, ``duygu``, ``alkol``, ``din``, ``agree``, ``cons``,
    ``extra``, ``nevro``, ``open``). When the query yields no tweets, an empty
    DataFrame with only ``sentence`` and ``date`` columns is returned and no
    model is run.
  """
  print('Started fetching...')
  data = save_tweets_with_query(query, n_tweets, stop_word_list, "na", False)
  if not data:
    # Without this guard the de-duplication and torch.cat steps below raise on
    # an empty result set.
    print('No tweets found for query; nothing to predict.')
    return pd.DataFrame(columns=['sentence', 'date'])

  local_df = pd.DataFrame(data).drop_duplicates(['sentence'])
  print('Started formatting...')

  input_ids, attention_masks = _encode_sentences(local_df.sentence.values)

  batch_size = 32
  prediction_data = TensorDataset(input_ids, attention_masks)
  prediction_dataloader = DataLoader(
      prediction_data,
      sampler=SequentialSampler(prediction_data),  # keep row order aligned with local_df
      batch_size=batch_size,
  )

  print('Started predicting...')
  # Insertion order defines the column order of the returned frame.
  models_by_column = {
      'muhikt': muhikt_model,
      'gelir': gelir_model,
      'duygu': duygudurum_model,
      'alkol': alkol_model,
      'din': din_model,
      'agree': agree_model,
      'cons': cons_model,
      'extra': extra_model,
      'nevro': nevro_model,
      'open': open_model,
  }
  for column, model in models_by_column.items():
    map_predictions_to_df(local_df, predict(model, prediction_dataloader), column)

  print('Finished and printing results...')
  print_percentage(local_df, 'muhalefet', 'iktidar', 1, 'muhikt')
  print_percentage(local_df, 'karamsar', 'memnun', 0, 'duygu')
  print_percentage(local_df, 'alkol-olumlu', 'alkol-olumsuz', 0, 'alkol')
  print_percentage(local_df, 'din-olumlu', 'din-olumsuz', 0, 'din')
  print_percentage_multi(
      local_df,
      {0: 'işçi', 1: 'memur', 2: 'tüccar', 3: 'öğrenci'},
      'gelir',
  )

  bigfive_result_list = [
      return_percentage_bigfive(local_df, domain, 1, domain)
      for domain in ['agree', 'cons', 'extra', 'nevro', 'open']
  ]
  print_tabulate(bigfive_result_list)

  print(f"{query} - {len(local_df)}")
  return local_df

## Loading the tokenizer and Turkish stop words

In [None]:
# Identifier passed to BertTokenizer.from_pretrained; the `#@param` marker
# exposes it as an editable Colab form field.
finetuned = 'dbmdz/bert-base-turkish-128k-uncased' #@param {type:"string"}
# NOTE(review): do_lower_case=True presumably matches the "uncased" checkpoint
# name — confirm against the tokenizer the models were trained with.
tokenizer = BertTokenizer.from_pretrained(finetuned, do_lower_case=True)
# Download NLTK's stopwords corpus and keep the Turkish list; predict_query
# passes it to save_tweets_with_query for tweet preprocessing.
nltk.download('stopwords')
stop_word_list = nltk.corpus.stopwords.words('turkish')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Prototype

In [None]:
# Placeholder account handle; replace it (or edit the Colab form field) before
# running the cell.
username = 'Twitter Username' #@param {type:"string"}
# Build a search query for that account (`from:`), restricted with `lang:tr`
# and `-filter:replies`, fetch at most 2500 tweets and run every model on them.
result = predict_query(f"(from:{username}) lang:tr -filter:replies", 2500)

Started fetching...

Done!
Started formatting...




Started predicting...
Prediction started on test data
Prediction completed
Prediction started on test data
Prediction completed
Prediction started on test data
Prediction completed
Prediction started on test data
Prediction completed
Prediction started on test data
Prediction completed
Prediction started on test data
Prediction completed
Prediction started on test data
Prediction completed
Prediction started on test data
Prediction completed
Prediction started on test data
Prediction completed
Prediction started on test data
Prediction completed
Finished and printing results...


2462it [00:00, 8765.75it/s]



name       perc(%)      count
---------  ---------  -------
muhalefet  74.6%         1838
iktidar    25.3%          624
total      100%          2462


2462it [00:00, 7904.59it/s]



name      perc(%)      count
--------  ---------  -------
karamsar  27.7%          683
memnun    72.2%         1779
total     100%          2462


2462it [00:00, 8409.94it/s]



name           perc(%)      count
-------------  ---------  -------
alkol-olumlu   40.8%         1005
alkol-olumsuz  59.1%         1457
total          100%          2462


2462it [00:00, 8196.39it/s]



name         perc(%)      count
-----------  ---------  -------
din-olumlu   26.4%          652
din-olumsuz  73.5%         1810
total        100%          2462


2462it [00:00, 7936.98it/s]



name     perc(%)      count
-------  ---------  -------
öğrenci  %15.6          385
tüccar   %22.0          543
işçi     %30.1          742
memur    %32.1          792
total    %100.         2462


2462it [00:00, 8493.39it/s]





2462it [00:00, 8862.70it/s]





2462it [00:00, 8703.89it/s]





2462it [00:00, 9053.29it/s]





2462it [00:00, 8470.76it/s]


name    perc(%)      count
------  ---------  -------
agree   39.6%          975
cons    83.1%         2046
extra   88.2%         2173
nevro   11.9%          294
open    37.2%          918
(from:Twitter Username) lang:tr -filter:replies - 2462





In [None]:
# Display the per-tweet prediction frame returned by predict_query.
result

Unnamed: 0,sentence,date,muhikt,gelir,duygu,alkol,din,agree,cons,extra,nevro,open
0,pazar günü bulgaristan’da gerçekleşecek olan s...,2021-11-12 12:03:48+00:00,1,2,1,1,0,0,1,1,0,0
1,gönlümüzde biriken tüm umutlarımızı yeşert ala...,2021-11-12 07:31:15+00:00,0,1,1,1,0,0,0,0,1,0
2,ankara büyükşehir belediye başkanımız kıymetli...,2021-11-10 08:56:21+00:00,1,2,1,1,0,1,1,1,0,0
3,türk miletinin fikrine kalbine ve ruhuna işlen...,2021-11-09 17:34:31+00:00,1,1,1,1,1,1,1,1,0,0
4,tbm grubu toplantımızdayız,2021-11-09 06:47:03+00:00,1,1,1,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2495,sadece fransızların değil tüm insanlığın ortak...,2019-04-16 06:20:30+00:00,1,2,1,1,1,0,1,1,0,0
2496,türk milî takımı formasını hem basketbol hem d...,2019-04-12 16:33:28+00:00,1,2,1,1,1,0,1,0,0,0
2497,mesele tarih ise kanuni nin fransuva ya yazdığ...,2019-04-12 16:11:20+00:00,0,2,1,1,0,1,1,1,0,0
2498,rabim birliğimizi ve beraberliğimizi daim etsi...,2019-04-12 05:30:00+00:00,0,0,1,1,0,1,1,1,0,0
