-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
211 lines (170 loc) · 9.13 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# -*- coding: utf-8 -*-
"""preprocessing.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/13PYyafzI4H9Ab8L6ByU-4Tmt0ClWQGLY

**Downloading modules**

In this part of the code, the kalbur module, which is used for finding roots
of turkish words, is downloaded from github. This module is a necessity for
running the code.
"""

# Base directory of the Colab runtime; the kalbur checkout is expected at
# current_path + 'kalbur/'.
current_path = '/content/'

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
import sys
import os

"""The kalbur module returns error because of absolute path defined in the .py script. For this reason, the path in the module named kok_tara(..) in the python script named kelime_bol.py is changed to be current_path/veri/KOKOZLER.txt."""

# Patch kalbur's kelime_bol.py in place so its relative 'veri/' data path
# becomes an absolute path under current_path.
with open(current_path + 'kalbur/kelime_bol.py', 'r') as file:
    filedata = file.read()

# Idempotency guard: only rewrite the file if it has not been patched yet
# (re-running the cell must not double-prefix the path).
if "kalbur/veri/" not in filedata:
    filedata = filedata.replace('veri/', current_path + "kalbur/veri/")
    with open(current_path + 'kalbur/kelime_bol.py', 'w') as file:
        file.write(filedata)

# Make the patched kalbur package importable, then import its word splitter.
sys.path.append(current_path + "kalbur/")
import kelime_bol as kb
"""**Data returning and preprocessing**
The functions defined below are used to return and preprocess clickbait and non-clickbait tweets.
"""
def return_data(csv_files):
    """Load the four tweet CSVs and split them into clickbait / non-clickbait.

    Parameters
    ----------
    csv_files : dict
        Maps the source names "limon", "evrensel", "spoiler", "diken" to CSV
        file paths. Each CSV must contain a ``full_text`` column.

    Returns
    -------
    tuple[list[str], list[str]]
        (clickbait_texts, non_clickbait_texts): limon + spoiler tweets are
        treated as clickbait, evrensel + diken as non-clickbait.
    """
    def _load(name):
        # utf-8-sig tolerates a BOM; dropna discards rows with missing text.
        frame = pd.read_csv(csv_files[name], encoding="utf-8-sig",
                            skip_blank_lines=True).dropna()
        print("# of tweets in " + name + ":", len(frame))
        return frame["full_text"].to_list()

    # Load in the original order so the progress prints stay identical.
    texts = {name: _load(name) for name in ("limon", "evrensel", "spoiler", "diken")}
    return texts["limon"] + texts["spoiler"], texts["evrensel"] + texts["diken"]
def stemmingStep(data):
    """Stem every word of every tweet with kalbur's ``kok_tara``.

    Parameters
    ----------
    data : list[list[str]]
        Tweets as lists of word tokens.

    Returns
    -------
    list[list]
        One list of stems per tweet. When ``kok_tara`` finds no root, the
        token is assumed to be a proper noun with an apostrophe suffix and
        only its alphabetic prefix before the apostrophe is kept; otherwise
        the root found by ``kok_tara`` is kept (its exact type depends on
        kalbur — presumably a string, TODO confirm).
    """
    tweetlist = []
    for tweet in data:
        stems = []
        for word in tweet:
            # Hoisted: kok_tara was previously called twice per word.
            root = kb.kok_tara(word)[1]
            if len(root) == 0 and word:
                # No root found: strip a Turkish possessive suffix written
                # after an apostrophe (ASCII ' or typographic ’).
                if "'" in word:
                    prefix = word.split("'")[0]
                    if prefix.isalpha():
                        stems.append(prefix)
                elif "’" in word:
                    # BUG FIX: the original split on "'" here, so words with
                    # the typographic apostrophe were always dropped.
                    prefix = word.split("’")[0]
                    if prefix.isalpha():
                        stems.append(prefix)
            else:
                # ''.isalpha() is False, so the empty-word case is covered.
                if word.isalpha():
                    stems.append(root)
        tweetlist.append(stems)
    return tweetlist
def count_special_characters(tweetlist, special_characters):
    """Strip and count special characters for every tweet.

    Parameters
    ----------
    tweetlist : list[list[str]]
        Tweets as lists of word tokens.
    special_characters : list[str]
        Single characters to count and remove explicitly (e.g. "#", "?").

    Returns
    -------
    tuple
        (cleaned_tweets, per_tweet_special_counts, per_tweet_other_counts,
        per_tweet_uppercase_counts) where per_tweet_special_counts holds one
        numpy array of length ``len(special_characters)`` per tweet.
    """
    ntweetlist = []
    sp_counts = []
    other_counts = []
    upper_counts = []
    for tweet in tweetlist:
        if not tweet:
            # Empty tweet: emit all-zero statistics and keep it as-is.
            # (Originally this branch duplicated all the bookkeeping code.)
            sp_counts.append(np.array([0] * len(special_characters)))
            other_counts.append(0)
            upper_counts.append(0)
            ntweetlist.append(tweet)
            continue
        per_word_counts = []
        other_total = 0
        upper_total = 0
        cleaned = []
        for word in tweet:
            counts = []
            for s in special_characters:
                counts.append(word.count(s))
                word = word.replace(s, "")  # remove so later chars aren't double-counted
            # Keep alphanumerics, spaces and both apostrophe variants; every
            # other remaining character counts as "other special".
            nword = "".join(c for c in word if c.isalnum() or c in " '’")
            per_word_counts.append(counts)
            other_total += len(word) - len(nword)
            upper_total += sum(c.isupper() for c in nword)
            cleaned.append(nword.lower())
        ntweetlist.append(cleaned)
        # Sum per-word counts into one vector per tweet.
        sp_counts.append(np.sum(np.array(per_word_counts), axis=0))
        other_counts.append(other_total)
        upper_counts.append(upper_total)
    return ntweetlist, sp_counts, other_counts, upper_counts
def calculate_average_tweet_length(tweetlist):
    """Return (mean tweet length in words, list of per-tweet lengths).

    The original conditional ``len(t) if not len(t) == 0 else 0`` was a
    tautology — ``len(t)`` is already 0 for an empty tweet.
    """
    tweet_lengths = [len(tweet) for tweet in tweetlist]
    return np.mean(tweet_lengths), tweet_lengths
def calculate_average_word_length(tweetlist):
    """Return the mean word length per tweet (0 for an empty tweet)."""
    # `if tweet` replaces the unidiomatic `if not len(tweet) == 0`.
    return [np.mean([len(w) for w in tweet]) if tweet else 0
            for tweet in tweetlist]
def remove_selected_words(tweetlist, words_will_be_removed):
    """Drop the given words from every tweet.

    Converts the removal list to a set once so membership tests are O(1)
    instead of O(len(words_will_be_removed)) per word.
    """
    removed = set(words_will_be_removed)
    return [[w for w in wordlist if w not in removed] for wordlist in tweetlist]
def wordspace(tweetlist):
    """Return the vocabulary: the unique words over all tweets, as a list.

    Replaces the quadratic ``sum(list_of_lists, [])`` concatenation with a
    single set comprehension. Like the original, element order is the
    (arbitrary) set iteration order.
    """
    return list({w for wordlist in tweetlist for w in wordlist})
def coding_tweets(tweetlist, unique_word_list, word_indexes):
    """Bag-of-words encode every tweet.

    Parameters
    ----------
    tweetlist : list
        Kept for interface compatibility; the encoding is driven entirely by
        ``word_indexes``.
    unique_word_list : list
        Vocabulary; defines the vector width (plus 2 extra columns —
        presumably reserved for appended features, TODO confirm).
    word_indexes : list[list[int]]
        For each tweet, the vocabulary index of each of its words.

    Returns
    -------
    np.ndarray
        Array of word-count vectors, squeezed to (n_tweets, len(vocab) + 2).
    """
    coded_tweets = []
    for indexes in word_indexes:
        coded = np.zeros((1, len(unique_word_list) + 2))
        for word_idx in indexes:
            # BUG FIX: the original did `coded_tweet[:, idx] += 1` with idx
            # being the TWEET index from enumerate, so every word of tweet i
            # incremented column i instead of the word's vocabulary column.
            coded[:, word_idx] += 1
        coded_tweets.append(coded)
    return np.squeeze(np.array(coded_tweets))
def generatesample(clickbait, non_clickbait,
                   special_characters, words_will_be_removed,
                   isseparate=False, scaling=True,
                   for_data_generator=False):
    """Build the full feature matrix and labels from raw tweet texts.

    Pipeline: tokenize, strip links, count special/uppercase characters,
    compute length statistics, stem, build the vocabulary, then either
    return data-generator inputs, separate (X, Xsc) matrices, or one
    concatenated matrix — always shuffled and (optionally) 80/20 split.

    Parameters: `clickbait` / `non_clickbait` are lists of raw tweet
    strings; `special_characters` the characters counted separately;
    `words_will_be_removed` words excluded from the vocabulary;
    `isseparate` keeps the bag-of-words X apart from the scaled features
    Xsc; `scaling` standardizes the numeric features; `for_data_generator`
    returns indices instead of one-hot matrices.
    """
    # Tokenize on spaces and drop every token containing "http" (links).
    clickbait = [[w for w in c.split(" ") if "http" not in w] for c in clickbait]
    non_clickbait = [[w for w in c.split(" ") if "http" not in w] for c in non_clickbait]
    # Per-class character statistics and length features.
    clickbait, sp_clickbait, osp_clickbait, up_clickbait = count_special_characters(clickbait, special_characters)
    mean_clickbait, len_clickbait = calculate_average_tweet_length(clickbait)
    word_mean_clickbait = calculate_average_word_length(clickbait)
    non_clickbait, sp_non_clickbait, osp_non_clickbait, up_non_clickbait = count_special_characters(non_clickbait, special_characters)
    mean_non_clickbait, len_non_clickbait = calculate_average_tweet_length(non_clickbait)
    word_mean_non_clickbait = calculate_average_word_length(non_clickbait)
    # Concatenate the two classes; clickbait rows come first (labels below
    # rely on this ordering).
    sample = clickbait + non_clickbait
    sp_sample = sp_clickbait + sp_non_clickbait
    osp_sample = osp_clickbait + osp_non_clickbait
    up_sample = up_clickbait + up_non_clickbait
    # NOTE(review): mean_sample is the sum of the two class means and is
    # never used afterwards.
    mean_sample = mean_clickbait + mean_non_clickbait
    len_sample = len_clickbait + len_non_clickbait
    word_mean = word_mean_clickbait + word_mean_non_clickbait
    # Stem, filter suspect words, and index every word into the vocabulary.
    sample = remove_selected_words(stemmingStep(sample), words_will_be_removed)
    unique_word_list = wordspace(sample)
    word_indexes = [[unique_word_list.index(w) for w in tweet] for tweet in sample]
    # Numeric feature block: special-char counts, other-special count,
    # uppercase count, mean word length, tweet length (one row per tweet).
    Xsc = np.c_[np.array(sp_sample),
                np.array(osp_sample).reshape((-1, 1)),
                np.array(up_sample).reshape((-1, 1)),
                np.array(word_mean).reshape((-1, 1)),
                np.array(len_sample).reshape((-1, 1))]
    if scaling:
        # Column-wise standardization (zero mean, unit variance).
        Xsc = scale(Xsc, axis=0)
    # Labels: 1 for clickbait rows, 0 for non-clickbait rows.
    Y = np.append(np.ones(len(clickbait)), np.zeros(len(non_clickbait)))
    if for_data_generator:
        shuffle_index = np.random.permutation(len(Xsc))
        # NOTE(review): np.array on a ragged list of per-tweet index lists
        # raises in NumPy >= 1.24 (it only worked as an object array with a
        # deprecation warning before) — verify against the NumPy version used.
        word_indexes = np.array(word_indexes)
        Xsc = np.array(Xsc)
        Y = np.array(Y)
        return unique_word_list, word_indexes[shuffle_index], Xsc[shuffle_index], Y[shuffle_index]
    # Dense bag-of-words encoding plus an 80/20 test/train split (first
    # fifth of the shuffled rows is the test set).
    X = coding_tweets(sample, unique_word_list, word_indexes)
    sample_size = len(X)
    shuffle_index = np.random.permutation(sample_size)
    if isseparate:
        X, Xsc, Y = X[shuffle_index], Xsc[shuffle_index], Y[shuffle_index]
        X_test, Xsc_test, Y_test = X[:sample_size//5], Xsc[:sample_size//5], Y[:sample_size//5]
        X_train, Xsc_train, Y_train = X[sample_size//5:], Xsc[sample_size//5:], Y[sample_size//5:]
        return X_train, Xsc_train, Y_train, X_test, Xsc_test, Y_test, unique_word_list
    else:
        # Single matrix: word counts concatenated with the numeric features.
        X = np.c_[X, Xsc]
        X, Y = X[shuffle_index], Y[shuffle_index]
        X_test, Y_test = X[:sample_size//5], Y[:sample_size//5]
        X_train, Y_train = X[sample_size//5:], Y[sample_size//5:]
        return X_train, Y_train, X_test, Y_test, unique_word_list
"""return_data(..) which returns the clickbait and non-clickbait data needs a dictionary as a parameter. The dictionary should include keys and respective file's paths."""
"""As a toy example, we select 1000 tweets from clickbait and non-clickbait data.
Additionally, one can define special_characters list for determining special characters that seem to be important for clickbait detection. For these special characters, a separate row is formed for all clickbait detection algorithms.
Features extracted from a tweet:
1. words in the tweet
2. special characters ["#", "?", "!", ".", "@"]
3. other special characters
4. number of uppercase letters
5. average word length
6. average tweet length
Nine additional features to words in the tweet are determined.
words_will_be_removed involves the suspected words that possibly help machine learning algorithms in deciding whether a tweet is clickbait or not. These words can be removed from the dataset so that models cannot exploit this problem.
"""