-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
211 lines (170 loc) · 9.13 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# -*- coding: utf-8 -*-
"""preprocessing.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/13PYyafzI4H9Ab8L6ByU-4Tmt0ClWQGLY

**Downloading modules**

In this part of the code, the kalbur module, which is used for finding roots
of turkish words, is downloaded from github. This module is a necessity for
running the code.
"""

# Base directory of the Colab runtime; the kalbur checkout is expected at
# current_path + 'kalbur/'.
current_path = '/content/'

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
import sys
import os

"""The kalbur module returns error because of absolute path defined in the .py script. For this reason, the path in the module named kok_tara(..) in the python script named kelime_bol.py is changed to be current_path/veri/KOKOZLER.txt."""

# Patch kalbur's kelime_bol.py in place so its relative 'veri/' data path
# becomes an absolute path under current_path.
with open(current_path + 'kalbur/kelime_bol.py', 'r') as file:
    filedata = file.read()

# Idempotency guard: only rewrite the file if it has not been patched yet
# (re-running the cell must not double-prefix the path).
if "kalbur/veri/" not in filedata:
    filedata = filedata.replace('veri/', current_path + "kalbur/veri/")
    with open(current_path + 'kalbur/kelime_bol.py', 'w') as file:
        file.write(filedata)

# Make the patched kalbur package importable, then import its word splitter.
sys.path.append(current_path + "kalbur/")
import kelime_bol as kb
"""**Data returning and preprocessing**
The functions defined below are used to return and preprocess clickbait and non-clickbait tweets.
"""
def return_data(csv_files):
    """Load the four tweet CSVs and split them into clickbait / non-clickbait.

    Parameters
    ----------
    csv_files : dict
        Maps the source names "limon", "evrensel", "spoiler", "diken" to CSV
        file paths. Each CSV must contain a ``full_text`` column.

    Returns
    -------
    tuple[list[str], list[str]]
        (clickbait_texts, non_clickbait_texts): limon + spoiler tweets are
        treated as clickbait, evrensel + diken as non-clickbait.
    """
    def _load(name):
        # utf-8-sig tolerates a BOM; dropna discards rows with missing text.
        frame = pd.read_csv(csv_files[name], encoding="utf-8-sig",
                            skip_blank_lines=True).dropna()
        print("# of tweets in " + name + ":", len(frame))
        return frame["full_text"].to_list()

    # Load in the original order so the progress prints stay identical.
    texts = {name: _load(name) for name in ("limon", "evrensel", "spoiler", "diken")}
    return texts["limon"] + texts["spoiler"], texts["evrensel"] + texts["diken"]
def stemmingStep(data):
    """Stem every word of every tweet with kalbur's ``kok_tara``.

    Parameters
    ----------
    data : list[list[str]]
        Tweets as lists of word tokens.

    Returns
    -------
    list[list]
        One list of stems per tweet. When ``kok_tara`` finds no root, the
        token is assumed to be a proper noun with an apostrophe suffix and
        only its alphabetic prefix before the apostrophe is kept; otherwise
        the root found by ``kok_tara`` is kept (its exact type depends on
        kalbur — presumably a string, TODO confirm).
    """
    tweetlist = []
    for tweet in data:
        stems = []
        for word in tweet:
            # Hoisted: kok_tara was previously called twice per word.
            root = kb.kok_tara(word)[1]
            if len(root) == 0 and word:
                # No root found: strip a Turkish possessive suffix written
                # after an apostrophe (ASCII ' or typographic ’).
                if "'" in word:
                    prefix = word.split("'")[0]
                    if prefix.isalpha():
                        stems.append(prefix)
                elif "’" in word:
                    # BUG FIX: the original split on "'" here, so words with
                    # the typographic apostrophe were always dropped.
                    prefix = word.split("’")[0]
                    if prefix.isalpha():
                        stems.append(prefix)
            else:
                # ''.isalpha() is False, so the empty-word case is covered.
                if word.isalpha():
                    stems.append(root)
        tweetlist.append(stems)
    return tweetlist
def count_special_characters(tweetlist, special_characters):
    """Strip and count special characters for every tweet.

    Parameters
    ----------
    tweetlist : list[list[str]]
        Tweets as lists of word tokens.
    special_characters : list[str]
        Single characters to count and remove explicitly (e.g. "#", "?").

    Returns
    -------
    tuple
        (cleaned_tweets, per_tweet_special_counts, per_tweet_other_counts,
        per_tweet_uppercase_counts) where per_tweet_special_counts holds one
        numpy array of length ``len(special_characters)`` per tweet.
    """
    ntweetlist = []
    sp_counts = []
    other_counts = []
    upper_counts = []
    for tweet in tweetlist:
        if not tweet:
            # Empty tweet: emit all-zero statistics and keep it as-is.
            # (Originally this branch duplicated all the bookkeeping code.)
            sp_counts.append(np.array([0] * len(special_characters)))
            other_counts.append(0)
            upper_counts.append(0)
            ntweetlist.append(tweet)
            continue
        per_word_counts = []
        other_total = 0
        upper_total = 0
        cleaned = []
        for word in tweet:
            counts = []
            for s in special_characters:
                counts.append(word.count(s))
                word = word.replace(s, "")  # remove so later chars aren't double-counted
            # Keep alphanumerics, spaces and both apostrophe variants; every
            # other remaining character counts as "other special".
            nword = "".join(c for c in word if c.isalnum() or c in " '’")
            per_word_counts.append(counts)
            other_total += len(word) - len(nword)
            upper_total += sum(c.isupper() for c in nword)
            cleaned.append(nword.lower())
        ntweetlist.append(cleaned)
        # Sum per-word counts into one vector per tweet.
        sp_counts.append(np.sum(np.array(per_word_counts), axis=0))
        other_counts.append(other_total)
        upper_counts.append(upper_total)
    return ntweetlist, sp_counts, other_counts, upper_counts
def calculate_average_tweet_length(tweetlist):
    """Return (mean tweet length in words, list of per-tweet lengths).

    The original conditional ``len(t) if not len(t) == 0 else 0`` was a
    tautology — ``len(t)`` is already 0 for an empty tweet.
    """
    tweet_lengths = [len(tweet) for tweet in tweetlist]
    return np.mean(tweet_lengths), tweet_lengths
def calculate_average_word_length(tweetlist):
    """Return the mean word length per tweet (0 for an empty tweet)."""
    # `if tweet` replaces the unidiomatic `if not len(tweet) == 0`.
    return [np.mean([len(w) for w in tweet]) if tweet else 0
            for tweet in tweetlist]
def remove_selected_words(tweetlist, words_will_be_removed):
    """Drop the given words from every tweet.

    Converts the removal list to a set once so membership tests are O(1)
    instead of O(len(words_will_be_removed)) per word.
    """
    removed = set(words_will_be_removed)
    return [[w for w in wordlist if w not in removed] for wordlist in tweetlist]
def wordspace(tweetlist):
    """Return the vocabulary: the unique words over all tweets, as a list.

    Replaces the quadratic ``sum(list_of_lists, [])`` concatenation with a
    single set comprehension. Like the original, element order is the
    (arbitrary) set iteration order.
    """
    return list({w for wordlist in tweetlist for w in wordlist})
def coding_tweets(tweetlist, unique_word_list, word_indexes):
    """Bag-of-words encode every tweet.

    Parameters
    ----------
    tweetlist : list
        Kept for interface compatibility; the encoding is driven entirely by
        ``word_indexes``.
    unique_word_list : list
        Vocabulary; defines the vector width (plus 2 extra columns —
        presumably reserved for appended features, TODO confirm).
    word_indexes : list[list[int]]
        For each tweet, the vocabulary index of each of its words.

    Returns
    -------
    np.ndarray
        Array of word-count vectors, squeezed to (n_tweets, len(vocab) + 2).
    """
    coded_tweets = []
    for indexes in word_indexes:
        coded = np.zeros((1, len(unique_word_list) + 2))
        for word_idx in indexes:
            # BUG FIX: the original did `coded_tweet[:, idx] += 1` with idx
            # being the TWEET index from enumerate, so every word of tweet i
            # incremented column i instead of the word's vocabulary column.
            coded[:, word_idx] += 1
        coded_tweets.append(coded)
    return np.squeeze(np.array(coded_tweets))
def generatesample(clickbait, non_clickbait,
                   special_characters, words_will_be_removed,
                   isseparate=False, scaling=True,
                   for_data_generator=False):
    """Build the full feature matrix and labels from raw tweet texts.

    Pipeline: tokenize, strip links, count special/uppercase characters,
    compute length statistics, stem, build the vocabulary, then either
    return data-generator inputs, separate (X, Xsc) matrices, or one
    concatenated matrix — always shuffled and (optionally) 80/20 split.

    Parameters: `clickbait` / `non_clickbait` are lists of raw tweet
    strings; `special_characters` the characters counted separately;
    `words_will_be_removed` words excluded from the vocabulary;
    `isseparate` keeps the bag-of-words X apart from the scaled features
    Xsc; `scaling` standardizes the numeric features; `for_data_generator`
    returns indices instead of one-hot matrices.
    """
    # Tokenize on spaces and drop every token containing "http" (links).
    clickbait = [[w for w in c.split(" ") if "http" not in w] for c in clickbait]
    non_clickbait = [[w for w in c.split(" ") if "http" not in w] for c in non_clickbait]
    # Per-class character statistics and length features.
    clickbait, sp_clickbait, osp_clickbait, up_clickbait = count_special_characters(clickbait, special_characters)
    mean_clickbait, len_clickbait = calculate_average_tweet_length(clickbait)
    word_mean_clickbait = calculate_average_word_length(clickbait)
    non_clickbait, sp_non_clickbait, osp_non_clickbait, up_non_clickbait = count_special_characters(non_clickbait, special_characters)
    mean_non_clickbait, len_non_clickbait = calculate_average_tweet_length(non_clickbait)
    word_mean_non_clickbait = calculate_average_word_length(non_clickbait)
    # Concatenate the two classes; clickbait rows come first (labels below
    # rely on this ordering).
    sample = clickbait + non_clickbait
    sp_sample = sp_clickbait + sp_non_clickbait
    osp_sample = osp_clickbait + osp_non_clickbait
    up_sample = up_clickbait + up_non_clickbait
    # NOTE(review): mean_sample is the sum of the two class means and is
    # never used afterwards.
    mean_sample = mean_clickbait + mean_non_clickbait
    len_sample = len_clickbait + len_non_clickbait
    word_mean = word_mean_clickbait + word_mean_non_clickbait
    # Stem, filter suspect words, and index every word into the vocabulary.
    sample = remove_selected_words(stemmingStep(sample), words_will_be_removed)
    unique_word_list = wordspace(sample)
    word_indexes = [[unique_word_list.index(w) for w in tweet] for tweet in sample]
    # Numeric feature block: special-char counts, other-special count,
    # uppercase count, mean word length, tweet length (one row per tweet).
    Xsc = np.c_[np.array(sp_sample),
                np.array(osp_sample).reshape((-1, 1)),
                np.array(up_sample).reshape((-1, 1)),
                np.array(word_mean).reshape((-1, 1)),
                np.array(len_sample).reshape((-1, 1))]
    if scaling:
        # Column-wise standardization (zero mean, unit variance).
        Xsc = scale(Xsc, axis=0)
    # Labels: 1 for clickbait rows, 0 for non-clickbait rows.
    Y = np.append(np.ones(len(clickbait)), np.zeros(len(non_clickbait)))
    if for_data_generator:
        shuffle_index = np.random.permutation(len(Xsc))
        # NOTE(review): np.array on a ragged list of per-tweet index lists
        # raises in NumPy >= 1.24 (it only worked as an object array with a
        # deprecation warning before) — verify against the NumPy version used.
        word_indexes = np.array(word_indexes)
        Xsc = np.array(Xsc)
        Y = np.array(Y)
        return unique_word_list, word_indexes[shuffle_index], Xsc[shuffle_index], Y[shuffle_index]
    # Dense bag-of-words encoding plus an 80/20 test/train split (first
    # fifth of the shuffled rows is the test set).
    X = coding_tweets(sample, unique_word_list, word_indexes)
    sample_size = len(X)
    shuffle_index = np.random.permutation(sample_size)
    if isseparate:
        X, Xsc, Y = X[shuffle_index], Xsc[shuffle_index], Y[shuffle_index]
        X_test, Xsc_test, Y_test = X[:sample_size//5], Xsc[:sample_size//5], Y[:sample_size//5]
        X_train, Xsc_train, Y_train = X[sample_size//5:], Xsc[sample_size//5:], Y[sample_size//5:]
        return X_train, Xsc_train, Y_train, X_test, Xsc_test, Y_test, unique_word_list
    else:
        # Single matrix: word counts concatenated with the numeric features.
        X = np.c_[X, Xsc]
        X, Y = X[shuffle_index], Y[shuffle_index]
        X_test, Y_test = X[:sample_size//5], Y[:sample_size//5]
        X_train, Y_train = X[sample_size//5:], Y[sample_size//5:]
        return X_train, Y_train, X_test, Y_test, unique_word_list
"""return_data(..) which returns the clickbait and non-clickbait data needs a dictionary as a parameter. The dictionary should include keys and respective file's paths."""
"""As a toy example, we select 1000 tweets from clickbait and non-clickbait data.
Additionally, one can define special_characters list for determining special characters that seem to be important for clickbait detection. For these special characters, a separate row is formed for all clickbait detection algorithms.
Features extracted from a tweet:
1. words in the tweet
2. special characters ["#", "?", "!", ".", "@"]
3. other special characters
4. number of uppercase letters
5. average word length
6. average tweet length
Nine additional features to words in the tweet are determined.
words_will_be_removed involves the suspected words that possibly help machine learning algorithms in deciding whether a tweet is clickbait or not. These words can be removed from the dataset so that models cannot exploit this problem.
"""