In [None]:
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to /home/alice/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
pd.options.mode.chained_assignment = None  # default='warn'
file_path = '../data/blogset-without-duplicate.csv'

In [None]:
def remove_html_tags(text):
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [None]:
def remove_stopwords(text):
    all_words = text.split(" ")
    clean_text = [i for i in all_words if i not in stopwords and i!=""]
    return " ".join(clean_text)

In [None]:
def clean_text(x):
  return " ".join(str(x.translate(str.maketrans('', '', string.punctuation))).replace("#", "").split()).lower()

In [None]:
def select_abs(col, perc):
  n = int(len(df_coef) * perc)

  df_coef_1 = df_coef.copy()
  df_coef_1 = df_coef_1[[col]]

  df_coef_1['abs_values'] = df_coef_1[col].abs()

  sorted_df = df_coef_1.sort_values(by='abs_values', ascending=False)

  return sorted_df.head(n)

In [None]:
df = pd.read_csv(file_path)

df['Clean_Texts'] = df["Texts"].apply(lambda x: remove_html_tags(x))
df["Clean_Texts"] = df["Clean_Texts"].apply(lambda x: clean_text(x))
df["Clean_Texts"] = df["Clean_Texts"].apply(lambda x: remove_stopwords(x))

print(df.shape)
df = df[df['Clean_Texts']!='']
print(df.shape)

y = df["Age"]

df_train, df_test = train_test_split(df, test_size=0.2, random_state=115, stratify=y)

y = df["Age"]

X_train = df_train[['AuthorID', 'Clean_Texts']]
y_train = df_train["Age"].to_numpy()

X_test = df_test[['AuthorID', 'Clean_Texts']]
y_test = df_test["Age"].to_numpy()

(2537, 6)
(2494, 6)


In [None]:
unique_words = list(set(" ".join(X_train["Clean_Texts"].to_list()).split()))

In [None]:
threshold = X_train.shape[0]*0.01

In [None]:
word_counts = X_train['Clean_Texts'].str.split().explode().value_counts()

In [None]:
row_counts = pd.DataFrame({'word': word_counts.index, 'count': word_counts.values})

In [None]:
#row_counts['count'] = row_counts['word'].apply(lambda word: X_train['Clean_Texts'].str.contains(word).sum())
words = row_counts['word'].tolist()

# Create a dictionary to store the counts for each word
word_count_dict = {word: X_train['Clean_Texts'].str.contains(word).sum() for word in words}

# Convert the dictionary to a DataFrame or add it back to row_counts
row_counts['count'] = row_counts['word'].map(word_count_dict)

In [None]:
selected_words = row_counts[row_counts['count']>threshold]['word'].to_list()

In [None]:
len(selected_words)

42215

In [None]:
with open('selected_words.txt', 'w') as f:
    for line in selected_words:
        f.write("%s\n" % line)

In [None]:
text = X_train['Clean_Texts'].to_list()
vectorizer = TfidfVectorizer(vocabulary=selected_words)
train_matrix = vectorizer.fit_transform(text)
train_matrix = train_matrix.toarray()
vocab = vectorizer.get_feature_names_out()

tf_idf = pd.DataFrame(data=train_matrix, columns=selected_words)

In [None]:
svc = SVC(kernel='linear', decision_function_shape='ovr')
svc.fit(tf_idf, y_train)

In [None]:
df_coef = pd.DataFrame(data=svc.coef_, columns=tf_idf.columns)
df_coef = df_coef.T

coef1 = select_abs(0, 1)
coef2 = select_abs(1, 1)
coef3 = select_abs(2, 1)

In [None]:
df_train.head()

Unnamed: 0,AuthorID,Qual sua idade?,Qual seu sexo?,Texts,Age,Clean_Texts
1926,4314544603640183693,10 à 18 anos,Masculino,\n\n\n\n\n\nGeralmente pensa-se que as pessoas...,0,geralmente pensase pessoas tentam compreender ...
1992,15723770165958579349,26 à 30 anos,Masculino,"\nAspecto essencial da prática maçônica, do di...",1,aspecto essencial prática maçônica dia dia maç...
393,18322716766605752197,19 à 25 anos,Feminino,\n\n\n&nbsp;No dia 25/06/2012 – foi recolhido\...,0,nbspno dia 25062012 – recolhido pátio barracão...
1286,10855576236842125784,30 à 40 anos,Feminino,\n\n\nOlá!\n\nHoje falarei um pouco deste trab...,1,olá hoje falarei pouco deste trabalho maravilh...
2517,10544620546238483538,40 à 60 anos,Masculino,"A Secretaria da Educação do Estado realiza, de...",2,secretaria educação estado realiza 5 10 julho ...


In [None]:
df_train.to_csv('../data/train.csv')

In [None]:
df_test.head()

Unnamed: 0,AuthorID,Qual sua idade?,Qual seu sexo?,Texts,Age,Clean_Texts
249,10669220345798355617,40 à 60 anos,Masculino,\n\nLINKS PARA DOWNLOAD DE LIVROS DE BOTÂNICA ...,2,links download livros botânica ecologia pdf di...
945,16529549447616554966,40 à 60 anos,Masculino,\n\n\n\n\n\n\n\nConservatório Musical é inaugu...,2,conservatório musical inaugurado paranaguá pri...
1033,11146739159556680933,40 à 60 anos,Masculino,Home\n\nCampeonato\nFutsal Interbairros Guaíra...,2,home campeonato futsal interbairros guaíra sp ...
1881,3794287388457210312,30 à 40 anos,Masculino,\nDocumentário de Clementino Junior com o ex-m...,1,documentário clementino junior exmestresala ma...
1905,4191582511992557050,10 à 18 anos,Feminino,Ganhei este selinho personalizado da minha que...,0,ganhei selinho personalizado querida amigasuel...


In [None]:
df_test.to_csv('../data/test.csv')

In [None]:
df_train['glex1']=0
df_train['glex2']=0
df_train['glex3']=0

df_train.reset_index(inplace=True)

for index, row in df_train.iterrows():
    sum1 = 0
    text = row["Clean_Texts"].split()
    for w in text:
        if w in coef1.index.tolist():
          sum1 += coef1[0][w]*tf_idf[w][index]
    sum1 = sum1 + svc.intercept_[0]
    df_train[f'glex1'][index] = sum1

for index, row in df_train.iterrows():
    sum1 = 0
    text = row["Clean_Texts"].split()
    for w in text:
        if w in coef2.index.tolist():
          sum1 += coef2[1][w]*tf_idf[w][index]
    sum1 = sum1 + svc.intercept_[1]
    df_train[f'glex2'][index] = sum1

for index, row in df_train.iterrows():
    sum1 = 0
    text = row["Clean_Texts"].split()
    for w in text:
        if w in coef3.index.tolist():
          sum1 += coef3[2][w]*tf_idf[w][index]
    sum1 = sum1 + svc.intercept_[2]
    df_train[f'glex3'][index] = sum1

In [None]:
df_train.groupby(['Age'])[['glex1', 'glex2', 'glex3']].describe().T

Unnamed: 0,Age,0,1,2
glex1,count,519.0,781.0,695.0
glex1,mean,2.478223,-28.710805,-49.244245
glex1,std,45.44536,130.117885,267.792819
glex1,min,-385.777662,-2246.089716,-5820.55919
glex1,25%,1.940523,-7.958347,-8.000902
glex1,50%,5.37525,-3.42584,-1.099203
glex1,75%,11.219068,-1.169891,0.889272
glex1,max,166.032492,299.884886,295.723844
glex2,count,519.0,781.0,695.0
glex2,mean,-5.231571,-35.473249,-81.477835


In [None]:
df_train['c'] = -1

for index, row in df_train.iterrows():
    if row['glex1'] > 0 and row['glex2'] > 0 and row['glex3'] > 0:
      df_train['c'][index] = 0
    elif row['glex1'] < 0 and row['glex2'] > 0 and row['glex3'] > 0:
      df_train['c'][index] = 1
    elif row['glex1'] < 0 and row['glex2'] < 0 and row['glex3'] < 0:
      df_train['c'][index] = 2

df_train['Age'] = df_train['Age'].astype('int64')

In [None]:
print(f'Porcentagem classificadas: {df_train[df_train.c!=-1].shape[0]/df_train.shape[0]}')
print(f'Porcentagem classificadas corretamente: {df_train[(df_train.c!=0) & (df_train.c==df_train["Age"])].shape[0]/df_train[df_train.c!=-1].shape[0]}')
print(f'Porcentagem classificadas corretamente das totais: {df_train[(df_train.c!=-1) & (df_train.c==df_train["Age"])].shape[0]/df_train.shape[0]}')

Porcentagem classificadas: 0.5458646616541354
Porcentagem classificadas corretamente: 0.5748393021120294
Porcentagem classificadas corretamente das totais: 0.45112781954887216


In [None]:
test_matrix = vectorizer.transform(df_test["Clean_Texts"].to_list())
test_matrix = test_matrix.toarray()

tf_idf_test = pd.DataFrame(data=test_matrix, columns=vocab)

In [None]:
df_test['glex1']=0
df_test['glex2']=0
df_test['glex3']=0

df_test.reset_index(inplace=True)

for index, row in df_test.iterrows():
      sum = 0
      text = row["Clean_Texts"].split()
      for w in text:
        if w in coef1.index.tolist():
          sum += coef1[0][w]*tf_idf_test[w][index]
      sum = sum + svc.intercept_[0]
      df_test[f'glex1'][index] = sum

for index, row in df_test.iterrows():
      sum = 0
      text = row["Clean_Texts"].split()
      for w in text:
        if w in coef2.index.tolist():
          sum += coef2[1][w]*tf_idf_test[w][index]
      sum = sum + svc.intercept_[1]
      df_test[f'glex2'][index] = sum

for index, row in df_test.iterrows():
      sum = 0
      text = row["Clean_Texts"].split()
      for w in text:
        if w in coef3.index.tolist():
          sum += coef3[2][w]*tf_idf_test[w][index]
      sum = sum + svc.intercept_[2]
      df_test[f'glex3'][index] = sum

In [None]:
df_test['c'] = -1

for index, row in df_test.iterrows():
    if row['glex1'] > 0 and row['glex2'] > 0 and row['glex3'] > 0:
      df_test['c'][index] = 0
    elif row['glex1'] < 0 and row['glex2'] > 0 and row['glex3'] > 0:
      df_test['c'][index] = 1
    elif row['glex1'] < 0 and row['glex2'] < 0 and row['glex3'] < 0:
      df_test['c'][index] = 2

df_test['Age'] = df_test['Age'].astype('int64')

In [None]:
print(f'Porcentagem classificadas: {df_test[df_test.c!=-1].shape[0]/df_test.shape[0]}')
print(f'Porcentagem classificadas corretamente: {df_test[(df_test.c!=-1) & (df_test.c==df_test["Age"])].shape[0]/df_test[df_test.c!=-1].shape[0]}')
print(f'Porcentagem classificadas corretamente das totais: {df_test[(df_test.c!=-1) & (df_test.c==df_test["Age"])].shape[0]/df_test.shape[0]}')

Porcentagem classificadas: 0.5811623246492986
Porcentagem classificadas corretamente: 0.4896551724137931
Porcentagem classificadas corretamente das totais: 0.2845691382765531


In [None]:
dict_class_2 = df_test[df_test.c!=-1]
dict_class_2.to_csv('../data/dict_class_2.csv')

In [None]:
df_stat = df_train.groupby(['Age'])[['glex1', 'glex2', 'glex3']].describe().T

In [None]:
df_stat

Unnamed: 0,Age,0,1,2
glex1,count,519.0,781.0,695.0
glex1,mean,2.478223,-28.710805,-49.244245
glex1,std,45.44536,130.117885,267.792819
glex1,min,-385.777662,-2246.089716,-5820.55919
glex1,25%,1.940523,-7.958347,-8.000902
glex1,50%,5.37525,-3.42584,-1.099203
glex1,75%,11.219068,-1.169891,0.889272
glex1,max,166.032492,299.884886,295.723844
glex2,count,519.0,781.0,695.0
glex2,mean,-5.231571,-35.473249,-81.477835


In [None]:
df_test['c50'] = -1

for index, row in df_test.iterrows():
    if row['glex1'] > df_stat[0]['glex1']['50%'] and row['glex2'] > 0 and row['glex3'] > 0:
      df_test['c50'][index] = 0
    elif row['glex1'] < 0 and row['glex2'] > df_stat[1]['glex2']['50%'] and row['glex3'] > 0:
      df_test['c50'][index] = 1
    elif row['glex1'] < 0 and row['glex2'] < 0 and row['glex3'] < df_stat[2]['glex3']['50%']:
      df_test['c50'][index] = 2

df_test['Age'] = df_test['Age'].astype('int64')

print(f'Porcentagem classificadas: {df_test[df_test.c50!=-1].shape[0]/df_test.shape[0]}')
print(f'Porcentagem classificadas corretamente: {df_test[(df_test.c50!=-1) & (df_test.c50==df_test["Age"])].shape[0]/df_test[df_test.c50!=-1].shape[0]}')
print(f'Porcentagem classificadas corretamente das totais: {df_test[(df_test.c50!=-1) & (df_test.c50==df_test["Age"])].shape[0]/df_test.shape[0]}')

Porcentagem classificadas: 0.35070140280561124
Porcentagem classificadas corretamente: 0.5085714285714286
Porcentagem classificadas corretamente das totais: 0.17835671342685372


In [None]:
df_test['c75'] = -1

for index, row in df_test.iterrows():
    if row['glex1'] > df_stat[0]['glex1']['75%'] and row['glex2'] > 0 and row['glex3'] > 0:
      df_test['c75'][index] = 0
    elif row['glex1'] < 0 and row['glex2'] > df_stat[1]['glex2']['75%'] and row['glex3'] > 0:
      df_test['c75'][index] = 1
    elif row['glex1'] < 0 and row['glex2'] < 0 and row['glex3'] < df_stat[2]['glex3']['75%']:
      df_test['c75'][index] = 2

df_test['Age'] = df_test['Age'].astype('int64')

print(f'Porcentagem classificadas: {df_test[df_test.c75!=-1].shape[0]/df_test.shape[0]}')
print(f'Porcentagem classificadas corretamente: {df_test[(df_test.c75!=-1) & (df_test.c75==df_test["Age"])].shape[0]/df_test[df_test.c75!=-1].shape[0]}')
print(f'Porcentagem classificadas corretamente das totais: {df_test[(df_test.c75!=-1) & (df_test.c75==df_test["Age"])].shape[0]/df_test.shape[0]}')

Porcentagem classificadas: 0.24048096192384769
Porcentagem classificadas corretamente: 0.5416666666666666
Porcentagem classificadas corretamente das totais: 0.13026052104208416


In [None]:
df_test['cmean'] = -1

for index, row in df_test.iterrows():
    if row['glex1'] > df_stat[0]['glex1']['mean'] and row['glex2'] > 0 and row['glex3'] > 0:
      df_test['cmean'][index] = 0
    elif row['glex1'] < 0 and row['glex2'] > df_stat[1]['glex2']['mean'] and row['glex3'] > 0:
      df_test['cmean'][index] = 1
    elif row['glex1'] < 0 and row['glex2'] < 0 and row['glex3'] < df_stat[2]['glex3']['mean']:
      df_test['cmean'][index] = 2

df_test['Age'] = df_test['Age'].astype('int64')

print(f'Porcentagem classificadas: {df_test[df_test.cmean!=-1].shape[0]/df_test.shape[0]}')
print(f'Porcentagem classificadas corretamente: {df_test[(df_test.cmean!=-1) & (df_test.cmean==df_test["Age"])].shape[0]/df_test[df_test.cmean!=-1].shape[0]}')
print(f'Porcentagem classificadas corretamente das totais: {df_test[(df_test.cmean!=-1) & (df_test.cmean==df_test["Age"])].shape[0]/df_test.shape[0]}')

Porcentagem classificadas: 0.4288577154308617
Porcentagem classificadas corretamente: 0.5
Porcentagem classificadas corretamente das totais: 0.21442885771543085


In [None]:
df_test['c5025'] = -1

for index, row in df_test.iterrows():
    if row['glex1'] > df_stat[0]['glex1']['50%'] and row['glex2'] > df_stat[0]['glex2']['25%'] and row['glex3'] > df_stat[0]['glex3']['25%']:
      df_test['c5025'][index] = 0
    elif row['glex1'] < df_stat[1]['glex1']['25%'] and row['glex2'] > df_stat[1]['glex2']['50%'] and row['glex3'] > df_stat[1]['glex3']['25%']:
      df_test['c5025'][index] = 1
    elif row['glex1'] < df_stat[2]['glex1']['25%'] and row['glex2'] < df_stat[2]['glex2']['25%'] and row['glex3'] < df_stat[2]['glex3']['50%']:
      df_test['c5025'][index] = 2

df_test['Age'] = df_test['Age'].astype('int64')

print(f'Porcentagem classificadas: {df_test[df_test.c5025!=-1].shape[0]/df_test.shape[0]}')
print(f'Porcentagem classificadas corretamente: {df_test[(df_test.c5025!=-1) & (df_test.c5025==df_test["Age"])].shape[0]/df_test[df_test.c5025!=-1].shape[0]}')
print(f'Porcentagem classificadas corretamente das totais: {df_test[(df_test.c5025!=-1) & (df_test.c5025==df_test["Age"])].shape[0]/df_test.shape[0]}')

Porcentagem classificadas: 0.14228456913827656
Porcentagem classificadas corretamente: 0.6056338028169014
Porcentagem classificadas corretamente das totais: 0.08617234468937876


In [None]:
df_test['c7525'] = -1

for index, row in df_test.iterrows():
    if row['glex1'] > df_stat[0]['glex1']['75%'] and row['glex2'] > df_stat[0]['glex2']['25%'] and row['glex3'] > df_stat[0]['glex3']['25%']:
      df_test['c7525'][index] = 0
    elif row['glex1'] < df_stat[1]['glex1']['25%'] and row['glex2'] > df_stat[1]['glex2']['75%'] and row['glex3'] > df_stat[1]['glex3']['25%']:
      df_test['c7525'][index] = 1
    elif row['glex1'] < df_stat[2]['glex1']['25%'] and row['glex2'] < df_stat[2]['glex2']['25%'] and row['glex3'] < df_stat[2]['glex3']['75%']:
      df_test['c7525'][index] = 2

df_test['Age'] = df_test['Age'].astype('int64')

print(f'Porcentagem classificadas: {df_test[df_test.c7525!=-1].shape[0]/df_test.shape[0]}')
print(f'Porcentagem classificadas corretamente: {df_test[(df_test.c7525!=-1) & (df_test.c7525==df_test["Age"])].shape[0]/df_test[df_test.c7525!=-1].shape[0]}')
print(f'Porcentagem classificadas corretamente das totais: {df_test[(df_test.c7525!=-1) & (df_test.c7525==df_test["Age"])].shape[0]/df_test.shape[0]}')

Porcentagem classificadas: 0.0841683366733467
Porcentagem classificadas corretamente: 0.7142857142857143
Porcentagem classificadas corretamente das totais: 0.06012024048096192


In [None]:
df_test['cmean25'] = -1

for index, row in df_test.iterrows():
    if row['glex1'] > df_stat[0]['glex1']['mean'] and row['glex2'] > df_stat[0]['glex2']['25%'] and row['glex3'] > df_stat[0]['glex3']['25%']:
      df_test['cmean25'][index] = 0
    elif row['glex1'] < df_stat[1]['glex1']['25%'] and row['glex2'] > df_stat[1]['glex2']['mean'] and row['glex3'] > df_stat[1]['glex3']['25%']:
      df_test['cmean25'][index] = 1
    elif row['glex1'] < df_stat[2]['glex1']['25%'] and row['glex2'] < df_stat[2]['glex2']['25%'] and row['glex3'] < df_stat[2]['glex3']['mean']:
      df_test['cmean25'][index] = 2

df_test['Age'] = df_test['Age'].astype('int64')

print(f'Porcentagem classificadas: {df_test[df_test.cmean25!=-1].shape[0]/df_test.shape[0]}')
print(f'Porcentagem classificadas corretamente: {df_test[(df_test.cmean25!=-1) & (df_test.cmean25==df_test["Age"])].shape[0]/df_test[df_test.cmean25!=-1].shape[0]}')
print(f'Porcentagem classificadas corretamente das totais: {df_test[(df_test.cmean25!=-1) & (df_test.cmean25==df_test["Age"])].shape[0]/df_test.shape[0]}')

Porcentagem classificadas: 0.23246492985971945
Porcentagem classificadas corretamente: 0.5517241379310345
Porcentagem classificadas corretamente das totais: 0.1282565130260521


In [None]:
test_df_ = df_test[df_test.c7525==-1]

In [None]:
dict_class = df_test[df_test.c7525!=-1]
dict_class.to_csv('../data/dict_class.csv')

In [None]:
dict_class.shape

(42, 18)

In [None]:
test_df.shape

(508, 16)

In [None]:
test_df_.shape

(413, 16)

In [None]:
test_df_.to_csv('not-classified-5025.csv')

In [None]:
from numpy.random import seed
seed(1)
import tensorflow
tensorflow.random.set_seed(1)
import pandas as pd
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from collections import Counter
from tensorflow import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten, Activation
from keras.layers.convolutional import Conv1D, MaxPooling1D, Convolution1D
from keras.optimizers import Adadelta
from sklearn import metrics
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import json
import time
import datetime
import numpy as np

2024-11-09 13:13:29.949292: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-09 13:13:29.949351: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[nltk_data] Downloading package stopwords to /home/alice/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def remove_stopwords(text):
    all_words = text.split(" ")
    clean_text = [i for i in all_words if i not in stopwords and i!=""]
    return " ".join(clean_text)

def remove_html_tags(text):
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def lower_texts(text):
    return text.lower()

def clean_text(text):
    clean_text = remove_stopwords(text)
    clean_text = remove_html_tags(clean_text)
    clean_text = lower_texts(clean_text)
    return clean_text

def create_model_age(filters = [100], kernel_size = [50], strides = [100],
                 dropout_rate = 0.5, pool_size = [5], dense_units = 100, max_len = 1000):

    model = Sequential()

    # conv 1
    model.add(Conv1D(filters = filters[0],
                     kernel_size = kernel_size[0],
                     strides = strides[0],
                     activation = 'relu',
                     input_shape = (max_len, 1) ))

    # pooling layer 1
    for i in range(len(pool_size)):
        model.add(MaxPooling1D(pool_size = pool_size[i], strides = 1))
        model.add(Activation('relu'))

    #model.add(Activation('relu'))

    model.add(Flatten())

    if dropout_rate is not None:
        model.add(Dropout(dropout_rate))

    model.add(Dense(units = dense_units, activation = 'relu'))
    model.add(Dense(units = 3, activation = 'softmax'))

    model.compile(loss='categorical_crossentropy', optimizer = Adadelta(
       learning_rate=1, name="Adadelta"
    ), metrics = ['accuracy'])
    return model

def formatTime(seg):
    return str(datetime.timedelta(seconds=seg))

In [None]:
df.head()

Unnamed: 0,AuthorID,Qual sua idade?,Qual seu sexo?,Texts,Age
0,11807897332463595720,19 à 25 anos,Masculino,O designer inglês Graham Smith criou uma série...,0
1,5737856882009129069,mais de 60 anos,Masculino,\n\n\n\n\n\n\n\n\n\nA sonda espacial Soho da N...,2
2,4650502123448146932,40 à 60 anos,Masculino,O pessoal da ultra-direita anda mais animado q...,2
3,2534662484984494401,40 à 60 anos,Masculino,\n\nColaborou Vandeli - 26/12/1 12:25h\n\n\nA ...,2
4,8353750961625391396,30 à 40 anos,Masculino,\n\n\n\n\nA Sra. Tweedy (Miranda Richardson/Ná...,1


In [None]:
X = df[['AuthorID', 'Texts']]
y = df["Age"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=115, stratify=y)

X_train = X_train["Texts"].apply(clean_text).to_numpy()
y_train = y_train.to_numpy()

In [None]:
num_words_vec = []
for text in (X_train):
    num_words_vec.append(len(text.split()))

In [None]:
sum_words = 0
for i in num_words_vec:
    sum_words += i

In [None]:
mean = sum_words//len(num_words_vec)

train_texts = X_train.tolist()
tfidfvec = TfidfVectorizer(max_features = mean, max_df=0.9)
tfidfvec.fit(train_texts)
tfidf_train = tfidfvec.transform(train_texts).toarray()

X_train = tfidf_train.reshape(tfidf_train.shape[0],tfidf_train.shape[1],1)
y_train = keras.utils.to_categorical(y_train,num_classes=3)

In [None]:
X_test_ = test_df_['Clean_Texts']
y_test_ = test_df_['Age']

tfidf_test_ = tfidfvec.transform(X_test_.to_list()).toarray()
X_test_ = tfidf_test_.reshape(tfidf_test_.shape[0],tfidf_test_.shape[1],1)

y_test_ = keras.utils.to_categorical(y_test_,num_classes=3)

size_ = tfidf_test_.shape[1]

In [None]:
train_df[train_df['index']==1846]

Unnamed: 0,index,AuthorID,Texts,Clean_Texts,Age,glex1,glex2,glex3,c
100,1846,2500686344285414074,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,,1,-0.359016,-0.245551,0.174682,0


In [None]:
from keras.callbacks import EarlyStopping

model_ = create_model_age(filters=[100], kernel_size=[1], strides=[2], dropout_rate=0.6, pool_size=[4], dense_units = 512, max_len = size_)
callback = [
            EarlyStopping(patience=5, monitor='val_accuracy', min_delta=0.1, mode='max', restore_best_weights=True),
        ]

model_.fit(X_train,y_train,validation_data=(X_test_,y_test_), batch_size=32, epochs=10, callbacks=callback)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.History at 0x7f61285dcf70>

In [None]:
from keras.callbacks import EarlyStopping

f1 = []
ac = []

for  i in range(10):
    model_ = create_model_age(filters=[100], kernel_size=[1], strides=[2], dropout_rate=0.6, pool_size=[4], dense_units = 512, max_len = size_)
    callback = [
                EarlyStopping(patience=5, monitor='val_accuracy', min_delta=0.1, mode='max', restore_best_weights=True),
            ]

    model_.fit(X_train,y_train,validation_data=(X_test_,y_test_), batch_size=32, epochs=10, callbacks=callback)

    y_pred_ = model_.predict(
        X_test_
    )

    import numpy as np
    y_pred_list_ = [np.argmax(x, axis=-1) for x in y_pred_]
    y_test_list_ =[np.argmax(x, axis=-1) for x in y_test_]

    dict_class = test_df[test_df.c7525!=0]['Age_']
    dict_pred = test_df[test_df.c7525!=0]['c7525']

    class_total = y_test_list_+dict_class.to_list()
    pred_total = y_pred_list_+dict_pred.to_list()

    ac.append(metrics.f1_score(class_total, pred_total, average='macro'))



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
[0.4330708661417323, 0.42322834645669294, 0.4409448818897638, 0.4566929133858268, 0.4311023622047244, 0.4409448818897638, 0.43503937007874016, 0.43503937007874016, 0.4330708661417323, 0.43503937007874016]


TypeError: 'numpy.float64' object is not callable

In [None]:
print(np.mean(ac))

0.4364173228346456


In [None]:
y_pred_ = model_.predict(
    X_test_
)

import numpy as np
y_pred_list_ = [np.argmax(x, axis=-1) for x in y_pred_]
y_test_list_ =[np.argmax(x, axis=-1) for x in y_test_]

metrics.f1_score(y_test_list_, y_pred_list_, average='macro')



0.41646489104116224

In [None]:
dict_class = test_df[test_df.c7525!=0]['Age_']
dict_pred = test_df[test_df.c7525!=0]['c7525']

In [None]:
metrics.f1_score(dict_class, dict_pred, average='macro')

0.5368421052631579

In [None]:
class_total = y_test_list_+dict_class.to_list()
pred_total = y_pred_list_+dict_pred.to_list()

In [None]:
metrics.f1_score(class_total, pred_total, average='macro')

0.4389763779527559