In [1]:
from numpy.random import seed
seed(1)
import tensorflow 
tensorflow.random.set_seed(1) 
import pandas as pd


In [2]:
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alici\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [6]:
from tensorflow import keras

In [7]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten, Activation
from keras.layers.convolutional import Conv1D, MaxPooling1D, Convolution1D
from keras.optimizers import Adadelta

In [8]:
from sklearn import metrics

In [9]:
import numpy as np

In [10]:
def remove_stopwords(text):
    all_words = text.split(" ")
    clean_text = [i for i in all_words if i not in stopwords and i!=""]
    return " ".join(clean_text)

In [11]:
def remove_html_tags(text):
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [12]:
def lower_texts(text):
    return text.lower()

In [13]:
def clean_text(text):
    clean_text = remove_stopwords(text)
    clean_text = remove_html_tags(clean_text)
    clean_text = lower_texts(clean_text)
    return clean_text

In [14]:
def create_model_gender(filters = [100], kernel_size = [50], strides = [100], 
                 dropout_rate = 0.5, pool_size = [5], dense_units = 100, max_len = 1000):

    model = Sequential()

    # conv 1
    model.add(Conv1D(filters = filters[0], 
                     kernel_size = kernel_size[0],
                     strides = strides[0], 
                     activation = 'relu', 
                     input_shape = (max_len, 1) ))

    # pooling layer 1
    for i in range(len(pool_size)):
        model.add(MaxPooling1D(pool_size = pool_size[i], strides = 1))
        model.add(Activation('relu'))
    
    #model.add(Activation('relu'))
    
    model.add(Flatten())
    
    if dropout_rate is not None:
        model.add(Dropout(dropout_rate))
        
    model.add(Dense(units = dense_units, activation = 'relu'))
    model.add(Dense(units = 2, activation = 'softmax'))

    model.compile(loss='categorical_crossentropy', optimizer = Adadelta(
       learning_rate=1, name="Adadelta"
    ), metrics = ['accuracy'])
    return model

In [15]:
def create_model_age(filters = [100], kernel_size = [50], strides = [100], 
                 dropout_rate = 0.5, pool_size = [5], dense_units = 100, max_len = 1000):

    model = Sequential()

    # conv 1
    model.add(Conv1D(filters = filters[0], 
                     kernel_size = kernel_size[0],
                     strides = strides[0], 
                     activation = 'relu', 
                     input_shape = (max_len, 1) ))

    # pooling layer 1
    for i in range(len(pool_size)):
        model.add(MaxPooling1D(pool_size = pool_size[i], strides = 1))
        model.add(Activation('relu'))
    
    #model.add(Activation('relu'))
    
    model.add(Flatten())
    
    if dropout_rate is not None:
        model.add(Dropout(dropout_rate))
        
    model.add(Dense(units = dense_units, activation = 'relu'))
    model.add(Dense(units = 3, activation = 'softmax'))

    model.compile(loss='categorical_crossentropy', optimizer = Adadelta(
       learning_rate=1, name="Adadelta"
    ), metrics = ['accuracy'])
    return model

In [17]:
df_train = pd.read_csv(r"C:\Users\alici\Documents\tcc\github2\tcc-v2\blogset-br\particoes\houdout\train.csv")
df_test = pd.read_csv(r"C:\Users\alici\Documents\tcc\github2\tcc-v2\blogset-br\particoes\houdout\test.csv")

f = []
l_fem = []
l_masc = []
h_total = []
for _ in range(10):
    X_train = df_train["Texts"].apply(clean_text).to_numpy()
    X_test = df_test["Texts"].apply(clean_text).to_numpy()
    y_train_gender = df_train["GenderClass"].to_numpy()
    y_test_gender = df_test["GenderClass"].to_numpy()

    num_words = []
    for text in (X_train.tolist()+X_test.tolist()):
        num_words.append(len(text.split()))

    mean = sum(num_words)//len(num_words)

    train_texts = X_train.tolist()
    test_texts = X_test.tolist()

    tfidfvec = TfidfVectorizer(max_features = mean, max_df=0.9)
    tfidfvec.fit(train_texts)
    tfidf_train = tfidfvec.transform(train_texts).toarray()
    tfidf_test = tfidfvec.transform(test_texts).toarray()

    X_train = tfidf_train.reshape(tfidf_train.shape[0],tfidf_train.shape[1],1)
    X_test = tfidf_test.reshape(tfidf_test.shape[0],tfidf_test.shape[1],1)

    y_train_gander_cat = keras.utils.to_categorical(y_train_gender,num_classes=2)
    y_test_gender_cat = keras.utils.to_categorical(y_test_gender,num_classes=2)

    size = X_test.shape[1]

    model_gender = Sequential()
    model_gender.add(Conv1D(filters = 100, 
                        kernel_size = 3,
                        strides = 1, 
                        activation = 'relu', 
                        input_shape = (size, 1) ))
    model_gender.add(MaxPooling1D(pool_size = 4, strides = 1))
    model_gender.add(Activation('relu')) # add layer
    model_gender.add(Flatten())
    model_gender.add(Dropout(0.4))
    model_gender.add(Dense(units = 512, activation = 'relu')) #add relu
    model_gender.add(Dense(units = 2, activation = 'softmax'))
    model_gender.compile(loss='categorical_crossentropy', optimizer = Adadelta(
        learning_rate=1, name="Adadelta"
    ), metrics = ['accuracy'])

    model_gender.fit(X_train,y_train_gander_cat,validation_data=(X_test,y_test_gender_cat), batch_size=32)

    y_pred_gender = model_gender.predict(
        X_test
    )

    y_pred_list_gender = [np.argmax(x, axis=-1) for x in y_pred_gender]

    df_test["PredictGender"] = y_pred_list_gender

    y_train_age = df_train["AgeClass"].to_numpy()
    y_test_age = df_test["AgeClass"].to_numpy()

    y_train_age = keras.utils.to_categorical(y_train_age,num_classes=3)
    y_test_age_cat = keras.utils.to_categorical(y_test_age,num_classes=3)

    model_age = Sequential()
    model_age.add(Conv1D(filters = 100, 
                        kernel_size = 3,
                        strides = 1, 
                        activation = 'relu', 
                        input_shape = (size, 1) ))
    model_age.add(MaxPooling1D(pool_size = 4, strides = 1))
    model_age.add(Activation('relu')) # add layer
    model_age.add(Flatten())
    model_age.add(Dropout(0.4))
    model_age.add(Dense(units = 512, activation = 'relu')) #add relu
    model_age.add(Dense(units = 3, activation = 'softmax'))
    model_age.compile(loss='categorical_crossentropy', optimizer = Adadelta(
        learning_rate=1, name="Adadelta"
    ), metrics = ['accuracy'])

    model_age.fit(X_train,y_train_age,validation_data=(X_test,y_test_age_cat), batch_size=32)

    y_pred_age = model_age.predict(
        X_test
    )

    y_pred_list_age = [np.argmax(x, axis=-1) for x in y_pred_age]

    print("juntos: ", end="")
    print(metrics.f1_score(y_test_age, y_pred_list_age, average='macro'))
    f.append(metrics.f1_score(y_test_age, y_pred_list_age, average='macro'))

    df_train_fem = df_train[df_train["GenderClass"]==1]
    df_test_fem = df_test[df_test["PredictGender"]==1]

    X_train_fem = df_train_fem["Texts"].apply(clean_text).to_numpy()
    X_test_fem = df_test_fem["Texts"].apply(clean_text).to_numpy()
    y_train_fem = df_train_fem["AgeClass"].to_numpy()
    y_test_fem = df_test_fem["AgeClass"].to_numpy()

    num_words_fem = []
    for text in (X_train_fem.tolist()+X_test_fem.tolist()):
        num_words_fem.append(len(text.split()))

    mean_fem = sum(num_words_fem)//len(num_words_fem)

    train_texts_fem = X_train_fem.tolist()
    test_texts_fem = X_test_fem.tolist()

    tfidfvec_fem = TfidfVectorizer(max_features = mean_fem, max_df=0.9)
    tfidfvec_fem.fit(train_texts_fem)
    tfidf_train_fem = tfidfvec_fem.transform(train_texts_fem).toarray()
    tfidf_test_fem = tfidfvec_fem.transform(test_texts_fem).toarray()

    X_train_fem = tfidf_train_fem.reshape(tfidf_train_fem.shape[0],tfidf_train_fem.shape[1],1)
    X_test_fem = tfidf_test_fem.reshape(tfidf_test_fem.shape[0],tfidf_test_fem.shape[1],1)

    y_train_fem = keras.utils.to_categorical(y_train_fem,num_classes=3)
    y_test_fem_cat = keras.utils.to_categorical(y_test_fem,num_classes=3)

    size_fem = X_test_fem.shape[1]

    model_fem = Sequential()
    model_fem.add(Conv1D(filters = 100, 
                        kernel_size = 3,
                        strides = 1, 
                        activation = 'relu', 
                        input_shape = (size_fem, 1) ))
    model_fem.add(MaxPooling1D(pool_size = 4, strides = 1))
    model_fem.add(Activation('relu')) # add layer
    model_fem.add(Flatten())
    model_fem.add(Dropout(0.4))
    model_fem.add(Dense(units = 512, activation = 'relu')) #add relu
    model_fem.add(Dense(units = 3, activation = 'softmax'))

    model_fem.compile(loss='categorical_crossentropy', optimizer = Adadelta(
        learning_rate=1, name="Adadelta"
    ), metrics = ['accuracy'])

    model_fem.fit(X_train_fem,y_train_fem,validation_data=(X_test_fem, y_test_fem_cat), batch_size=32)

    y_pred_fem = model_fem.predict(
        X_test_fem
    )

    y_pred_list_fem = [np.argmax(x, axis=-1) for x in y_pred_fem]
    print("fem: ", end="")
    print(metrics.f1_score(y_test_fem, y_pred_list_fem, average='macro'))
    l_fem.append(metrics.f1_score(y_test_fem, y_pred_list_fem, average='macro'))


    df_train_masc = df_train[df_train["GenderClass"]==0]
    df_test_masc = df_test[df_test["PredictGender"]==0]

    X_train_masc = df_train_masc["Texts"].apply(clean_text).to_numpy()
    X_test_masc = df_test_masc["Texts"].apply(clean_text).to_numpy()
    y_train_masc = df_train_masc["AgeClass"].to_numpy()
    y_test_masc = df_test_masc["AgeClass"].to_numpy()

    num_words_masc = []
    for text in (X_train_masc.tolist()+X_test_masc.tolist()):
        num_words_masc.append(len(text.split()))

    mean_masc = sum(num_words_masc)//len(num_words_masc)

    train_texts_masc = X_train_masc.tolist()
    test_texts_masc = X_test_masc.tolist()

    tfidfvec_masc = TfidfVectorizer(max_features = mean_masc, max_df=0.9)
    tfidfvec_masc.fit(train_texts_masc)
    tfidf_train_masc = tfidfvec_masc.transform(train_texts_masc).toarray()
    tfidf_test_masc = tfidfvec_masc.transform(test_texts_masc).toarray()

    X_train_masc = tfidf_train_masc.reshape(tfidf_train_masc.shape[0],tfidf_train_masc.shape[1],1)
    X_test_masc = tfidf_test_masc.reshape(tfidf_test_masc.shape[0],tfidf_test_masc.shape[1],1)

    y_train_masc = keras.utils.to_categorical(y_train_masc,num_classes=3)
    y_test_masc_cat = keras.utils.to_categorical(y_test_masc,num_classes=3)

    size_masc = X_test_masc.shape[1]

    model_masc = Sequential()
    model_masc.add(Conv1D(filters = 100, 
                        kernel_size = 3,
                        strides = 1, 
                        activation = 'relu', 
                        input_shape = (size_masc, 1) ))
    model_masc.add(MaxPooling1D(pool_size = 4, strides = 1))
    model_masc.add(Activation('relu')) # add layer
    model_masc.add(Flatten())
    model_masc.add(Dropout(0.4))
    model_masc.add(Dense(units = 512, activation = 'relu')) #add relu
    model_masc.add(Dense(units = 3, activation = 'softmax'))

    model_masc.compile(loss='categorical_crossentropy', optimizer = Adadelta(
        learning_rate=1, name="Adadelta"
    ), metrics = ['accuracy'])

    model_masc.fit(X_train_masc,y_train_masc,validation_data=(X_test_masc,y_test_masc_cat), batch_size=32)

    y_pred_masc = model_masc.predict(
        X_test_masc
    )

    y_pred_list_masc = [np.argmax(x, axis=-1) for x in y_pred_masc]

    print("masc: ", end="")
    print(metrics.f1_score(y_test_masc, y_pred_list_masc, average='macro'))
    l_masc.append(metrics.f1_score(y_test_masc, y_pred_list_masc, average='macro'))

    y_test_sep = y_test_fem.tolist() + y_test_masc.tolist()
    y_pred_list_sep = y_pred_list_fem + y_pred_list_masc
    print("separado: ", end="")
    print(metrics.f1_score(y_test_sep, y_pred_list_sep, average='macro'))
    h_total.append(metrics.f1_score(y_test_sep, y_pred_list_sep, average='macro'))

print("\n\n\n\n\n\nRESULTADOS FINAIS")
print("flat: ", end="")
print(sum(f)/len(f))
print("local fem: ", end="")
print(sum(l_fem)/len(l_fem))
print("local masc: ", end="")
print(sum(l_masc)/len(l_masc))
print("hierq: ", end="")
print(sum(h_total)/len(h_total))

print("\n\n\n\n\n\nVETORES")
print("flat: ", end="")
print(f)
print("local fem: ", end="")
print(l_fem)
print("local masc: ", end="")
print(l_masc)
print("hierq: ", end="")
print(h_total)

juntos: 0.4337421742227465
fem: 0.3980842911877394
masc: 0.3063554802685237
separado: 0.40223775987569876
juntos: 0.40730170532075355
fem: 0.37423946483625664
masc: 0.34371643394199786
separado: 0.4257702079938321
juntos: 0.42873082520284944
fem: 0.3412698412698412
masc: 0.3289549772821148
separado: 0.41055782445295214
juntos: 0.40638576195135245
fem: 0.35371318822023046
masc: 0.33571393145861234
separado: 0.4143108220682059
juntos: 0.45169742730209905
fem: 0.4342227202519761
masc: 0.3506219692177946
separado: 0.45028180354267316
juntos: 0.43384843369964105
fem: 0.3950756558331096
masc: 0.313433538641201
separado: 0.42923219607127655
juntos: 0.42882528385513563
fem: 0.39179621484291943
masc: 0.3387790775850477
separado: 0.4292905270342742
juntos: 0.4300741235396484
fem: 0.39075630252100835
masc: 0.3369458128078818
separado: 0.4287071722932814
juntos: 0.4201165516321892
fem: 0.4405844878043779
masc: 0.33377279718743136
separado: 0.44620366542648476
juntos: 0.43549298569509537
fem: 0.392