### Importações e Funcções

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alici\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [5]:
import json

In [6]:
def createdfs(i):
    df_train = pd.read_csv(r"C:\Users\alici\Documents\tcc\v3\particoes\b5\k_"+str(i)+"/train.csv")
    df_test = pd.read_csv(r"C:\Users\alici\Documents\tcc\v3\particoes\b5\k_"+str(i)+"/test.csv")
    df_train["Transf_Text"] = df_train["Text"].apply((lambda x: (str(x)).split("#")))
    df_test["Transf_Text"] = df_test["Text"].apply((lambda x: (str(x)).split("#")))
    return df_train, df_test

In [7]:
def remove_stopwords(texts):
    result = []
    for text in texts:
        all_words = text.split(" ")
        clean_text = [i for i in all_words if i not in stopwords and i!=""]
        result.append(" ".join(clean_text))
    return result

In [8]:
def getstringtext(df):
    texts = df["CleanText"].to_list()
    string_text = []
    for l in texts:
        res = " ".join(l)
        res = res.replace("\"", "")
        string_text.append(res)
    return string_text

In [9]:
def genderGroups(text):
    if(text == "f"):
        return 1
    else:
        return 0

In [10]:
def formatTime(seg):
    min = seg//60
    seg = seg % 60
    return str(min)+"min"+str(round(seg, 2))+"s"

In [11]:
def mean_gender(f1s):
    soma = [0, 0]
    for a1, a2 in f1s:
        soma[0] += a1
        soma[1] += a2
    return [(soma[0]/len(f1s)), (soma[1]/len(f1s))]

In [12]:
def mean_age(f1s):
    soma = [0, 0, 0]
    for a1, a2, a3 in f1s:
        soma[0] += a1
        soma[1] += a2
        soma[2] += a3
    return [(soma[0]/len(f1s)), (soma[1]/len(f1s)), (soma[2]/len(f1s))]

In [13]:
with open(r'C:\Users\alici\Documents\tcc\v3\melhores_param\b5\gender.json', encoding='utf-8') as f:
    gender_param = json.load(f)

In [14]:
with open(r'C:\Users\alici\Documents\tcc\v3\melhores_param\b5\age.json', encoding='utf-8') as f:
    age_param = json.load(f)

In [15]:
with open(r'C:\Users\alici\Documents\tcc\v3\melhores_param\b5\age_fem.json', encoding='utf-8') as f:
    age_fem_param = json.load(f)

In [16]:
with open(r'C:\Users\alici\Documents\tcc\v3\melhores_param\b5\age_masc.json', encoding='utf-8') as f:
    age_masc_param = json.load(f)

In [17]:
from warnings import filterwarnings
filterwarnings('ignore')

### Hierárquica (completa)

In [18]:
# melhores parametros
ages_macro = []
ages_fem_macro = []
ages_masc_macro = []
ages_sep_macro = []
gender_macro = []

for i in range(10):
    df_train_gender, df_test_gender = createdfs(i+1)

    df_train_gender['CleanText'] = df_train_gender['Transf_Text'].apply(remove_stopwords)
    df_test_gender['CleanText'] = df_test_gender['Transf_Text'].apply(remove_stopwords)

    df_train_gender["GenderClass"] = df_train_gender["ActualGender"].apply(genderGroups)
    df_test_gender["GenderClass"] = df_test_gender["ActualGender"].apply(genderGroups)

    train_texts_gender = getstringtext(df_train_gender)
    test_texts_gender = getstringtext(df_test_gender)

    tfvec = TfidfVectorizer(max_features = 3000)
    tfvec.fit(train_texts_gender)
    tdf_train = tfvec.transform(train_texts_gender).toarray().tolist()
    tdf_test = tfvec.transform(test_texts_gender).toarray().tolist()

    gender_train = df_train_gender["GenderClass"].to_list()
    gender_test = df_test_gender["GenderClass"].to_list()

    logisticRegr_gender = LogisticRegression(penalty=gender_param["k_"+str(i+1)]["params"]["penalty"], C=gender_param["k_"+str(i+1)]["params"]["C"])
    logisticRegr_gender.fit(tdf_train, gender_train)

    pred_gender=logisticRegr_gender.predict(tdf_test)
    gender_macro.append(metrics.f1_score(gender_test, pred_gender, average='macro'))

    #genero
    df_test_gender["PredictGender"] = pred_gender

    # separa as partições de teste com base no predito

    df_train_fem = df_train_gender[df_train_gender["GenderClass"]==1]
    df_train_masc = df_train_gender[df_train_gender["GenderClass"]==0]

    df_test_fem = df_test_gender[df_test_gender["PredictGender"]==1]
    df_test_masc = df_test_gender[df_test_gender["PredictGender"]==0]

    age_train = df_train_gender["AgeClass"].to_list()
    age_test = df_test_gender["AgeClass"].to_list()
    logisticRegr = LogisticRegression(penalty=age_param["k_"+str(i+1)]["params"]["penalty"], C=age_param["k_"+str(i+1)]["params"]["C"])
    logisticRegr.fit(tdf_train, age_train)
    pred_age=logisticRegr.predict(tdf_test)
    ages_macro.append(metrics.f1_score(age_test, pred_age, average='macro'))

    train_texts_fem = getstringtext(df_train_fem)
    test_texts_fem = getstringtext(df_test_fem)
    tfvec_fem = TfidfVectorizer(max_features = 3000)
    tfvec_fem.fit(train_texts_fem)
    tdf_train_fem = tfvec_fem.transform(train_texts_fem).toarray().tolist()
    tdf_test_fem = tfvec_fem.transform(test_texts_fem).toarray().tolist()
    age_train_fem = df_train_fem["AgeClass"].to_list()
    age_test_fem = df_test_fem["AgeClass"].to_list()
    logisticRegr_fem = LogisticRegression(penalty=age_fem_param["k_"+str(i+1)]["params"]["penalty"], C=age_fem_param["k_"+str(i+1)]["params"]["C"])
    logisticRegr_fem.fit(tdf_train_fem, age_train_fem)
    pred_age_fem=logisticRegr_fem.predict(tdf_test_fem)
    ages_fem_macro.append(metrics.f1_score(age_test_fem, pred_age_fem, average='macro'))

    train_texts_masc = getstringtext(df_train_masc)
    test_texts_masc = getstringtext(df_test_masc)
    tfvec_masc = TfidfVectorizer(max_features = 3000)
    tfvec_masc.fit(train_texts_masc)
    tdf_train_masc = tfvec_masc.transform(train_texts_masc).toarray().tolist()
    tdf_test_masc = tfvec_masc.transform(test_texts_masc).toarray().tolist()
    age_train_masc = df_train_masc["AgeClass"].to_list()
    age_test_masc = df_test_masc["AgeClass"].to_list()
    logisticRegr_masc = LogisticRegression(penalty=age_masc_param["k_"+str(i+1)]["params"]["penalty"], C=age_masc_param["k_"+str(i+1)]["params"]["C"])
    logisticRegr_masc.fit(tdf_train_masc, age_train_masc)
    pred_age_masc=logisticRegr_masc.predict(tdf_test_masc)
    ages_masc_macro.append(metrics.f1_score(age_test_masc, pred_age_masc, average='macro'))

    pred_ages_sep = pred_age_fem.tolist() + pred_age_masc.tolist()
    y_test = age_test_fem + age_test_masc
    ages_sep_macro.append(metrics.f1_score(y_test, pred_ages_sep, average='macro'))

print()
print("genero:", end=" ")
print(sum(gender_macro)/len(gender_macro))
print("juntos:", end=" ")
print(sum(ages_macro)/len(ages_macro))
print("só fem:", end=" ")
print(sum(ages_fem_macro)/len(ages_fem_macro))
print("só masc:", end=" ")
print(sum(ages_masc_macro)/len((ages_masc_macro)))
print("separado:", end=" ")
print(sum(ages_sep_macro)/len(ages_sep_macro))

print("\n\n\nVETORES")
print(ages_macro)
print(ages_sep_macro)


genero: 0.8718810949029738
juntos: 0.5692717805080896
só fem: 0.6150094645536831
só masc: 0.5430929061856283
separado: 0.6013560013776474



VETORES
[0.4599326599326599, 0.4323986604688359, 0.6906906906906908, 0.5885878489326765, 0.5705329153605015, 0.5990750108397167, 0.5553814002089864, 0.5677919298608952, 0.6553798288825512, 0.5729468599033817]
[0.5880209106350698, 0.4738477134121417, 0.7124292946783423, 0.46784696784696783, 0.600250626566416, 0.5775144243377918, 0.5918589577391756, 0.6255952380952381, 0.6681539224233722, 0.708041958041958]


In [19]:
np.std(ages_macro), np.std(ages_sep_macro)

(0.07370075600177396, 0.07980561343233857)

In [20]:
from scipy.stats import wilcoxon
wilcoxon(np.array(ages_macro), np.array(ages_sep_macro), alternative='greater')

WilcoxonResult(statistic=10.0, pvalue=0.9677734375)

In [24]:
# literatura
ages_macro = []
ages_fem_macro = []
ages_masc_macro = []
ages_sep_macro = []
gender_macro = []

for i in range(10):
    df_train_gender, df_test_gender = createdfs(i+1)

    df_train_gender['CleanText'] = df_train_gender['Transf_Text'].apply(remove_stopwords)
    df_test_gender['CleanText'] = df_test_gender['Transf_Text'].apply(remove_stopwords)

    df_train_gender["GenderClass"] = df_train_gender["ActualGender"].apply(genderGroups)
    df_test_gender["GenderClass"] = df_test_gender["ActualGender"].apply(genderGroups)

    train_texts_gender = getstringtext(df_train_gender)
    test_texts_gender = getstringtext(df_test_gender)

    tfvec = TfidfVectorizer(max_features = 3000)
    tfvec.fit(train_texts_gender)
    tdf_train = tfvec.transform(train_texts_gender).toarray().tolist()
    tdf_test = tfvec.transform(test_texts_gender).toarray().tolist()

    gender_train = df_train_gender["GenderClass"].to_list()
    gender_test = df_test_gender["GenderClass"].to_list()

    logisticRegr_gender = LogisticRegression(penalty="l2", C=1000)
    logisticRegr_gender.fit(tdf_train, gender_train)

    pred_gender=logisticRegr_gender.predict(tdf_test)
    gender_macro.append(metrics.f1_score(gender_test, pred_gender, average='macro'))

    #genero
    df_test_gender["PredictGender"] = pred_gender

    # separa as partições de teste com base no predito

    df_train_fem = df_train_gender[df_train_gender["GenderClass"]==1]
    df_train_masc = df_train_gender[df_train_gender["GenderClass"]==0]

    df_test_fem = df_test_gender[df_test_gender["PredictGender"]==1]
    df_test_masc = df_test_gender[df_test_gender["PredictGender"]==0]

    age_train = df_train_gender["AgeClass"].to_list()
    age_test = df_test_gender["AgeClass"].to_list()
    logisticRegr = LogisticRegression(penalty="l2", C=1000)
    logisticRegr.fit(tdf_train, age_train)
    pred_age=logisticRegr.predict(tdf_test)
    ages_macro.append(metrics.f1_score(age_test, pred_age, average='macro'))

    train_texts_fem = getstringtext(df_train_fem)
    test_texts_fem = getstringtext(df_test_fem)
    tfvec_fem = TfidfVectorizer(max_features = 3000)
    tfvec_fem.fit(train_texts_fem)
    tdf_train_fem = tfvec_fem.transform(train_texts_fem).toarray().tolist()
    tdf_test_fem = tfvec_fem.transform(test_texts_fem).toarray().tolist()
    age_train_fem = df_train_fem["AgeClass"].to_list()
    age_test_fem = df_test_fem["AgeClass"].to_list()
    logisticRegr_fem = LogisticRegression(penalty="l2", C=1000)
    logisticRegr_fem.fit(tdf_train_fem, age_train_fem)
    pred_age_fem=logisticRegr_fem.predict(tdf_test_fem)
    ages_fem_macro.append(metrics.f1_score(age_test_fem, pred_age_fem, average='macro'))

    train_texts_masc = getstringtext(df_train_masc)
    test_texts_masc = getstringtext(df_test_masc)
    tfvec_masc = TfidfVectorizer(max_features = 3000)
    tfvec_masc.fit(train_texts_masc)
    tdf_train_masc = tfvec_masc.transform(train_texts_masc).toarray().tolist()
    tdf_test_masc = tfvec_masc.transform(test_texts_masc).toarray().tolist()
    age_train_masc = df_train_masc["AgeClass"].to_list()
    age_test_masc = df_test_masc["AgeClass"].to_list()
    logisticRegr_masc = LogisticRegression(penalty="l2", C=1000)
    logisticRegr_masc.fit(tdf_train_masc, age_train_masc)
    pred_age_masc=logisticRegr_masc.predict(tdf_test_masc)
    ages_masc_macro.append(metrics.f1_score(age_test_masc, pred_age_masc, average='macro'))

    pred_ages_sep = pred_age_fem.tolist() + pred_age_masc.tolist()
    y_test = age_test_fem + age_test_masc
    ages_sep_macro.append(metrics.f1_score(y_test, pred_ages_sep, average='macro'))

print()
print("genero:", end=" ")
print(sum(gender_macro)/len(gender_macro))
print("juntos:", end=" ")
print(sum(ages_macro)/len(ages_macro))
print("só fem:", end=" ")
print(sum(ages_fem_macro)/len(ages_fem_macro))
print("só masc:", end=" ")
print(sum(ages_masc_macro)/len((ages_masc_macro)))
print("separado:", end=" ")
print(sum(ages_sep_macro)/len(ages_sep_macro))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


genero: 0.8739190182660759
juntos: 0.5805187869194488
só fem: 0.5987048507571457
só masc: 0.5266044410812212
separado: 0.5857926349353009


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
