In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import nltk
import numpy as np
from sklearn.model_selection import GridSearchCV
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alici\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def remove_stopwords(text):
    all_words = text.split(" ")
    clean_text = [i for i in all_words if i not in stopwords and i!=""]
    return " ".join(clean_text)

In [3]:
def remove_html_tags(text):
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [4]:
def lower_texts(text):
    return text.lower()

In [5]:
def clean_text(text):
    clean_text = remove_stopwords(text)
    clean_text = remove_html_tags(clean_text)
    clean_text = lower_texts(clean_text)
    return clean_text

In [6]:
def formatTime(seg):
    min = seg//60
    seg = seg % 60
    return str(min)+"min"+str(round(seg, 2))+"s"

In [28]:
age = []
age_fem = []
age_masc = []
age_sep = []

for i in range(10):
    df_train = pd.read_csv(r'C:\Users\alici\Documents\tcc\github2\tcc-v2\brmoral\particoes\k_'+str(i+1)+r'\train.csv')
    df_test = pd.read_csv(r'C:\Users\alici\Documents\tcc\github2\tcc-v2\brmoral\particoes\k_'+str(i+1)+r'\test.csv')
    
    X_train = df_train['concat'].apply(clean_text).to_numpy()
    X_test = df_test['concat'].apply(clean_text).to_numpy()
    y_train_gender = df_train['GenderClass']
    y_test_gender = df_test['GenderClass']

    tfidfvec_gender = TfidfVectorizer(max_df=0.8)
    tfidf_train_gender = tfidfvec_gender.fit_transform(X_train)
    tfidf_test_gender = tfidfvec_gender.transform(X_test)

    logisticRegr_gender = LogisticRegression(penalty='l2', C=5714.28, solver='lbfgs', multi_class='multinomial')
    logisticRegr_gender.fit(tfidf_train_gender, y_train_gender)
    pred_gender=logisticRegr_gender.predict(tfidf_test_gender)

    # f1 = metrics.f1_score(y_test_gender, pred_gender, average='macro')
    # f1_list.append(f1)
    # print(f1)

    y_train_age = df_train['AgeClass']
    y_test_age = df_test['AgeClass']

    tfidfvec_age = TfidfVectorizer(max_features = 1000, max_df=0.8)
    tfidf_train_age = tfidfvec_age.fit_transform(X_train)
    tfidf_test_age = tfidfvec_age.transform(X_test)

    logisticRegr_age = LogisticRegression(penalty='l2', C=2857.14, solver='liblinear')
    logisticRegr_age.fit(tfidf_train_age, y_train_age)
    pred_age=logisticRegr_age.predict(tfidf_test_age)

    f1 = metrics.f1_score(y_test_age, pred_age, average='macro')
    age.append(f1)

    df_test["PredictGender"] = pred_gender

    df_train_fem = df_train[df_train["GenderClass"]==1]
    df_train_masc = df_train[df_train["GenderClass"]==0]

    df_test_fem = df_test[df_test["PredictGender"]==1]
    df_test_masc = df_test[df_test["PredictGender"]==0]

    X_train_fem = df_train_fem['concat'].apply(clean_text).to_numpy()
    X_test_fem = df_test_fem['concat'].apply(clean_text).to_numpy()
    y_train_fem = df_train_fem['AgeClass']
    y_test_fem = df_test_fem['AgeClass']

    tfidfvec_fem = TfidfVectorizer(max_features = 1000, max_df=0.8)
    tfidf_train_fem = tfidfvec_fem.fit_transform(X_train_fem)
    tfidf_test_fem = tfidfvec_fem.transform(X_test_fem)

    logisticRegr_fem = LogisticRegression(penalty='l2', C=2857.14, solver='liblinear')
    logisticRegr_fem.fit(tfidf_train_fem, y_train_fem)
    pred_fem=logisticRegr_fem.predict(tfidf_test_fem)

    f1 = metrics.f1_score(y_test_fem, pred_fem, average='macro')
    age_fem.append(f1)

    X_train_masc = df_train_masc['concat'].apply(clean_text).to_numpy()
    X_test_masc = df_test_masc['concat'].apply(clean_text).to_numpy()
    y_train_masc = df_train_masc['AgeClass']
    y_test_masc = df_test_masc['AgeClass']

    tfidfvec_masc = TfidfVectorizer(max_features = 1000, max_df=0.8)
    tfidf_train_masc = tfidfvec_masc.fit_transform(X_train_masc)
    tfidf_test_masc = tfidfvec_masc.transform(X_test_masc)

    logisticRegr_masc = LogisticRegression(penalty='l2', C=2857.14, solver='liblinear')
    logisticRegr_masc.fit(tfidf_train_masc, y_train_masc)
    pred_masc=logisticRegr_masc.predict(tfidf_test_masc)

    f1 = metrics.f1_score(y_test_masc, pred_masc, average='macro')
    age_masc.append(f1)

    pred_ages_sep = pred_fem.tolist() + pred_masc.tolist()
    y_test = y_test_fem.to_list() +  y_test_masc.to_list()
    age_sep.append(metrics.f1_score(y_test, pred_ages_sep, average='macro'))


print()
print(sum(age)/len(age))
print(sum(age_fem)/len(age_fem))
print(sum(age_masc)/len(age_masc))
print(sum(age_sep)/len(age_sep))


0.45951650051155857
0.4202717652717653
0.4704187039928619
0.4803769480102453


In [None]:
# GridSearch para genero
import json
import time

best_param_dict = dict()

start = time.time()

grid = dict()
grid["C"] = (np.logspace(-3,3,7).tolist() + [5714.28]) 
grid["penalty"] = ["l2", "l1", "none"]
best_param_dict["grid_values"] = grid

for i in range(10):
    df_train = pd.read_csv('C:/Users/alici/Documents/tcc/github2/tcc-v2/brmoral/particoes/k_'+str(i+1)+'/train.csv')
    df_test = pd.read_csv('C:/Users/alici/Documents/tcc/github2/tcc-v2/brmoral/particoes/k_'+str(i+1)+'/test.csv')
    
    X_train = df_train['concat'].apply(clean_text).to_numpy()
    X_test = df_test['concat'].apply(clean_text).to_numpy()
    y_train_gender = df_train['GenderClass']
    y_test_gender = df_test['GenderClass']

    tfidfvec_gender = TfidfVectorizer(max_df=0.8)
    tfidf_train_gender = tfidfvec_gender.fit_transform(X_train)
    tfidf_test_gender = tfidfvec_gender.transform(X_test)

    logreg=LogisticRegression(solver='lbfgs', multi_class='multinomial')
    logreg_cv=GridSearchCV(logreg,grid,cv=10, scoring='f1_macro')
    logreg_cv.fit(tfidf_train_gender, y_train_gender)
    
    param_dict = dict()
    param_dict["params"] =  logreg_cv.best_params_
    param_dict["f1"] = logreg_cv.best_score_
    best_param_dict["k_"+str(i+1)] = param_dict

    print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
    print("f1 :",logreg_cv.best_score_)

end = time.time()
best_param_dict["time"] = formatTime(end-start)

with open('brmoral/gender.json', 'w') as f:
    json.dump(best_param_dict, f, indent=4)
    

In [10]:
with open('brmoral/gender.json', encoding='utf-8') as f:
    gender_param = json.load(f)

In [15]:
# GridSearch para idade
import json
import time

best_param_dict = dict()

start = time.time()

grid = dict()
grid["C"] = (np.logspace(-3,3,7).tolist() + [2857.14])
grid["penalty"] = ["l1", "l2"]
best_param_dict["grid_values"] = grid

for i in range(10):
    df_train = pd.read_csv('C:/Users/alici/Documents/tcc/github2/tcc-v2/brmoral/particoes/k_'+str(i+1)+'/train.csv')
    df_test = pd.read_csv('C:/Users/alici/Documents/tcc/github2/tcc-v2/brmoral/particoes/k_'+str(i+1)+'/test.csv')
    
    X_train = df_train['concat'].apply(clean_text).to_numpy()
    X_test = df_test['concat'].apply(clean_text).to_numpy()

    y_train_age = df_train['AgeClass']
    y_test_age = df_test['AgeClass']

    tfidfvec_age = TfidfVectorizer(max_features = 1000, max_df=0.8)
    tfidf_train_age = tfidfvec_age.fit_transform(X_train)
    tfidf_test_age = tfidfvec_age.transform(X_test)

    logreg=LogisticRegression(solver='liblinear')
    logreg_cv=GridSearchCV(logreg,grid,cv=10, scoring='f1_macro')
    logreg_cv.fit(tfidf_test_age, y_test_age)
    
    param_dict = dict()
    param_dict["params"] =  logreg_cv.best_params_
    param_dict["f1"] = logreg_cv.best_score_
    best_param_dict["k_"+str(i+1)] = param_dict

    print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
    print("f1 :",logreg_cv.best_score_)

end = time.time()
best_param_dict["time"] = formatTime(end-start)

with open('brmoral/age.json', 'w') as f:
    json.dump(best_param_dict, f, indent=4)
    



tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
f1 : 0.24888888888888888
tuned hpyerparameters :(best parameters)  {'C': 2857.14, 'penalty': 'l1'}
f1 : 0.28253968253968254




tuned hpyerparameters :(best parameters)  {'C': 2857.14, 'penalty': 'l2'}
f1 : 0.40444444444444444




tuned hpyerparameters :(best parameters)  {'C': 2857.14, 'penalty': 'l1'}
f1 : 0.36682539682539683




tuned hpyerparameters :(best parameters)  {'C': 2857.14, 'penalty': 'l1'}
f1 : 0.5222222222222223




tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l2'}
f1 : 0.325079365079365




tuned hpyerparameters :(best parameters)  {'C': 1000.0, 'penalty': 'l2'}
f1 : 0.27222222222222225




tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l1'}
f1 : 0.35777777777777786




tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l1'}
f1 : 0.2923809523809524




tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
f1 : 0.3077777777777778


In [16]:
# GridSearch para fem
import json
import time

best_param_dict = dict()

start = time.time()

grid = dict()
grid["C"] = (np.logspace(-3,3,7).tolist() + [2857.14])
grid["penalty"] = ["l1", "l2"]
best_param_dict["grid_values"] = grid

for i in range(10):
    df_train = pd.read_csv('C:/Users/alici/Documents/tcc/github2/tcc-v2/brmoral/particoes/k_'+str(i+1)+'/train.csv')
    df_test = pd.read_csv('C:/Users/alici/Documents/tcc/github2/tcc-v2/brmoral/particoes/k_'+str(i+1)+'/test.csv')

    X_train = df_train['concat'].apply(clean_text).to_numpy()
    X_test = df_test['concat'].apply(clean_text).to_numpy()
    y_train_gender = df_train['GenderClass']
    y_test_gender = df_test['GenderClass']

    tfidfvec_gender = TfidfVectorizer(max_df=0.8)
    tfidf_train_gender = tfidfvec_gender.fit_transform(X_train)
    tfidf_test_gender = tfidfvec_gender.transform(X_test)

    logisticRegr_gender = LogisticRegression(penalty=gender_param["k_"+str(i+1)]["params"]["penalty"], C=gender_param["k_"+str(i+1)]["params"]["C"], solver='lbfgs', multi_class='multinomial')
    logisticRegr_gender.fit(tfidf_train_gender, y_train_gender)
    pred_gender=logisticRegr_gender.predict(tfidf_test_gender)
    
    df_test["PredictGender"] = pred_gender

    df_train_fem = df_train[df_train["GenderClass"]==1]
    df_test_fem = df_test[df_test["PredictGender"]==1]

    X_train_fem = df_train_fem['concat'].apply(clean_text).to_numpy()
    X_test_fem = df_test_fem['concat'].apply(clean_text).to_numpy()
    y_train_fem = df_train_fem['AgeClass']
    y_test_fem = df_test_fem['AgeClass']

    tfidfvec_fem = TfidfVectorizer(max_features = 1000, max_df=0.8)
    tfidf_train_fem = tfidfvec_fem.fit_transform(X_train_fem)
    tfidf_test_fem = tfidfvec_fem.transform(X_test_fem)

    logreg=LogisticRegression(solver='liblinear')
    logreg_cv=GridSearchCV(logreg,grid,cv=10, scoring='f1_macro')
    logreg_cv.fit(tfidf_train_fem, y_train_fem)
    
    param_dict = dict()
    param_dict["params"] =  logreg_cv.best_params_
    param_dict["f1"] = logreg_cv.best_score_
    best_param_dict["k_"+str(i+1)] = param_dict

    print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
    print("f1 :",logreg_cv.best_score_)

end = time.time()
best_param_dict["time"] = formatTime(end-start)

with open('brmoral/age_fem.json', 'w') as f:
    json.dump(best_param_dict, f, indent=4)
    

tuned hpyerparameters :(best parameters)  {'C': 1000.0, 'penalty': 'l2'}
f1 : 0.4087347837347838
tuned hpyerparameters :(best parameters)  {'C': 1000.0, 'penalty': 'l1'}
f1 : 0.4060814185814186




tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
f1 : 0.4339519739519739
tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l1'}
f1 : 0.42203463203463204




tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
f1 : 0.3918275950628892




tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l2'}
f1 : 0.39372625196154604




tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
f1 : 0.47868141118141116




tuned hpyerparameters :(best parameters)  {'C': 2857.14, 'penalty': 'l1'}
f1 : 0.41926776926776926




tuned hpyerparameters :(best parameters)  {'C': 1.0, 'penalty': 'l2'}
f1 : 0.38539735777351875




tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
f1 : 0.43689086512615927


In [21]:
# GridSearch para masc
import json
import time

best_param_dict = dict()

start = time.time()

grid = dict()
grid["C"] = (np.logspace(-3,3,7).tolist() + [2857.14])
grid["penalty"] = ["l1", "l2"]
best_param_dict["grid_values"] = grid

for i in range(10):
    df_train = pd.read_csv('C:/Users/alici/Documents/tcc/github2/tcc-v2/brmoral/particoes/k_'+str(i+1)+'/train.csv')
    df_test = pd.read_csv('C:/Users/alici/Documents/tcc/github2/tcc-v2/brmoral/particoes/k_'+str(i+1)+'/test.csv')

    X_train = df_train['concat'].apply(clean_text).to_numpy()
    X_test = df_test['concat'].apply(clean_text).to_numpy()
    y_train_gender = df_train['GenderClass']
    y_test_gender = df_test['GenderClass']

    tfidfvec_gender = TfidfVectorizer(max_df=0.8)
    tfidf_train_gender = tfidfvec_gender.fit_transform(X_train)
    tfidf_test_gender = tfidfvec_gender.transform(X_test)

    logisticRegr_gender = LogisticRegression(penalty=gender_param["k_"+str(i+1)]["params"]["penalty"], C=gender_param["k_"+str(i+1)]["params"]["C"], solver='lbfgs', multi_class='multinomial')
    logisticRegr_gender.fit(tfidf_train_gender, y_train_gender)
    pred_gender=logisticRegr_gender.predict(tfidf_test_gender)
    
    df_test["PredictGender"] = pred_gender

    df_train_masc = df_train[df_train["GenderClass"]==0]
    df_test_masc = df_test[df_test["PredictGender"]==0]

    X_train_masc = df_train_masc['concat'].apply(clean_text).to_numpy()
    X_test_masc = df_test_masc['concat'].apply(clean_text).to_numpy()
    y_train_masc = df_train_masc['AgeClass']
    y_test_masc = df_test_masc['AgeClass']

    tfidfvec_masc = TfidfVectorizer(max_features = 1000, max_df=0.8)
    tfidf_train_masc = tfidfvec_masc.fit_transform(X_train_masc)
    tfidf_test_masc = tfidfvec_masc.transform(X_test_masc)

    logreg=LogisticRegression(solver='liblinear')
    logreg_cv=GridSearchCV(logreg,grid,cv=10, scoring='f1_macro')
    logreg_cv.fit(tfidf_train_masc, y_train_masc)
    
    param_dict = dict()
    param_dict["params"] =  logreg_cv.best_params_
    param_dict["f1"] = logreg_cv.best_score_
    best_param_dict["k_"+str(i+1)] = param_dict

    print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
    print("f1 :",logreg_cv.best_score_)

end = time.time()
best_param_dict["time"] = formatTime(end-start)

with open('brmoral/age_masc.json', 'w') as f:
    json.dump(best_param_dict, f, indent=4)
    

tuned hpyerparameters :(best parameters)  {'C': 1000.0, 'penalty': 'l1'}
f1 : 0.4882407832485951
tuned hpyerparameters :(best parameters)  {'C': 2857.14, 'penalty': 'l1'}
f1 : 0.4316928539062294




tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
f1 : 0.43477982073473803
tuned hpyerparameters :(best parameters)  {'C': 1000.0, 'penalty': 'l2'}
f1 : 0.4264199152809553




tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l1'}
f1 : 0.48593073499440437




tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
f1 : 0.47196111269153695




tuned hpyerparameters :(best parameters)  {'C': 2857.14, 'penalty': 'l2'}
f1 : 0.42196352578213264




tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l2'}
f1 : 0.4408983203831306




tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l2'}
f1 : 0.4178676904933777




tuned hpyerparameters :(best parameters)  {'C': 2857.14, 'penalty': 'l1'}
f1 : 0.45266137638105597


In [14]:
def read_file_as_df(file_name):
    import pandas as pd
    import csv

    import sys
    import pandas as pd

    maxInt = sys.maxsize

    while True:
        # decrease the maxInt value by factor 10 
        # as long as the OverflowError occurs.

        try:
            csv.field_size_limit(maxInt)
            break
        except OverflowError:
            maxInt = int(maxInt/10)

    file = []
    col = []

    with open(file_name) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';')
        line_count = 0
        for row in csv_reader:
            if line_count==0:
                for r in row:
                    col.append(r)
                line_count+=1
            else:
                line = []
                for r in row:
                    line.append(r)
                file.append(line)
                line_count += 1

            
    df = pd.DataFrame(file, columns = col)
    return df

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
import time

start = time.time()

grid = dict()
grid["C"] = np.logspace(-3,3,7).tolist()
grid["penalty"] = ["l1", "l2"]

df = read_file_as_df(r'C:\Users\alici\Documents\tcc\github2\tcc-v2\brmoral\brMoral.csv')
df['CleanText'] = df['concat'].apply(clean_text)
X = df['CleanText'].to_numpy()
y = df["gender"].apply(genderGroups)
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)
tfvec = TfidfVectorizer(max_df=0.8)
tfvec.fit(X_train)
tdf_train = tfvec.transform(X_train).toarray().tolist()
tdf_test = tfvec.transform(X_test).toarray().tolist()
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg, grid, cv=10, scoring='f1_macro')
logreg_cv.fit(tdf_train,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

NameError: name 'genderGroups' is not defined

In [None]:
def genderGroups(text):
    if(text == "f"):
        return 1
    else:
        return 0