Skip to content

Commit

Permalink
fixing bugs (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
rezaBarzgar committed Feb 14, 2023
1 parent 75d1d0d commit f9df87d
Showing 1 changed file with 21 additions and 19 deletions.
40 changes: 21 additions & 19 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def read_xml(xmlfile, tagged_msgs, predators):
'text': '' if body.text is None else body.text,
'tagged_msg': 0 if tagged_msgs.loc[(tagged_msgs['conv_id'] == conv.get('id')) & (tagged_msgs['line'] == int(msg.get('line')))].empty else 1,
'tagged_conv': 0 if tagged_msgs.loc[tagged_msgs['conv_id'] == conv.get('id')].empty else 1,
'tagged_predator': None if predators.empty else (1 if author.text in predators else 0),
'tagged_predator': None if predators.empty else (1 if len(predators[predators['tagged_pred'] == author.text]) > 0 else 0),

This comment has been minimized.

Copy link
@hosseinfani

hosseinfani Feb 15, 2023

Member

@rezaBarzgar
Logically, I see no difference. What was the problem with the previous line of code?

This comment has been minimized.

Copy link
@rezaBarzgar

rezaBarzgar Feb 15, 2023

Author Member

@hosseinfani
Because predators is a DataFrame, author.text in predators checks whether the DataFrame has a column named after the author, not whether the author's ID appears among the values. Alternatively, we can replace it with the following statement, which also works:
author.text in predators['tagged_pred'].tolist().

However, the main bug was on line 134; see that comment too.

This comment has been minimized.

Copy link
@hosseinfani

hosseinfani Feb 15, 2023

Member

I like this one better, more readable: author.text in predators['tagged_pred'].tolist()

}
dictionary_list.append(row)
return df.from_dict(dictionary_list)
Expand All @@ -83,7 +83,7 @@ def get_stats(data):
"""_summary_
Args:
data: the input is dataframe which is the output of read_xml() func
data: the input is dataframe
Returns:
_type_: _description_
Expand Down Expand Up @@ -131,24 +131,26 @@ def get_stats(data):
test_predator_id_file = f'{datapath}test/pan12-sexual-predator-identification-groundtruth-problem1.txt'
test_tagged_msgs_file = f'{datapath}test/pan12-sexual-predator-identification-groundtruth-problem2.txt'

df_train = read_xml(training_file, pd.read_csv(training_tagged_msgs_file, names=['conv_id', 'line'], sep='\t'), pd.read_csv(training_predator_id_file))
df_test = read_xml(test_file, pd.read_csv(test_tagged_msgs_file, names=['conv_id', 'line'], sep='\t'), pd.read_csv(test_predator_id_file))
df_train = read_xml(training_file, pd.read_csv(training_tagged_msgs_file, names=['conv_id', 'line'], sep='\t'), pd.read_csv(training_predator_id_file, header=None, names=['tagged_pred']))

This comment has been minimized.

Copy link
@rezaBarzgar

rezaBarzgar Feb 15, 2023

Author Member

@hosseinfani
The main bug was in reading the predators' IDs from the file. Since the file has no header row, pandas assumes the first line holds the column names. As a result, we lose the first predator's ID. I fixed this bug by adding header=None, names=['tagged_pred'].

This comment has been minimized.

Copy link
@hosseinfani

hosseinfani Feb 15, 2023

Member

I think you have to do the same for the tagged_msgs_file, right?

This comment has been minimized.

Copy link
@hosseinfani

hosseinfani Feb 15, 2023

Member

@rezaBarzgar
Thanks btw.

This comment has been minimized.

Copy link
@rezaBarzgar

rezaBarzgar Feb 15, 2023

Author Member

@hosseinfani
Yes, I will edit that too.

df_train.to_csv(f"{datapath}train.csv")
df_test = read_xml(test_file, pd.read_csv(test_tagged_msgs_file, names=['conv_id', 'line'], sep='\t'), pd.read_csv(test_predator_id_file, header=None, names=['tagged_pred']))
df_test.to_csv(f"{datapath}test.csv")
df_train_test = pd.concat([df_train, df_test])


# text_feature_sets = [["w2v_glove","prv_cat","nauthors", "time","count", "msg_line"]]
# Baselines = [msg_classifier()]#text_features, [len(df_train), len(df_test)], relabeling, df_train_test)]#, conv_msg_classifier(relabeling)]
#
# for text_feature_set in text_feature_sets:
# text_feature_set_str = '.'.join(text_feature_set)
# text_features = ef.extract_load_text_features(df_train_test, text_feature_set, f'../output/{text_feature_set_str}.npz')
#
# for baseline in Baselines:
# baseline.main(df_train_test, text_features, "../output/", text_feature_set_str)
#
# # 'tagged_msg': original labels (conv, msg_line) only available for test set
# # 'tagged_predator_bc': if conv has at least one predator, all the msgs of the conv are tagged
# # 'tagged_msg_bc': if conv has at least one tagged msg, all the msgs of the conv are tagged
# relabeling = ['tagged_msg', 'tagged_predator', 'tagged_conv']
#
text_feature_sets = [["w2v_glove","prv_cat","nauthors", "time","count", "msg_line"]]
Baselines = [msg_classifier()]#text_features, [len(df_train), len(df_test)], relabeling, df_train_test)]#, conv_msg_classifier(relabeling)]

for text_feature_set in text_feature_sets:
text_feature_set_str = '.'.join(text_feature_set)
text_features = ef.extract_load_text_features(df_train_test, text_feature_set, f'../output/{text_feature_set_str}.npz')

for baseline in Baselines:
baseline.main(df_train_test, text_features, "../output/", text_feature_set_str)

# 'tagged_msg': original labels (conv, msg_line) only available for test set
# 'tagged_predator_bc': if conv has at least one predator, all the msgs of the conv are tagged
# 'tagged_msg_bc': if conv has at least one tagged msg, all the msgs of the conv are tagged
relabeling = ['tagged_msg', 'tagged_predator', 'tagged_conv']

#

0 comments on commit f9df87d

Please sign in to comment.