## Classifier - Try 1

Classify if a article has the Morality Frame or not using just the article as input.

In [1]:
import os

os.listdir(os.getcwd())

['FRISS_srl.pkl',
 'README.md',
 'notebooks',
 'chunks.pkl',
 'grid_search_metrics.csv',
 '.git',
 'assets',
 'friss',
 'models',
 '.ipynb_checkpoints',
 'data',
 '.gitignore',
 'frameaxis']

In [2]:
labels_path = "data/data/en/train-labels-subtask-2.txt"
articles_path = "data/data/en/train-articles-subtask-2/"

In [3]:
import pandas as pd

# Read the dev-labels-subtask-2.txt file
labels_df = pd.read_csv(labels_path, sep="\t")

# Rename the columns for easier processing
labels_df.columns = ["article_id", "frames"]


labels_df.head()

Unnamed: 0,article_id,frames
0,832959523,"Morality,Security_and_defense,Policy_prescript..."
1,833039623,"Political,Crime_and_punishment,External_regula..."
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq..."
3,814777937,"Political,Morality,Fairness_and_equality,Exter..."
4,821744708,"Policy_prescription_and_evaluation,Political,L..."


In [4]:
# A function to read the article text given its ID
def get_article_content(article_id):
    try:
        with open(f"{articles_path}/article{article_id}.txt", "r") as f:
            return f.read()
    except FileNotFoundError:
        return None

df = labels_df

# Apply the function to get the article content
df["content"] = df["article_id"].apply(get_article_content)

# Drop rows where content could not be found
df.dropna(subset=["content"], inplace=True)

df.head()


Unnamed: 0,article_id,frames,content
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...


In [10]:
# Split the frames column into a list of frames
df["frames_list"] = df["frames"].str.split(",")

# create for each frame a new column with the frame as name and 1 if the frame is present in the article and 0 if not
for frame in df["frames_list"].explode().unique():
    df[frame] = df["frames_list"].apply(lambda x: 1 if frame in x else 0)

df.head()

Unnamed: 0,article_id,frames,content,frames_list,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...,"[Morality, Security_and_defense, Policy_prescr...",1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...,"[Political, Crime_and_punishment, External_reg...",0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...,"[Political, Crime_and_punishment, Fairness_and...",0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...,"[Political, Morality, Fairness_and_equality, E...",1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...,"[Policy_prescription_and_evaluation, Political...",0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [27]:
df.head()

Unnamed: 0,article_id,frames,content,frames_list,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...,"[Morality, Security_and_defense, Policy_prescr...",1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...,"[Political, Crime_and_punishment, External_reg...",0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...,"[Political, Crime_and_punishment, Fairness_and...",0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...,"[Political, Morality, Fairness_and_equality, E...",1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...,"[Policy_prescription_and_evaluation, Political...",0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [32]:
frame_columns = ['Morality', 'Security_and_defense', 'Policy_prescription_and_evaluation', 
                     'Legality_Constitutionality_and_jurisprudence', 'Economic', 'Political', 
                     'Crime_and_punishment', 'External_regulation_and_reputation', 'Public_opinion', 
                     'Fairness_and_equality', 'Capacity_and_resources', 'Quality_of_life', 
                     'Cultural_identity', 'Health_and_safety']

frames_count = df[frame_columns].T

In [33]:
frames_count.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,422,423,424,425,426,427,428,429,430,431
Morality,1,0,0,1,0,0,0,1,1,0,...,1,0,1,0,0,0,1,0,0,0
Security_and_defense,1,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Policy_prescription_and_evaluation,1,1,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
Legality_Constitutionality_and_jurisprudence,1,1,1,0,1,1,0,1,1,0,...,1,1,1,1,1,1,0,1,1,1
Economic,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [35]:
# each row is a frame and each col an article. If the frame is present in the article the value is 1, if not 0
frames_count["total"] = frames_count.sum(axis=1)
frames_count = frames_count["total"]

In [36]:
# return df with absolut and relativ frequency of frames
frames_count = pd.DataFrame(frames_count)
frames_count.columns = ["absolut"]
frames_count["relativ"] = frames_count["absolut"] / len(df)
# formazt the relativ frequency with %
frames_count["relativ"] = frames_count["relativ"].apply(lambda x: "{:.2%}".format(x))
frames_count.sort_values(by="absolut", ascending=False)

frames_count

Unnamed: 0,absolut,relativ
Morality,202,46.76%
Security_and_defense,187,43.29%
Policy_prescription_and_evaluation,65,15.05%
Legality_Constitutionality_and_jurisprudence,202,46.76%
Economic,28,6.48%
Political,234,54.17%
Crime_and_punishment,227,52.55%
External_regulation_and_reputation,120,27.78%
Public_opinion,23,5.32%
Fairness_and_equality,114,26.39%
