# Starter notebook
### This is a notebook to get you started on the Correlaid X Challenge together with Buildup.
It expects you to copy the data (`phoenix_tensions.csv` and `phoenix_tensions_appendix.csv`) into the same folder as this notebook.

In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

### Get data

In [3]:
# Main challenge data; the file must sit in the same directory as this notebook.
TENSIONS_CSV = "phoenix_tensions.csv"

# The appendix holds extra information you may use, but none of it is guaranteed
# to be present in the holdout set. It also contains the remaining tensions,
# which do not need to be predicted.
APPENDIX_CSV = "phoenix_tensions_appendix.csv"

df_raw = pd.read_csv(TENSIONS_CSV)
df_appendix = pd.read_csv(APPENDIX_CSV)

# The final notebook or model is expected to accept a test_df loaded from the
# holdout file instead of the one produced by train_test_split:
# test_df = pd.read_csv("phoenix_tensions_holdout.csv")

In [4]:
# Carve out our own validation set (20% of the rows) so the model's performance
# can be compared on data it has not been trained on. The fixed random_state
# keeps the split identical across runs.
df, test_df = train_test_split(df_raw, random_state=2021, test_size=0.2)

In [5]:
# Target columns, in the order the models must emit their predictions.
# The pattern is anchored to the "is_" prefix: a plain substring match
# (like="is_") would also select any column that merely contains "is_".
COL_ORDER = df.filter(regex=r"^is_").columns

## Baseline model using the annotations from the training data

This is quite horribly written: `predict` checks every annotated feature against every text, 
which makes it slow, so it should probably be cleaned up.

In [6]:
# Attach the appendix columns to the training rows via their shared object_id.
# The join is an inner join, so only rows present in both frames are kept.
df_annotations = df.merge(df_appendix, how="inner", on="object_id")

In [7]:
TENSIONS = [
    "economic_labour_tension",
    "political_tension",
    "service_related_tension",
    "community_insecurity_tension",
]

# Build a {feature: tension} lookup from the comma-separated
# "<tension>_features" annotation columns of the training data.
# When the same feature is annotated for several tensions, the tension that
# comes last in TENSIONS wins.
feats = {}
for tension in TENSIONS:
    annotated = (
        df_annotations[f"{tension}_features"]
        .dropna()
        # astype(str) keeps the .str accessor usable even when the column is
        # entirely empty (an all-NaN column is read as float64).
        .astype(str)
        .str.split(",")
        .explode()
        # Drop the whitespace around the separators so "a, b" yields "b"
        # rather than " b".
        .str.strip()
    )
    # Skip empty fragments (e.g. from a trailing comma): an empty string is a
    # substring of every text and would flag every row.
    feats.update({feature: tension for feature in annotated if feature})


class Baseline(BaseEstimator, ClassifierMixin):
    """Keyword-lookup baseline: a tension is predicted when one of its features occurs in the text.

    Parameters
    ----------
    features : dict
        Maps a feature string to the name of the tension it indicates.

    Notes
    -----
    Relies on the module-level ``TENSIONS`` (names of the predicted tensions)
    and ``COL_ORDER`` (order of the output columns).
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        """Nothing is learned; the lookup table is supplied at construction time."""
        return self

    def predict(self, X):
        """Return a boolean array with one row per row of ``X`` and one column per ``COL_ORDER`` entry.

        ``X`` must provide a ``text`` column. Missing or non-string texts are
        treated as empty / converted to strings instead of raising.
        """
        text = X["text"].fillna("").astype(str)
        preds = pd.DataFrame(
            False, index=X.index, columns=[f"is_{t}" for t in TENSIONS]
        )
        for feat, tension in self.features.items():
            if not feat:
                # An empty feature is a substring of every text and would
                # flag every single row.
                continue
            # Literal, case-sensitive substring test over the whole column;
            # the positional mask keeps this correct for non-unique indexes.
            matched = text.str.contains(feat, regex=False).to_numpy()
            preds.loc[matched, f"is_{tension}"] = True
        return preds[COL_ORDER].values

In [8]:
def test(model, df):
    """Return the macro-averaged F1 score of ``model`` on ``df``.

    The model's predictions for ``df`` are compared against the label columns
    of ``df`` listed in the module-level ``COL_ORDER``.
    """
    y_pred = model.predict(df)
    y_true = df[COL_ORDER].values
    return metrics.f1_score(y_true, y_pred, average="macro")

In [9]:
model = Baseline(features=feats)

# Macro F1 on the training split first, then on the held-out split.
tuple(test(model, split) for split in (df, test_df))

(0.08444991747911457, 0.0)

## Quick and dirty Random Forest classifier using the topics

In [10]:
class CategoricalColumns(BaseEstimator, TransformerMixin):
    """Quick and dirty one-hot encoder for the ``topics`` column.

    Each ``topics`` value is handled as the string form of a list, e.g.
    ``"['other', 'beirut']"``. ``fit`` collects the distinct topic names and
    ``transform`` produces one 0/1 column per learned topic.
    """

    def fit(self, df, y=None):
        """Learn the distinct topic names found in ``df["topics"]``.

        The names are stored in ``self.columns`` in order of first appearance.
        Missing values and empty names are ignored.
        """
        exploded = self._to_array(df["topics"]).explode().dropna()
        names = (name.strip() for name in exploded)
        # dict.fromkeys de-duplicates while keeping first-seen order; names can
        # repeat once stripped (e.g. "beirut" and " beirut").
        self.columns = list(dict.fromkeys(name for name in names if name))
        return self

    def transform(self, df):
        """Return a DataFrame with one 0/1 column per topic learned in ``fit``.

        A cell is 1 only when the row's topic list contains exactly that topic.
        Matching whole names avoids treating topic names as regular expressions
        and avoids hits on topics that merely contain another topic's name.
        Rows with a missing ``topics`` value are encoded as all zeros.
        """
        row_topics = self._to_array(df["topics"]).map(
            lambda items: {item.strip() for item in items}
            if isinstance(items, list)
            else set()
        )
        out = {}
        for col in self.columns:
            # col=col binds the current topic name at definition time.
            out[col] = row_topics.map(
                lambda present, col=col: col in present
            ).astype(int)
        return pd.DataFrame(out)

    def _to_array(self, series):
        """Turn list-like strings such as "['a', 'b']" into lists of topic names.

        Square brackets and single quotes are removed before splitting on
        commas; the resulting items may still carry surrounding whitespace.
        """
        remove = ["[", "]", "'"]
        for char in remove:
            series = series.str.replace(char, "", regex=False)
        return series.str.split(",")

In [11]:
# Preview of the one-hot encoding that CategoricalColumns produces for the training data.
CategoricalColumns().fit(df).transform(df)

Unnamed: 0,other,lebanese army/security forces,religious figure,covid / vaccination,beirut,economic collapse,education,foreign countries,lebanese revolution,north lebanon,...,occupied palestine,false news,call for protest,aid,external affairs/diplomacy,offensive language,international organizations / un,south lebanon,lebanese judiciary,asad regime
1409,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
963,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1450,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1174,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
452,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1152,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1365,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# A class-weight-balanced random forest, wrapped so that one copy is fitted per
# target column (n_jobs=-1 lets the wrapper use all available processors).
forest = RandomForestClassifier(class_weight="balanced", random_state=2021)
multi_target_forest = MultiOutputClassifier(estimator=forest, n_jobs=-1)

# One-hot encode the topics, then classify on the encoded columns.
model = Pipeline(
    steps=[
        ("categories", CategoricalColumns()),
        ("random_forest", multi_target_forest),
    ]
).fit(df, df[COL_ORDER].values)

# Macro F1 on the training split first, then on the held-out split.
(test(model, df), test(model, test_df))

(0.23948309102198564, 0.20484195032733407)