In [1]:
import os
import tqdm
import ppscore
import numpy as np
import pandas as pd

In [2]:
# Directory holding the public pair files and their ground-truth labels.
path = "../data/causal/public/"
# Listing of that directory; not referenced again in the visible cells.
items = os.listdir(path)

In [3]:
# Load the labels into a single "label" column. Because names= is given
# without header=, pandas treats every row of the file as data.
# Later cells index these labels positionally alongside the feature rows, so
# row i is used as the label for pair file {i}.csv.
# NOTE(review): confirm the CSV has no header row and is in pair-file order.
df = pd.read_csv("../data/causal/public/ground_truth.csv", names=["label"])

In [4]:
# Quick visual check of the label frame.
df

Unnamed: 0,label
0,1
1,1
2,0
3,1
4,1
...,...
995,0
996,0
997,1
998,0


In [21]:
# Number of public pair files; they are named 0.csv .. 999.csv.
N_PUBLIC_PAIRS = 1000


def pair_features(frame):
    """Return the 13 raw features describing one (x, y) pair.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with columns "x" and "y".

    Returns
    -------
    tuple
        In order: the ppscore / baseline_score / model_score entries of
        ``ppscore.score`` for x -> y, the same three for y -> x, the
        x->y minus y->x differences (ordered ppscore, model_score,
        baseline_score), then mean and std of x and mean and std of y.
    """
    forward = ppscore.score(frame, "x", "y")
    backward = ppscore.score(frame, "y", "x")

    return (
        forward["ppscore"],
        forward["baseline_score"],
        forward["model_score"],
        backward["ppscore"],
        backward["baseline_score"],
        backward["model_score"],
        # Note the order of the differences: ppscore, model, baseline.
        forward["ppscore"] - backward["ppscore"],
        forward["model_score"] - backward["model_score"],
        forward["baseline_score"] - backward["baseline_score"],
        frame["x"].mean(),
        frame["x"].std(),
        frame["y"].mean(),
        frame["y"].std(),
    )


def load_pair_features(directory, n_files):
    """Compute ``pair_features`` for ``0.csv`` .. ``{n_files - 1}.csv`` in ``directory``.

    Each file is read without a header row and its two columns are named
    "x" and "y". Returns a list with one feature tuple per file, in file order.
    """
    rows = []
    for i in tqdm.trange(n_files):
        frame = pd.read_csv(os.path.join(directory, f"{i}.csv"), names=["x", "y"])
        rows.append(pair_features(frame))
    return rows


# Reuse the `path` defined earlier instead of re-typing the directory literal.
dataset = load_pair_features(path, N_PUBLIC_PAIRS)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:14<00:00, 71.06it/s]


In [36]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

In [22]:
# Expand the raw feature rows with scikit-learn's PolynomialFeatures (default
# settings). The fitted `poly` is reused later to transform the private set.
# NOTE(review): this rebinds `dataset` to its own expansion, so running the
# cell a second time expands the already-expanded matrix. Rebuild the raw
# `dataset` first if this cell has to be re-run.
poly = PolynomialFeatures()
dataset = poly.fit_transform(X=dataset)

In [23]:
# Sanity check: the 13 raw features become 105 columns after the default
# (degree-2, with bias) polynomial expansion.
dataset.shape

(1000, 105)

In [13]:
from sklearn.model_selection import KFold

In [39]:
# 10-fold cross-validated accuracy of a single decision tree on the expanded
# features. KFold is left unshuffled, so each fold is a contiguous index range.
kf = KFold(n_splits=10)
labels = df["label"].to_numpy()  # extracted once instead of on every fold
fold_scores = []

for train_index, test_index in kf.split(dataset):
    X_train, X_test = dataset[train_index], dataset[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Fixed seed: the tree breaks ties between equally good splits at random,
    # so without it the reported accuracy can change from run to run.
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)

    # score() is the mean accuracy on the held-out fold.
    fold_scores.append(clf.score(X_test, y_test))

print(np.mean(fold_scores))

0.8630000000000001


In [46]:
# 10-fold cross-validated accuracy of AdaBoost (100 estimators) on the expanded
# features. KFold is left unshuffled, so each fold is a contiguous index range.
kf = KFold(n_splits=10)
labels = df["label"].to_numpy()  # extracted once instead of on every fold
fold_scores = []

for train_index, test_index in kf.split(dataset):
    X_train, X_test = dataset[train_index], dataset[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Fixed seed so the boosted ensemble, and hence the reported accuracy,
    # is reproducible across runs.
    clf = AdaBoostClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    # score() is the mean accuracy on the held-out fold.
    fold_scores.append(clf.score(X_test, y_test))

print(np.mean(fold_scores))

0.898


In [47]:
# Final model: AdaBoost with 100 estimators, refit on every public sample.
# Seeded so the fitted model, and therefore the submission built from it,
# is reproducible across runs.
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(dataset, df["label"].to_numpy())

AdaBoostClassifier(n_estimators=100)

In [48]:
testing = []

# Build the 13-value feature row for each private pair file, using the same
# layout as the public feature rows.
for idx in tqdm.trange(1000):
    frame = pd.read_csv(f"../data/causal/private/{idx}.csv", names=["x", "y"])
    x_to_y = ppscore.score(frame, "x", "y")
    y_to_x = ppscore.score(frame, "y", "x")

    # Directional scores first (x -> y, then y -> x) ...
    row = [x_to_y[key] for key in ("ppscore", "baseline_score", "model_score")]
    row += [y_to_x[key] for key in ("ppscore", "baseline_score", "model_score")]
    # ... then their differences, ordered ppscore, model, baseline ...
    row += [
        x_to_y[key] - y_to_x[key]
        for key in ("ppscore", "model_score", "baseline_score")
    ]
    # ... and finally mean/std of x followed by mean/std of y.
    for column in ("x", "y"):
        row += [frame[column].mean(), frame[column].std()]

    testing.append(tuple(row))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:14<00:00, 69.73it/s]


In [49]:
# Apply the already-fitted polynomial expansion to the private feature rows,
# then classify them with the final model.
testing_poly = poly.transform(testing)
pred = clf.predict(testing_poly)

In [50]:
# One-column frame of predicted labels; rows keep the default positional index.
submission = pd.DataFrame({"direction": pred})

In [51]:
# Turn the positional index into an explicit "id" column, giving the
# two-column layout (id, direction).
# Selecting only "direction" and resetting to a fresh positional index first
# makes the cell idempotent: re-running it no longer stacks a second "id"
# column onto the frame.
submission = (
    submission[["direction"]]
    .reset_index(drop=True)
    .rename_axis("id")
    .reset_index()
)

In [52]:
# Write the submission without the DataFrame index; the explicit "id" column
# already carries the row number.
# NOTE(review): "ploy" in the filename looks like a typo for "poly"; left as-is
# in case anything downstream expects this exact path.
submission.to_csv("../predictions/t4_ppscore_ploy.csv", index=False)