In [6]:
import os.path as osp
import pandas as pd
import numpy as np

from glob import glob
import sys
import importlib as imp
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
import json
import pickle as pc
from sklearn.feature_extraction import DictVectorizer


## Features

In [4]:
X_file = '../../external/transcendent/extended-features-X.json'
y_file = '../../external/transcendent/extended-features-y.json'
meta_file = '../../external/transcendent/extended-features-meta.json'

vectorizer = DictVectorizer(sparse=True, dtype=int)
with open(X_file, 'r') as infile:
    X=json.load(infile)
    X_t = vectorizer.fit_transform(X)
    
with open(y_file, 'r') as infile:
    y=json.load(infile)
    y = np.array(y)
    
with open(meta_file, 'r') as infile:
    meta=json.load(infile)
    meta_df = pd.DataFrame(meta)
    

In [None]:
X_vec = []
for data_point in X:
    features = {}
    for feature_name, feature_value in data_point.items():
        # Convert the feature to binary representation (e.g., 0 or 1)
        features[feature_name] = int(feature_value is not None)

    X_vec.append(features)
X_t = vectorizer.fit_transform(X_vec)


##Â Classifier

In [19]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

In [20]:
d1_train, d1_test, d2_train, d2_test, d_u = pc.load(open('all_indices.pkl', 'rb'))

In [21]:
X_train = X_t[d1_train]
y_train = y[d1_train]
X_test = X_t[d1_test]
y_test = y[d1_test]

classifier = LinearSVC()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.92




In [22]:
X_train = X_t[d2_train]
y_train = y[d2_train]
X_test = X_t[d2_test]
y_test = y[d2_test]

classifier = LinearSVC()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.985




In [24]:
# this part is for cross validation for concept drift
train_indices = meta_df[meta_df['year2']<2016].index

X_train = X_t[train_indices]
y_train = y[train_indices]

# Single pass
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# classifier = LinearSVC()
# classifier.fit(X_train, y_train)
# y_val_pred = classifier.predict(X_val)
# accuracy = accuracy_score(y_val, y_val_pred)
# print(f"Validation accuracy: {accuracy}")

# Cross validation
classifier = LinearSVC()
cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)
mean_accuracy = np.mean(cv_scores)
print(f"Mean Cross-Validation Accuracy: {mean_accuracy}")
classifier.fit(X_train, y_train)

for year in [2016, 2017, 2018]:
    test_indices = meta_df[~(meta_df['year2'] == year)].index
    X_test = X_t[test_indices]
    y_test = y[test_indices]
    y_pred = classifier.predict(X_test)

    # Calculate the accuracy of the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{year} accuracy: {accuracy}")



Mean Cross-Validation Accuracy: 0.9820619823073992




2016 accuracy: 0.9406888660029752
2017 accuracy: 0.955413216684893
2018 accuracy: 0.9594267955801105


In [75]:
train_indices = meta_df[(meta_df['google'] == 1) & (meta_df['china'] == 0)].index

X_train = X_t[train_indices]
y_train = y[train_indices]

# Single pass
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# classifier = LinearSVC()
# classifier.fit(X_train, y_train)
# y_val_pred = classifier.predict(X_val)
# accuracy = accuracy_score(y_val, y_val_pred)
# print(f"Validation accuracy: {accuracy}")

# Cross validation
classifier = LinearSVC()
cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)
mean_accuracy = np.mean(cv_scores)
print(f"Mean Cross-Validation Accuracy: {mean_accuracy}")
classifier.fit(X_train, y_train)

test_indices1 = meta_df[(meta_df['google'] == 0) & (meta_df['china'] == 1)].index
test_indices2 = meta_df[(meta_df['google'] == 1) & (meta_df['china'] == 1)].index
for test_indices in [test_indices1, test_indices2]:
    X_test = X_t[test_indices]
    y_test = y[test_indices]

    # Make predictions on the test data
    y_pred = classifier.predict(X_test)

    # Calculate the accuracy of the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")



Mean Cross-Validation Accuracy: 0.9834963481826335




Accuracy: 0.7024722932651322
Accuracy: 0.9709090909090909
