# First tries
## Imports

In [1]:
import os
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import codecs
from os import path
import csv
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

## Load data

In [2]:
# Read training data
with open("train.csv", 'r') as f:
    train_data = f.read().splitlines()

train_hosts = list()
y_train = list()
for row in tqdm(train_data):
    host, label = row.split(",")
    train_hosts.append(host)
    y_train.append(label.lower())

# Read test data
with open("test.csv", 'r') as f:
    test_hosts = f.read().splitlines()

# Create a directed, weighted graph
G = nx.read_weighted_edgelist('edgelist.txt', create_using=nx.DiGraph())

print(G.number_of_nodes())
print(G.number_of_edges())

# Load the textual content of a set of webpages for each host into the dictionary "text". 
# The encoding parameter is required since the majority of our text is french.
text = dict()
filenames = os.listdir('text/text')
for filename in tqdm(filenames):
    with codecs.open(path.join('text/text/', filename), encoding='latin-1') as f: 
        text[filename] = f.read().replace("\n", "").lower()

train_data = list()
for host in tqdm(train_hosts):
    if host in text:
        train_data.append(text[host])
    else:
        train_data.append('')

100%|███████████████████████████████████████████████████████████████████████████████| 2125/2125 [00:00<00:00, 708609.95it/s]


28002
319498


100%|███████████████████████████████████████████████████████████████████████████████| 28003/28003 [00:19<00:00, 1462.62it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2125/2125 [00:00<00:00, 1063718.34it/s]


## Text approach only

In [7]:
# Create the training matrix. Each row corresponds to a web host and each column to a word present in at least 10 web
# hosts and at most 1000 web hosts. The value of each entry in a row is equal to the tf-idf weight of that word in the 
# corresponding web host       

vec = TfidfVectorizer(
    decode_error='ignore', strip_accents='unicode', encoding='latin-1', 
    min_df=10, max_df=1000)
X_train = vec.fit_transform(train_data)

# Get textual content of web hosts of the test set
test_data = list()
for host in tqdm(test_hosts):
    if host in text:
        test_data.append(text[host])
    else:
        test_data.append('')

# Create the test matrix following the same approach as in the case of the training matrix
X_test = vec.transform(test_data)

print("Train matrix dimensionality: ", X_train.shape)
print("Test matrix dimensionality: ", X_test.shape)

# Use logistic regression to classify the webpages of the test set
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000, verbose=5)
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)

# Write predictions to a file
with open('text_baseline.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Host")
    writer.writerow(lst)
    for i,test_host in enumerate(test_hosts):
        lst = y_pred[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)

100%|█████████████████████████████████████████████████████████████████████████████████| 560/560 [00:00<00:00, 560307.79it/s]


Train matrix dimensionality:  (2125, 21384)
Test matrix dimensionality:  (560, 21384)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.8min finished


## Graph approach only

In [9]:
# Create the training matrix. Each row corresponds to a web host.
# Use the following 3 features for each web host (unweighted degrees)
# (1) out-degree of node
# (2) in-degree of node
# (3) average degree of neighborhood of node
X_train = np.zeros((len(train_hosts), 3))
avg_neig_deg = nx.average_neighbor_degree(G, nodes=train_hosts)
for i in range(len(train_hosts)):
    X_train[i,0] = G.in_degree(train_hosts[i])
    X_train[i,1] = G.out_degree(train_hosts[i])
    X_train[i,2] = avg_neig_deg[train_hosts[i]]

# Create the test matrix. Use the same 3 features as above
X_test = np.zeros((len(test_hosts), 3))
avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_hosts)
for i in range(len(test_hosts)):
    X_test[i,0] = G.in_degree(test_hosts[i])
    X_test[i,1] = G.out_degree(test_hosts[i])
    X_test[i,2] = avg_neig_deg[test_hosts[i]]

print("Train matrix dimensionality: ", X_train.shape)
print("Test matrix dimensionality: ", X_test.shape)

# Use logistic regression to classify the webpages of the test set
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=2000, verbose=True)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)

# Write predictions to a file
with open('graph_baseline.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Host")
    writer.writerow(lst)
    for i,test_host in enumerate(test_hosts):
        lst = y_pred[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)

Train matrix dimensionality:  (2125, 3)
Test matrix dimensionality:  (560, 3)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s finished


## Mix Features

In [46]:
def dump_prediction(y_pred, name="baseline"):
    with open('{}.csv'.format(name), 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        lst = clf.classes_.tolist()
        lst.insert(0, "Host")
        writer.writerow(lst)
        for i,test_host in enumerate(test_hosts):
            lst = y_pred[i,:].tolist()
            lst.insert(0, test_host)
            writer.writerow(lst)

In [3]:
# Create the training matrix. Each row corresponds to a web host and each column to a word present in at least 10 web
# hosts and at most 1000 web hosts. The value of each entry in a row is equal to the tf-idf weight of that word in the 
# corresponding web host
vec = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', encoding='latin-1', min_df=10, max_df=1000)
X_train_text = vec.fit_transform(train_data)

# Get textual content of web hosts of the test set
test_data = list()
for host in tqdm(test_hosts):
    if host in text:
        test_data.append(text[host])
    else:
        test_data.append('')


# Create the test matrix following the same approach as in the case of the training matrix
X_test_text = vec.transform(test_data)
# Create the training matrix. Each row corresponds to a web host.
# Use the following 3 features for each web host (unweighted degrees)
# (1) out-degree of node
# (2) in-degree of node
# (3) average degree of neighborhood of node
X_train_graph = np.zeros((len(train_hosts), 3))
avg_neig_deg = nx.average_neighbor_degree(G, nodes=train_hosts)
for i in range(len(train_hosts)):
    X_train_graph[i,0] = G.in_degree(train_hosts[i])
    X_train_graph[i,1] = G.out_degree(train_hosts[i])
    X_train_graph[i,2] = avg_neig_deg[train_hosts[i]]

# Create the test matrix. Use the same 3 features as above
X_test_graph = np.zeros((len(test_hosts), 3))
avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_hosts)
for i in range(len(test_hosts)):
    X_test_graph[i,0] = G.in_degree(test_hosts[i])
    X_test_graph[i,1] = G.out_degree(test_hosts[i])
    X_test_graph[i,2] = avg_neig_deg[test_hosts[i]]

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 560/560 [00:00<?, ?it/s]


In [4]:
X_train_graph.shape, X_train_text.shape

((2125, 3), (2125, 21384))

In [12]:
type(X_train_graph), type(X_train_text)

(numpy.ndarray, scipy.sparse.csr.csr_matrix)

In [18]:
X_train = np.concatenate((X_train_graph, np.array(X_train_text.todense())), axis=1)

In [20]:
X_test = np.concatenate((X_test_graph, np.array(X_test_text.todense())), axis=1)

In [19]:
# Use logistic regression to classify the webpages of the test set
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=2000, verbose=True)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_pred = clf.predict_proba(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_train, y_train_pred))

In [33]:
from sklearn.svm import SVC
# Use logistic regression to classify the webpages of the test set
clf_svc = SVC(verbose=True)
clf_svc.fit(X_train, y_train)
y_train_pred = clf_svc.predict(X_train)

[LibSVM]

AttributeError: predict_proba is not available when  probability=False

In [34]:
y_pred = clf_svc.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_train, y_train_pred))

  _warn_prf(average, modifier, msg_start, len(result))


                         precision    recall  f1-score   support

       business/finance       0.31      0.94      0.46       626
     education/research       1.00      0.00      0.01       209
          entertainment       0.35      0.10      0.15       579
         health/medical       0.00      0.00      0.00        92
             news/press       1.00      0.01      0.02        83
politics/government/law       0.43      0.09      0.15       200
                 sports       0.00      0.00      0.00        46
           tech/science       0.67      0.03      0.05       290

               accuracy                           0.32      2125
              macro avg       0.47      0.15      0.11      2125
           weighted avg       0.45      0.32      0.20      2125



In [35]:
from sklearn.ensemble import RandomForestClassifier
# Use logistic regression to classify the webpages of the test set
clf_rf = RandomForestClassifier(verbose=True)
clf_rf.fit(X_train, y_train)
y_train_pred = clf_rf.predict(X_train)
y_pred = clf_rf.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_train, y_train_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   12.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


                         precision    recall  f1-score   support

       business/finance       0.97      0.98      0.97       626
     education/research       0.96      0.94      0.95       209
          entertainment       0.97      0.98      0.97       579
         health/medical       0.98      0.96      0.97        92
             news/press       0.95      0.90      0.93        83
politics/government/law       0.96      0.96      0.96       200
                 sports       0.98      0.98      0.98        46
           tech/science       0.96      0.94      0.95       290

               accuracy                           0.97      2125
              macro avg       0.97      0.96      0.96      2125
           weighted avg       0.97      0.97      0.97      2125



In [41]:
from sklearn.base import BaseEstimator

class OverCertainClassifier(BaseEstimator):
    def __init__(self):
        self.num_class = None
    
    def fit(self, X, y):
        self.num_class = len(np.unique(y))
    
    def predict_proba(self, X):
        return np.ones((X.shape[0], self.num_class))

In [42]:
dummy_clf = OverCertainClassifier()

In [43]:
dummy_clf.fit(X_train, y_train)

In [45]:
y_pred = dummy_clf.predict_proba(X_test)

In [47]:
dump_prediction(y_pred, "ones")