In [None]:
cd ../..

In [77]:
# If True, we use `HashingVectorizer`, otherwise we use a `TfidfVectorizer`
USE_HASHING = False
#USE_HASHING = True

# Number of features used by `HashingVectorizer`
N_FEATURES = 2**16

# Optional feature selection: either False, or an integer: the number of
# features to select
SELECT_CHI2 = False

In [None]:
from sklearn.datasets import fetch_20newsgroups

categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

data_train = fetch_20newsgroups(
    subset="train", categories=categories, shuffle=True, random_state=42
)

data_test = fetch_20newsgroups(
    subset="test", categories=categories, shuffle=True, random_state=42
)
print("data loaded")

In [None]:
data_train.data

In [None]:
import pandas as pd

In [None]:
df_corpus = pd.read_csv('data/corpus/220726-RecueilComplet.csv')

In [None]:
patat_train = {}

In [55]:
df_corpus['text'] = df_corpus['title'] + '\n' + df_corpus['article']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_corpus['text'] = df_corpus['title'] + '\n' + df_corpus['article']


In [60]:
df_work = df_corpus[['text','numsite']]

In [71]:
df_work = df_work[df_work['text'].notna()]

In [None]:
site_dic ={}
num_site = 0
for site in set(df_corpus['site']):
    site_dic[site] = num_site
    num_site += 1

In [83]:
from sklearn.model_selection import train_test_split

data_X = df_work['text']
data_y = df_work['numsite']

data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    data_X, data_y, test_size=0.33, random_state=42)


In [84]:
data_X_train.values

array(["Message du CSCOR\nMesdames et Messieurs, chers amis du CSCOR! Nous traversons une période de lourdes épreuves pour les peuples frères russe et ukrainien. Les forces nationalistes agressives qui ont pris le pouvoir à Kiev en 2014 suite à un coup d'état anticonstitutionnel ont imposé à la société ukrainienne une idéologie du néonazisme et ont déclenché une politique de répressions contre les citoyens qui n’acceptaient pas leurs actes, ce qui a conduit à la tragédie dans le Donbass et à la sécession de la Crimée. La Russie a été déclarée l'ennemi principal, la propagande haineuse contre tout le russe a été lancée, la discrimination linguistique et religieuse a été imposée en Ukraine. Des efforts systémiques malveillants ont été entrepris pour diviser les peuples ukrainien et russe, pour les dresser l’un contre l’autre. Nous avons constaté avec angoisse que cette théorie et cette pratique ignobles sont perçues dans les pays occidentaux comme une nouvelle normalité, une nouvelle mor

In [85]:
# order of labels in `target_names` can be different from `categories`
target_names = site_dic.keys()


def size_mb(docs):
    return sum(len(s.encode("utf-8")) for s in docs) / 1e6


data_train_size_mb = size_mb(data_X_train.values)
data_test_size_mb = size_mb(data_X_test.values)

print(
    "%d documents - %0.3fMB (training set)" % (len(data_train.data), data_train_size_mb)
)
print("%d documents - %0.3fMB (test set)" % (len(data_test.data), data_test_size_mb))
print("%d categories" % len(target_names))

2034 documents - 2.301MB (training set)
1353 documents - 1.096MB (test set)
34 categories


In [86]:
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

t0 = time()

if USE_HASHING:
    vectorizer = HashingVectorizer(
        stop_words="english", alternate_sign=False, n_features=N_FEATURES
    )
    X_train = vectorizer.transform(data_X_train)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=[])
    X_train = vectorizer.fit_transform(data_X_train)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)

done in 0.171248s at 13.438MB/s
n_samples: 221, n_features: 28961


In [87]:
t0 = time()
X_test = vectorizer.transform(data_X_test)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)

done in 0.095424s at 11.489MB/s
n_samples: 110, n_features: 28961


In [88]:
if USE_HASHING:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names_out()

In [89]:
from sklearn.feature_selection import SelectKBest, chi2

if SELECT_CHI2:
    print("Extracting %d best features by a chi-squared test" % SELECT_CHI2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=SELECT_CHI2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names is not None:
        # keep selected feature names
        feature_names = feature_names[ch2.get_support()]
    print("done in %fs" % (time() - t0))
    print()

In [90]:
import numpy as np
from sklearn import metrics
from sklearn.utils.extmath import density


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."


def benchmark(clf):
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, "coef_"):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=target_names))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split("(")[0]
    return clf_descr, score, train_time, test_time

In [91]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier


results = []
for clf, name in (
    (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
    (Perceptron(max_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(), "Random forest"),
):
    print("=" * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print("=" * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty)))

# Train SGD with Elastic Net penalty
print("=" * 80)
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet"))
)

# Train NearestCentroid without threshold
print("=" * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print("=" * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=0.01)))
results.append(benchmark(BernoulliNB(alpha=0.01)))
results.append(benchmark(ComplementNB(alpha=0.1)))

print("=" * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(
    benchmark(
        Pipeline(
            [
                (
                    "feature_selection",
                    SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)),
                ),
                ("classification", LinearSVC(penalty="l2")),
            ]
        )
    )
)

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(solver='sag', tol=0.01)




train time: 0.257s
test time:  0.002s
accuracy:   0.609
dimensionality: 28961
density: 1.000000
top 10 keywords per class:
lalettrepatriote.com: frdocumentationforums supportvos écoutez recevons wordp...
www.profession-gendarme.com: 136746 plagiat triché 89673468 commande empoison...
www.dreuz.info: louée spam confiée bouton suisse telegram votre cliquez récep...
lemediaen442.fr: sujet pass marcel poutine 02 05 politiquesanté média meme 03
www.lelibrepenseur.org: adhanom admin uploads piratage policières mp4 dagricu...
www.wikistrike.com: articles repost0 derniers repost1 newsletter partager ins...
fr.sott.net: reddition otan dépense eurovision attaque crimée prof koenig tra...
www.francesoir.fr: france état dont rapporte risques emmanuel macron liberté ...
fr.novopress.info: tweets réfugiés réinformation immigration entretiens techn...
www.nouvelordremondial.cc: actu 2007 192 procuration 2012 2010 2011 blog 2008...
www.les-crises.fr: merci ratez salaires géopolitique fed lobbys crises

IndexError: index 31 is out of bounds for axis 0 with size 31

In [None]:
import urllib
def get_site(url):
    return urllib.parse.urlparse(url).netloc

In [None]:
df_corpus = df_corpus[df_corpus['url'].notna()]

In [None]:
df_corpus['site'] = df_corpus['url'].apply(get_site)

In [None]:
def get_numsite(site):
    if site in site_dic:
        return site_dic[site]
    else:
        index = len(site_dic.keys())
        site_dic[site]=index
        return index

In [None]:
site_dic

In [None]:
df_corpus['numsite'] = df_corpus['site'].apply(get_numsite)

In [108]:
df_work['sample'] = False
df_sample = df_work.sample(frac=0.3, random_state = 42)

In [110]:
df_sample[df_sample['sample']]



Unnamed: 0,text,numsite,sample
25,"Automobile. Devenues obligatoires, quel rôle v...",25,True
310,Un groupe secret financé par Soros travaille e...,5,True
73,Des éoliennes mises sous surveillance suite à ...,7,True
222,>Mondialisation.ca\n\n\n\n\n\n\nAccueil\nA pro...,33,True
57,Vrais mensonges et fausses vérités: les nouvea...,7,True
...,...,...,...
119,La traçabilité des fonds vers l’Ukraine se per...,7,True
15,Législatives en Loire-Atlantique. Qui est Laur...,25,True
6,"QI, races, différences, intelligence, délinqua...",25,True
289,Le racisme anti-Blancs russe et la droite fran...,15,True


In [118]:
list_index = list(df_work.index)

In [160]:
import random
samples = random.sample(list_index,int(len(list_index)*0.5))

In [161]:
df_work['sample'] = False
for i in samples:
    df_work.loc[i,'sample'] = True

In [162]:
for index,row in df_work.iterrows():
    print(index,index in samples)
    row['sample'] = (index in samples)

0 True
1 True
2 True
3 True
4 False
5 True
6 False
7 True
8 False
9 False
10 False
11 False
12 False
13 True
14 False
15 True
16 True
17 False
18 True
19 True
20 True
21 True
22 False
23 True
24 False
25 True
26 True
27 False
28 False
29 True
30 False
31 True
32 True
33 False
34 True
35 True
36 True
37 False
38 False
39 True
40 True
41 False
42 False
43 False
44 False
45 False
46 True
47 False
48 True
49 False
50 True
51 False
52 False
53 False
54 False
55 True
56 True
57 True
58 True
59 True
60 False
61 False
62 True
63 True
64 False
65 False
66 True
67 False
68 False
69 True
70 True
71 False
72 True
73 True
74 True
75 True
76 True
77 False
78 True
79 False
80 True
81 False
82 False
83 True
84 False
85 True
86 True
87 True
88 False
89 False
90 False
91 True
92 False
93 False
94 True
95 True
96 True
97 False
98 False
99 False
100 True
101 True
102 False
103 False
104 True
105 False
106 False
107 False
108 False
109 False
110 True
111 True
112 False
113 False
114 False
115 True
116 True

In [157]:
df_work

Unnamed: 0,text,numsite,sample
0,Essais Pfizer sur le vaccin anti-covid : le ra...,26,False
1,Tests – vaccins – pass : fabrique de la paniqu...,25,False
2,"Covid-19. La pandémie et les vaccins, pour mie...",25,False
3,Covid-19. Une épidémie quasiment terminée avan...,25,False
4,"Piscine, voile, ramadan : scènes de la vie ord...",25,False
...,...,...,...
330,Donbass: Deux nouveaux laboratoires militaires...,5,False
331,Pfizer vient d’être condamné à 75 millions dol...,5,False
332,L’Italie se révolte contre l’Union Européenne ...,5,False
333,Variole du singe: Comme si les terriens étaien...,5,False
