## Packages

- `pandas` data management
- `nltk` old NLP
- `matplotlib` vizualisation
- `scikit-learn` machine learning

## Data

Open Alex Computational Social Science

In [None]:
#!pip install pandas nltk scikit-learn matplotlib

In [None]:
import pandas as pd
df = pd.read_csv("./sample_data/CSS_exact_openalex.csv")
print(df["abstract"].isna().sum())
df = df.dropna(subset=["abstract"])
df.shape


759


(690, 182)

In [None]:
df["abstract"]

0       14,0642,033MetricsTotal Downloads14,064Last 6 ...
1       The increasing integration of technology into ...
3       The integration of social science with compute...
7       Abstract Large language models (LLMs) are capa...
9       In the first part of the paper, the field of a...
                              ...                        
1436    Area Studies and the Challenges of Creating a ...
1439    Welcome to the third issue of IASSIST Quarterl...
1441    My essay has several connected histories to un...
1447    Citation (2020), "Index", Härtel, C.E.J., Zerb...
1448    Citation (2023), "Index", Lytras, M.D., Housaw...
Name: abstract, Length: 690, dtype: object

## Cleaning the data

Some abstract have HTML tags, let's remove them

In [None]:
import re
re.sub(r"<.*?>", "", "This is a <b>bold</b> statement.").strip()

'This is a bold statement.'

Cleaning function

In [None]:
def clean_text(text):
    """
    Fonction qui nettoie le texte en supprimant les balises HTML et les espaces inutiles
    :param text: le texte à nettoyer
    :return: le texte nettoyé
    """
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"\s\s+", " ", text)
    # text = text.capitalize()
    return text.strip()

Apply to the corpus

In [None]:
df["text"] = df["abstract"].apply(clean_text)

Unnamed: 0,text,abstract
0,"14,0642,033MetricsTotal Downloads14,064Last 6 ...","14,0642,033MetricsTotal Downloads14,064Last 6 ..."
1,The increasing integration of technology into ...,The increasing integration of technology into ...
3,The integration of social science with compute...,The integration of social science with compute...
7,Abstract Large language models (LLMs) are capa...,Abstract Large language models (LLMs) are capa...
9,"In the first part of the paper, the field of a...","In the first part of the paper, the field of a..."


## Word scale

### If a word is in the abstract

Les bases de la fouille de données. Quels sont les questions qui parlent d'intelligence artificielle ?

In [None]:
filter = df["text"].str.contains("LLM")
filter.sum()


np.int64(40)

### Tokenization

#### with regex

In [None]:
import re
word_pattern = r"\w+"
tokens = re.findall(word_pattern, "This is a test")
tokens

['This', 'is', 'a', 'test']

In [None]:
df["text"].apply(lambda x: re.findall(r"\w+",x.lower()))

0       [14, 0642, 033metricstotal, downloads14, 064la...
1       [the, increasing, integration, of, technology,...
3       [the, integration, of, social, science, with, ...
7       [abstract, large, language, models, llms, are,...
9       [in, the, first, part, of, the, paper, the, fi...
                              ...                        
1436    [area, studies, and, the, challenges, of, crea...
1439    [welcome, to, the, third, issue, of, iassist, ...
1441    [my, essay, has, several, connected, histories...
1447    [citation, 2020, index, härtel, c, e, j, zerbe...
1448    [citation, 2023, index, lytras, m, d, housawi,...
Name: text, Length: 690, dtype: object

### with a library `nltk`

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

word_tokenize("This is a test")

[nltk_data] Downloading package punkt to /Users/emilien/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['This', 'is', 'a', 'test']

In [None]:
df["text"].apply(word_tokenize)

0       [14,0642,033MetricsTotal, Downloads14,064Last,...
1       [The, increasing, integration, of, technology,...
3       [The, integration, of, social, science, with, ...
7       [Abstract, Large, language, models, (, LLMs, )...
9       [In, the, first, part, of, the, paper, ,, the,...
                              ...                        
1436    [Area, Studies, and, the, Challenges, of, Crea...
1439    [Welcome, to, the, third, issue, of, IASSIST, ...
1441    [My, essay, has, several, connected, histories...
1447    [Citation, (, 2020, ), ,, ``, Index, '', ,, Hä...
1448    [Citation, (, 2023, ), ,, ``, Index, '', ,, Ly...
Name: text, Length: 690, dtype: object

### Counting the words

First with basic tools

In [None]:
from collections import Counter

In [None]:
counter = Counter([j for i in list(df["text"].apply(word_tokenize)) for j in i])
counter.most_common(20)

[(',', 9587),
 ('the', 6295),
 ('of', 5746),
 ('and', 5602),
 ('.', 5119),
 ('to', 3330),
 ('in', 2875),
 ('a', 2347),
 ('social', 1860),
 (')', 1459),
 ('for', 1447),
 ('(', 1439),
 ('that', 1281),
 ('on', 1257),
 ('data', 1145),
 ('is', 1058),
 ('as', 928),
 ('with', 854),
 ('science', 852),
 ('computational', 838)]

Removing the stop words

In [None]:
nltk.download("stopwords")

from nltk.corpus import stopwords
import string

english_stopwords = list(set(stopwords.words("english"))) + list(string.punctuation) + ["``", "''", "``", "’", "“", "”", "—", "–"]
english_stopwords[0:10]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emilien/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['at', 'for', 'up', 'needn', 'this', 'aren', 'while', 'to', "we've", 'who']

In [None]:
counter = Counter([j for i in list(df["text"].apply(word_tokenize)) for j in i if j.lower() not in english_stopwords])
counter.most_common(20)

[('social', 1860),
 ('data', 1145),
 ('science', 852),
 ('computational', 838),
 ('research', 723),
 ('analysis', 408),
 ('media', 402),
 ('methods', 337),
 ('study', 315),
 ('information', 299),
 ('models', 288),
 ('new', 287),
 ('digital', 287),
 ('Social', 263),
 ('work', 256),
 ('also', 254),
 ('use', 252),
 ('using', 251),
 ('model', 235),
 ('political', 229)]

### Most frequent words combinations

Bigrams and trigrams

In [None]:
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

def generate_bigrams_nltk(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in english_stopwords]
    bigrams = list(ngrams(tokens, 2))
    return bigrams

generate_bigrams_nltk(df["text"].iloc[1])[0:10]

[('increasing', 'integration'),
 ('integration', 'technology'),
 ('technology', 'lives'),
 ('lives', 'created'),
 ('created', 'unprecedented'),
 ('unprecedented', 'volumes'),
 ('volumes', 'data'),
 ('data', 'society'),
 ('society', "'s"),
 ("'s", 'everyday')]

Couting them

In [None]:
counter = Counter([j for i in list(df["text"].apply(generate_bigrams_nltk)) for j in i])
counter.most_common(20)

[(('social', 'science'), 737),
 (('computational', 'social'), 730),
 (('social', 'media'), 297),
 (('social', 'sciences'), 173),
 (('big', 'data'), 129),
 (('machine', 'learning'), 85),
 (('data', 'science'), 62),
 (('computational', 'methods'), 60),
 (('social', 'networks'), 58),
 (('science', 'research'), 58),
 (('natural', 'language'), 55),
 (('social', 'network'), 52),
 (('language', 'processing'), 50),
 (('large', 'language'), 47),
 (('language', 'models'), 47),
 (('field', 'computational'), 46),
 (('science', 'css'), 44),
 (('social', 'scientists'), 44),
 (('media', 'data'), 43),
 (('smart', 'cities'), 41)]

## Representing texts

### Manually

In [None]:
df["dim1"] = df["text"].str.contains("LLM")
df["dim2"] = df["text"].str.contains("IA")
df["dim3"] = df["text"].str.contains("algorithm")
df[["dim1", "dim2", "dim3"]]

Unnamed: 0,dim1,dim2,dim3
0,False,False,False
1,False,False,False
3,False,False,False
7,True,False,False
9,False,False,False
...,...,...,...
1436,False,False,False
1439,False,True,False
1441,False,False,True
1447,False,True,True


### Using `scikit-learn` to create the DTM

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words=english_stopwords, ngram_range=(1, 1), max_features=300)

dfm = vectorizer.fit_transform(df["text"])

# shape it
dtm = pd.DataFrame(
        dfm.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

dtm.head()

Unnamed: 0,10,19,2020,abstract,across,address,age,agent,ai,al,...,web,well,within,without,word,words,work,world,years,yet
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### More advanced version

- Term Frequency-Inverse Document Frequency

$$\text{TF-IDF}(t, d, D) = \left( \frac{f_{t,d}}{n_d} \right) \times \log \left(\frac{N}{\text{df}_t} \right)
$$

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# créer un objet
vectorizer = TfidfVectorizer(stop_words=english_stopwords,
                             ngram_range=(1, 1),
                             max_features=300)

# applique
X = vectorizer.fit_transform(df["text"])

# mettre en forme
X = pd.DataFrame(X.toarray(),columns=list(vectorizer.get_feature_names_out()))
X.head()

Unnamed: 0,10,19,2020,abstract,across,address,age,agent,ai,al,...,web,well,within,without,word,words,work,world,years,yet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.145328,0.0,0.0,0.0,0.126847,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.184266,0.0,0.0,0.0,0.160833,0.0,0.0,0.0
3,0.0,0.0,0.0,0.077291,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.079879,0.0,0.0,0.058495,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199179,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Most important words for a document

In [None]:
X.loc[12].idxmax()

'data'

## Distance between texts

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

X = vectorizer.fit_transform(df["text"])
cosine_similarity(X[0], X[100])

array([[0.]])

In [None]:
distances = pd.DataFrame(pairwise_distances(X, metric="cosine"))

In [None]:
distances[10].idxmax()

0